diff options
Diffstat (limited to 'arch/x86')
213 files changed, 4476 insertions, 5684 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8453fe1342e..7d5feb5908d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -43,6 +43,7 @@ config X86 select HAVE_DMA_ATTRS select HAVE_DMA_CONTIGUOUS if !SWIOTLB select HAVE_KRETPROBES + select GENERIC_EARLY_IOREMAP select HAVE_OPTPROBES select HAVE_KPROBES_ON_FTRACE select HAVE_FTRACE_MCOUNT_RECORD @@ -107,9 +108,9 @@ config X86 select HAVE_ARCH_SOFT_DIRTY select CLOCKSOURCE_WATCHDOG select GENERIC_CLOCKEVENTS - select ARCH_CLOCKSOURCE_DATA if X86_64 + select ARCH_CLOCKSOURCE_DATA select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) - select GENERIC_TIME_VSYSCALL if X86_64 + select GENERIC_TIME_VSYSCALL select KTIME_SCALAR if X86_32 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER @@ -127,6 +128,8 @@ config X86 select HAVE_DEBUG_STACKOVERFLOW select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64 select HAVE_CC_STACKPROTECTOR + select GENERIC_CPU_AUTOPROBE + select HAVE_ARCH_AUDITSYSCALL config INSTRUCTION_DECODER def_bool y @@ -195,9 +198,6 @@ config ARCH_HAS_CPU_RELAX config ARCH_HAS_CACHE_LINE_SIZE def_bool y -config ARCH_HAS_CPU_AUTOPROBE - def_bool y - config HAVE_SETUP_PER_CPU_AREA def_bool y @@ -261,6 +261,9 @@ config ARCH_HWEIGHT_CFLAGS config ARCH_SUPPORTS_UPROBES def_bool y +config FIX_EARLYCON_MEM + def_bool y + source "init/Kconfig" source "kernel/Kconfig.freezer" @@ -346,12 +349,9 @@ config X86_EXTENDED_PLATFORM for the following (non-PC) 32 bit x86 platforms: Goldfish (Android emulator) AMD Elan - NUMAQ (IBM/Sequent) RDC R-321x SoC SGI 320/540 (Visual Workstation) STA2X11-based (e.g. Northville) - Summit/EXA (IBM x440) - Unisys ES7000 IA32 series Moorestown MID devices If you have one of these systems, or if you want to build a @@ -418,7 +418,6 @@ config X86_UV config X86_GOLDFISH bool "Goldfish (Virtual Platform)" - depends on X86_32 depends on X86_EXTENDED_PLATFORM ---help--- Enable support for the Goldfish virtual platform used primarily @@ -489,49 +488,22 @@ config X86_32_NON_STANDARD depends on X86_32 && SMP depends on X86_EXTENDED_PLATFORM ---help--- - This option compiles in the NUMAQ, Summit, bigsmp, ES7000, - STA2X11, default subarchitectures. It is intended for a generic - binary kernel. If you select them all, kernel will probe it - one by one and will fallback to default. + This option compiles in the bigsmp and STA2X11 default + subarchitectures. It is intended for a generic binary + kernel. If you select them all, kernel will probe it one by + one and will fallback to default. # Alphabetically sorted list of Non standard 32 bit platforms -config X86_NUMAQ - bool "NUMAQ (IBM/Sequent)" - depends on X86_32_NON_STANDARD - depends on PCI - select NUMA - select X86_MPPARSE - ---help--- - This option is used for getting Linux to run on a NUMAQ (IBM/Sequent) - NUMA multiquad box. This changes the way that processors are - bootstrapped, and uses Clustered Logical APIC addressing mode instead - of Flat Logical. You will need a new lynxer.elf file to flash your - firmware with - send email to <Martin.Bligh@us.ibm.com>. - config X86_SUPPORTS_MEMORY_FAILURE def_bool y # MCE code calls memory_failure(): depends on X86_MCE # On 32-bit this adds too big of NODES_SHIFT and we run out of page flags: - depends on !X86_NUMAQ # On 32-bit SPARSEMEM adds too big of SECTIONS_WIDTH: depends on X86_64 || !SPARSEMEM select ARCH_SUPPORTS_MEMORY_FAILURE -config X86_VISWS - bool "SGI 320/540 (Visual Workstation)" - depends on X86_32 && PCI && X86_MPPARSE && PCI_GODIRECT - depends on X86_32_NON_STANDARD - ---help--- - The SGI Visual Workstation series is an IA32-based workstation - based on SGI systems chips with some legacy PC hardware attached. - - Say Y here to create a kernel to run on the SGI 320 or 540. - - A kernel compiled for the Visual Workstation will run on general - PCs as well. See <file:Documentation/sgi-visws.txt> for details. - config STA2X11 bool "STA2X11 Companion Chip Support" depends on X86_32_NON_STANDARD && PCI @@ -548,20 +520,6 @@ config STA2X11 option is selected the kernel will still be able to boot on standard PC machines. -config X86_SUMMIT - bool "Summit/EXA (IBM x440)" - depends on X86_32_NON_STANDARD - ---help--- - This option is needed for IBM systems that use the Summit/EXA chipset. - In particular, it is needed for the x440. - -config X86_ES7000 - bool "Unisys ES7000 IA32 series" - depends on X86_32_NON_STANDARD && X86_BIGSMP - ---help--- - Support for Unisys ES7000 systems. Say 'Y' here if this kernel is - supposed to run on an IA32-based Unisys ES7000 system. - config X86_32_IRIS tristate "Eurobraille/Iris poweroff module" depends on X86_32 @@ -684,14 +642,6 @@ config MEMTEST memtest=4, mean do 4 test patterns. If you are unsure how to answer this question, answer N. -config X86_SUMMIT_NUMA - def_bool y - depends on X86_32 && NUMA && X86_32_NON_STANDARD - -config X86_CYCLONE_TIMER - def_bool y - depends on X86_SUMMIT - source "arch/x86/Kconfig.cpu" config HPET_TIMER @@ -820,7 +770,7 @@ config NR_CPUS range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64 default "1" if !SMP default "8192" if MAXSMP - default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000) + default "32" if SMP && X86_BIGSMP default "8" if SMP ---help--- This allows you to specify the maximum number of CPUs which this @@ -884,10 +834,6 @@ config X86_IO_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC || PCI_MSI -config X86_VISWS_APIC - def_bool y - depends on X86_32 && X86_VISWS - config X86_REROUTE_FOR_BROKEN_BOOT_IRQS bool "Reroute for broken boot IRQs" depends on X86_IO_APIC @@ -1105,13 +1051,11 @@ config X86_CPUID choice prompt "High Memory Support" - default HIGHMEM64G if X86_NUMAQ default HIGHMEM4G depends on X86_32 config NOHIGHMEM bool "off" - depends on !X86_NUMAQ ---help--- Linux can use up to 64 Gigabytes of physical memory on x86 systems. However, the address space of 32-bit x86 processors is only 4 @@ -1148,7 +1092,6 @@ config NOHIGHMEM config HIGHMEM4G bool "4GB" - depends on !X86_NUMAQ ---help--- Select this if you have a 32-bit processor and between 1 and 4 gigabytes of physical RAM. @@ -1240,8 +1183,8 @@ config DIRECT_GBPAGES config NUMA bool "Numa Memory Allocation and Scheduler Support" depends on SMP - depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI)) - default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP) + depends on X86_64 || (X86_32 && HIGHMEM64G && X86_BIGSMP) + default y if X86_BIGSMP ---help--- Enable NUMA (Non Uniform Memory Access) support. @@ -1252,15 +1195,11 @@ config NUMA For 64-bit this is recommended if the system is Intel Core i7 (or later), AMD Opteron, or EM64T NUMA. - For 32-bit this is only needed on (rare) 32-bit-only platforms - that support NUMA topologies, such as NUMAQ / Summit, or if you - boot a 32-bit kernel on a 64-bit NUMA platform. + For 32-bit this is only needed if you boot a 32-bit + kernel on a 64-bit NUMA platform. Otherwise, you should say N. -comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" - depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) - config AMD_NUMA def_bool y prompt "Old style AMD Opteron NUMA detection" @@ -1302,7 +1241,6 @@ config NODES_SHIFT range 1 10 default "10" if MAXSMP default "6" if X86_64 - default "4" if X86_NUMAQ default "3" depends on NEED_MULTIPLE_NODES ---help--- @@ -1850,17 +1788,29 @@ config DEBUG_HOTPLUG_CPU0 If unsure, say N. config COMPAT_VDSO - def_bool y - prompt "Compat VDSO support" + def_bool n + prompt "Disable the 32-bit vDSO (needed for glibc 2.3.3)" depends on X86_32 || IA32_EMULATION ---help--- - Map the 32-bit VDSO to the predictable old-style address too. + Certain buggy versions of glibc will crash if they are + presented with a 32-bit vDSO that is not mapped at the address + indicated in its segment table. - Say N here if you are running a sufficiently recent glibc - version (2.3.3 or later), to remove the high-mapped - VDSO mapping and to exclusively use the randomized VDSO. + The bug was introduced by f866314b89d56845f55e6f365e18b31ec978ec3a + and fixed by 3b3ddb4f7db98ec9e912ccdf54d35df4aa30e04a and + 49ad572a70b8aeb91e57483a11dd1b77e31c4468. Glibc 2.3.3 is + the only released version with the bug, but OpenSUSE 9 + contains a buggy "glibc 2.3.2". - If unsure, say Y. + The symptom of the bug is that everything crashes on startup, saying: + dl_main: Assertion `(void *) ph->p_vaddr == _rtld_local._dl_sysinfo_dso' failed! + + Saying Y here changes the default value of the vdso32 boot + option from 1 to 0, which turns off the 32-bit vDSO entirely. + This works around the glibc bug but hurts performance. + + If unsure, say N: if you are compiling your own kernel, you + are unlikely to be using a buggy version of glibc. config CMDLINE_BOOL bool "Built-in kernel command line" @@ -2427,12 +2377,9 @@ config X86_DMA_REMAP depends on STA2X11 config IOSF_MBI - bool + tristate + default m depends on PCI - ---help--- - To be selected by modules requiring access to the Intel OnChip System - Fabric (IOSF) Sideband MailBox Interface (MBI). For MBI platforms - enumerable by PCI. source "net/Kconfig" diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index f3aaf231b4e..6983314c8b3 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -359,7 +359,7 @@ config X86_P6_NOP config X86_TSC def_bool y - depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) && !X86_NUMAQ) || X86_64 + depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64 config X86_CMPXCHG64 def_bool y diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 3b9348a0c1a..33f71b01fd2 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -79,11 +79,14 @@ else UTS_MACHINE := x86_64 CHECKFLAGS += -D__x86_64__ -m64 + biarch := -m64 KBUILD_AFLAGS += -m64 KBUILD_CFLAGS += -m64 # Don't autogenerate traditional x87, MMX or SSE instructions - KBUILD_CFLAGS += -mno-mmx -mno-sse -mno-80387 -mno-fp-ret-in-387 + KBUILD_CFLAGS += -mno-mmx -mno-sse + KBUILD_CFLAGS += $(call cc-option,-mno-80387) + KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387) # Use -mpreferred-stack-boundary=3 if supported. KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3) @@ -108,7 +111,7 @@ else # this works around some issues with generating unwind tables in older gccs # newer gccs do it by default - KBUILD_CFLAGS += -maccumulate-outgoing-args + KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args) endif # Make sure compiler does not have buggy stack-protector support. @@ -250,8 +253,8 @@ archclean: PHONY += kvmconfig kvmconfig: $(if $(wildcard $(objtree)/.config),, $(error You need an existing .config for this target)) - $(Q)$(CONFIG_SHELL) $(srctree)/scripts/kconfig/merge_config.sh -m -O $(objtree) $(objtree)/.config arch/x86/configs/kvm_guest.config - $(Q)yes "" | $(MAKE) oldconfig + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/kconfig/merge_config.sh -m -O $(objtree) $(objtree)/.config $(srctree)/arch/x86/configs/kvm_guest.config + $(Q)yes "" | $(MAKE) -f $(srctree)/Makefile oldconfig define archhelp echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index abb9eba61b5..dbe8dd2fe24 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -71,7 +71,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -sed-voffset := -e 's/^\([0-9a-fA-F]*\) . \(_text\|_end\)$$/\#define VO_\2 0x\1/p' +sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|_end\)$$/\#define VO_\2 0x\1/p' quiet_cmd_voffset = VOFFSET $@ cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ @@ -80,7 +80,7 @@ targets += voffset.h $(obj)/voffset.h: vmlinux FORCE $(call if_changed,voffset) -sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' quiet_cmd_zoffset = ZOFFSET $@ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 50f8c5e0f37..bd49ec61255 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -177,14 +177,6 @@ static inline void wrgs32(u32 v, addr_t addr) } /* Note: these only return true/false, not a signed return value! */ -static inline int memcmp(const void *s1, const void *s2, size_t len) -{ - u8 diff; - asm("repe; cmpsb; setnz %0" - : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len)); - return diff; -} - static inline int memcmp_fs(const void *s1, addr_t s2, size_t len) { u8 diff; @@ -228,11 +220,6 @@ void copy_to_fs(addr_t dst, void *src, size_t len); void *copy_from_fs(void *dst, addr_t src, size_t len); void copy_to_gs(addr_t dst, void *src, size_t len); void *copy_from_gs(void *dst, addr_t src, size_t len); -void *memcpy(void *dst, void *src, size_t len); -void *memset(void *dst, int c, size_t len); - -#define memcpy(d,s,l) __builtin_memcpy(d,s,l) -#define memset(d,c,l) __builtin_memset(d,c,l) /* a20.c */ int enable_a20(void); diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 1e6146137f8..4703a6c4b8e 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -112,7 +112,7 @@ __file_size64(void *__fh, efi_char16_t *filename_16, efi_file_info_t *info; efi_status_t status; efi_guid_t info_guid = EFI_FILE_INFO_ID; - u32 info_sz; + u64 info_sz; status = efi_early->call((unsigned long)fh->open, fh, &h, filename_16, EFI_FILE_MODE_READ, (u64)0); @@ -167,31 +167,31 @@ efi_file_size(efi_system_table_t *sys_table, void *__fh, } static inline efi_status_t -efi_file_read(void *__fh, void *handle, unsigned long *size, void *addr) +efi_file_read(void *handle, unsigned long *size, void *addr) { unsigned long func; if (efi_early->is64) { - efi_file_handle_64_t *fh = __fh; + efi_file_handle_64_t *fh = handle; func = (unsigned long)fh->read; return efi_early->call(func, handle, size, addr); } else { - efi_file_handle_32_t *fh = __fh; + efi_file_handle_32_t *fh = handle; func = (unsigned long)fh->read; return efi_early->call(func, handle, size, addr); } } -static inline efi_status_t efi_file_close(void *__fh, void *handle) +static inline efi_status_t efi_file_close(void *handle) { if (efi_early->is64) { - efi_file_handle_64_t *fh = __fh; + efi_file_handle_64_t *fh = handle; return efi_early->call((unsigned long)fh->close, handle); } else { - efi_file_handle_32_t *fh = __fh; + efi_file_handle_32_t *fh = handle; return efi_early->call((unsigned long)fh->close, handle); } @@ -1016,6 +1016,9 @@ void setup_graphics(struct boot_params *boot_params) * Because the x86 boot code expects to be passed a boot_params we * need to create one ourselves (usually the bootloader would create * one for us). + * + * The caller is responsible for filling out ->code32_start in the + * returned boot_params. */ struct boot_params *make_boot_params(struct efi_config *c) { @@ -1081,8 +1084,6 @@ struct boot_params *make_boot_params(struct efi_config *c) hdr->vid_mode = 0xffff; hdr->boot_flag = 0xAA55; - hdr->code32_start = (__u64)(unsigned long)image->image_base; - hdr->type_of_loader = 0x21; /* Convert unicode cmdline to ascii */ diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index de9d4200d30..cbed1407a5c 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -59,6 +59,7 @@ ENTRY(efi_pe_entry) call make_boot_params cmpl $0, %eax je fail + movl %esi, BP_code32_start(%eax) popl %ecx pushl %eax pushl %ecx @@ -90,12 +91,7 @@ fail: hlt jmp fail 2: - call 3f -3: - popl %eax - subl $3b, %eax - subl BP_pref_address(%esi), %eax - add BP_code32_start(%esi), %eax + movl BP_code32_start(%esi), %eax leal preferred_addr(%eax), %eax jmp *%eax diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 57e58a5fa21..0d558ee899a 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -261,6 +261,8 @@ ENTRY(efi_pe_entry) cmpq $0,%rax je fail mov %rax, %rsi + leaq startup_32(%rip), %rax + movl %eax, BP_code32_start(%rsi) jmp 2f /* Skip the relocation */ handover_entry: @@ -284,12 +286,7 @@ fail: hlt jmp fail 2: - call 3f -3: - popq %rax - subq $3b, %rax - subq BP_pref_address(%rsi), %rax - add BP_code32_start(%esi), %eax + movl BP_code32_start(%esi), %eax leaq preferred_addr(%rax), %rax jmp *%rax diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 196eaf373a0..57ab74df7ee 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -10,6 +10,7 @@ */ #include "misc.h" +#include "../string.h" /* WARNING!! * This code is compiled with -fPIC and it is relocated dynamically @@ -97,8 +98,14 @@ */ #define STATIC static -#undef memset #undef memcpy + +/* + * Use a normal definition of memset() from string.c. There are already + * included header files which expect a definition of memset() and by + * the time we define memset macro, it is too late. + */ +#undef memset #define memzero(s, n) memset((s), 0, (n)) @@ -109,9 +116,6 @@ static void error(char *m); */ struct boot_params *real_mode; /* Pointer to real-mode data */ -void *memset(void *s, int c, size_t n); -void *memcpy(void *dest, const void *src, size_t n); - memptr free_mem_ptr; memptr free_mem_end_ptr; @@ -216,45 +220,6 @@ void __putstr(const char *s) outb(0xff & (pos >> 1), vidport+1); } -void *memset(void *s, int c, size_t n) -{ - int i; - char *ss = s; - - for (i = 0; i < n; i++) - ss[i] = c; - return s; -} -#ifdef CONFIG_X86_32 -void *memcpy(void *dest, const void *src, size_t n) -{ - int d0, d1, d2; - asm volatile( - "rep ; movsl\n\t" - "movl %4,%%ecx\n\t" - "rep ; movsb\n\t" - : "=&c" (d0), "=&D" (d1), "=&S" (d2) - : "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src) - : "memory"); - - return dest; -} -#else -void *memcpy(void *dest, const void *src, size_t n) -{ - long d0, d1, d2; - asm volatile( - "rep ; movsq\n\t" - "movq %4,%%rcx\n\t" - "rep ; movsb\n\t" - : "=&c" (d0), "=&D" (d1), "=&S" (d2) - : "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src) - : "memory"); - - return dest; -} -#endif - static void error(char *x) { error_putstr("\n\n"); @@ -389,7 +354,7 @@ static void parse_elf(void *output) free(phdrs); } -asmlinkage void *decompress_kernel(void *rmode, memptr heap, +asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, unsigned char *input_data, unsigned long input_len, unsigned char *output, diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index ffb9c5c9d74..00e788be1db 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -1,11 +1,41 @@ -#include "misc.h" +#include "../string.c" -int memcmp(const void *s1, const void *s2, size_t len) +#ifdef CONFIG_X86_32 +void *memcpy(void *dest, const void *src, size_t n) { - u8 diff; - asm("repe; cmpsb; setnz %0" - : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len)); - return diff; + int d0, d1, d2; + asm volatile( + "rep ; movsl\n\t" + "movl %4,%%ecx\n\t" + "rep ; movsb\n\t" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src) + : "memory"); + + return dest; } +#else +void *memcpy(void *dest, const void *src, size_t n) +{ + long d0, d1, d2; + asm volatile( + "rep ; movsq\n\t" + "movq %4,%%rcx\n\t" + "rep ; movsb\n\t" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src) + : "memory"); -#include "../string.c" + return dest; +} +#endif + +void *memset(void *s, int c, size_t n) +{ + int i; + char *ss = s; + + for (i = 0; i < n; i++) + ss[i] = c; + return s; +} diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index f0d0b20fe14..1fd7d575092 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -27,6 +27,7 @@ #include <asm/processor-flags.h> #include <asm/required-features.h> #include <asm/msr-index.h> +#include "string.h" static u32 err_flags[NCAPINTS]; diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c index c501a5b466f..223e4252707 100644 --- a/arch/x86/boot/edd.c +++ b/arch/x86/boot/edd.c @@ -15,6 +15,7 @@ #include "boot.h" #include <linux/edd.h> +#include "string.h" #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index cf6083d444f..fd6c9f23699 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -14,6 +14,7 @@ */ #include "boot.h" +#include "string.h" struct boot_params boot_params __attribute__((aligned(16))); diff --git a/arch/x86/boot/regs.c b/arch/x86/boot/regs.c index 958019b1cfa..c0fb356a309 100644 --- a/arch/x86/boot/regs.c +++ b/arch/x86/boot/regs.c @@ -17,6 +17,7 @@ */ #include "boot.h" +#include "string.h" void initregs(struct biosregs *reg) { diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 574dedfe289..493f3fd9f13 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -12,7 +12,16 @@ * Very basic string functions */ -#include "boot.h" +#include <linux/types.h> +#include "ctype.h" + +int memcmp(const void *s1, const void *s2, size_t len) +{ + u8 diff; + asm("repe; cmpsb; setnz %0" + : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len)); + return diff; +} int strcmp(const char *str1, const char *str2) { diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h new file mode 100644 index 00000000000..725e820602b --- /dev/null +++ b/arch/x86/boot/string.h @@ -0,0 +1,21 @@ +#ifndef BOOT_STRING_H +#define BOOT_STRING_H + +/* Undef any of these macros coming from string_32.h. */ +#undef memcpy +#undef memset +#undef memcmp + +void *memcpy(void *dst, const void *src, size_t len); +void *memset(void *dst, int c, size_t len); +int memcmp(const void *s1, const void *s2, size_t len); + +/* + * Access builtin version by default. If one needs to use optimized version, + * do "undef memcpy" in .c file and link against right string.c + */ +#define memcpy(d,s,l) __builtin_memcpy(d,s,l) +#define memset(d,c,l) __builtin_memset(d,c,l) +#define memcmp __builtin_memcmp + +#endif /* BOOT_STRING_H */ diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index 11e8c6eb80a..ba3e100654d 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -16,6 +16,7 @@ #include "boot.h" #include "video.h" #include "vesa.h" +#include "string.h" /* VESA information */ static struct vesa_general_info vginfo; diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index a7fef2621cc..32d2e7056c8 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -60,7 +60,6 @@ CONFIG_CRASH_DUMP=y CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y CONFIG_PM_TRACE_RTC=y -CONFIG_ACPI_PROCFS=y CONFIG_ACPI_DOCK=y CONFIG_CPU_FREQ=y # CONFIG_CPU_FREQ_STAT is not set @@ -245,7 +244,6 @@ CONFIG_HID_TOPSEED=y CONFIG_HID_PID=y CONFIG_USB_HIDDEV=y CONFIG_USB=y -CONFIG_USB_DEBUG=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y CONFIG_USB_EHCI_HCD=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index c1119d4c128..a481dd4755d 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -58,7 +58,6 @@ CONFIG_CRASH_DUMP=y CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y CONFIG_PM_TRACE_RTC=y -CONFIG_ACPI_PROCFS=y CONFIG_ACPI_DOCK=y CONFIG_CPU_FREQ=y # CONFIG_CPU_FREQ_STAT is not set @@ -240,7 +239,6 @@ CONFIG_HID_TOPSEED=y CONFIG_HID_PID=y CONFIG_USB_HIDDEV=y CONFIG_USB=y -CONFIG_USB_DEBUG=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y CONFIG_USB_EHCI_HCD=y diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 6ba54d64038..61d6e281898 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -79,6 +79,9 @@ aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o +ifeq ($(avx2_supported),yes) +sha1-ssse3-y += sha1_avx2_x86_64_asm.o +endif crc32c-intel-y := crc32c-intel_glue.o crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 50ec333b70e..8af519ed73d 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -223,9 +223,6 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, src -= 1; dst -= 1; } while (nbytes >= bsize * 4); - - if (nbytes < bsize) - goto done; } /* Handle leftovers */ diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index e6a3700489b..e57e20ab5e0 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -203,9 +203,6 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, src -= 1; dst -= 1; } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; } /* Handle leftovers */ diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index 586f41aac36..185fad49d86 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -24,10 +24,6 @@ .align 16 .Lbswap_mask: .octa 0x000102030405060708090a0b0c0d0e0f -.Lpoly: - .octa 0xc2000000000000000000000000000001 -.Ltwo_one: - .octa 0x00000001000000000000000000000001 #define DATA %xmm0 #define SHASH %xmm1 @@ -134,28 +130,3 @@ ENTRY(clmul_ghash_update) .Lupdate_just_ret: ret ENDPROC(clmul_ghash_update) - -/* - * void clmul_ghash_setkey(be128 *shash, const u8 *key); - * - * Calculate hash_key << 1 mod poly - */ -ENTRY(clmul_ghash_setkey) - movaps .Lbswap_mask, BSWAP - movups (%rsi), %xmm0 - PSHUFB_XMM BSWAP %xmm0 - movaps %xmm0, %xmm1 - psllq $1, %xmm0 - psrlq $63, %xmm1 - movaps %xmm1, %xmm2 - pslldq $8, %xmm1 - psrldq $8, %xmm2 - por %xmm1, %xmm0 - # reduction - pshufd $0b00100100, %xmm2, %xmm1 - pcmpeqd .Ltwo_one, %xmm1 - pand .Lpoly, %xmm1 - pxor %xmm1, %xmm0 - movups %xmm0, (%rdi) - ret -ENDPROC(clmul_ghash_setkey) diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 6759dd1135b..d785cf2c529 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -30,8 +30,6 @@ void clmul_ghash_mul(char *dst, const be128 *shash); void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, const be128 *shash); -void clmul_ghash_setkey(be128 *shash, const u8 *key); - struct ghash_async_ctx { struct cryptd_ahash *cryptd_tfm; }; @@ -58,13 +56,23 @@ static int ghash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { struct ghash_ctx *ctx = crypto_shash_ctx(tfm); + be128 *x = (be128 *)key; + u64 a, b; if (keylen != GHASH_BLOCK_SIZE) { crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } - clmul_ghash_setkey(&ctx->shash, key); + /* perform multiplication by 'x' in GF(2^128) */ + a = be64_to_cpu(x->a); + b = be64_to_cpu(x->b); + + ctx->shash.a = (__be64)((b << 1) | (a >> 63)); + ctx->shash.b = (__be64)((a << 1) | (b >> 63)); + + if (a >> 63) + ctx->shash.b ^= cpu_to_be64(0xc2); return 0; } diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S new file mode 100644 index 00000000000..1cd792db15e --- /dev/null +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S @@ -0,0 +1,708 @@ +/* + * Implement fast SHA-1 with AVX2 instructions. (x86_64) + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Ilya Albrekht <ilya.albrekht@intel.com> + * Maxim Locktyukhin <maxim.locktyukhin@intel.com> + * Ronen Zohar <ronen.zohar@intel.com> + * Chandramouli Narayanan <mouli@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * SHA-1 implementation with Intel(R) AVX2 instruction set extensions. + * + *This implementation is based on the previous SSSE3 release: + *Visit http://software.intel.com/en-us/articles/ + *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/ + * + *Updates 20-byte SHA-1 record in 'hash' for even number of + *'num_blocks' consecutive 64-byte blocks + * + *extern "C" void sha1_transform_avx2( + * int *hash, const char* input, size_t num_blocks ); + */ + +#include <linux/linkage.h> + +#define CTX %rdi /* arg1 */ +#define BUF %rsi /* arg2 */ +#define CNT %rdx /* arg3 */ + +#define REG_A %ecx +#define REG_B %esi +#define REG_C %edi +#define REG_D %eax +#define REG_E %edx +#define REG_TB %ebx +#define REG_TA %r12d +#define REG_RA %rcx +#define REG_RB %rsi +#define REG_RC %rdi +#define REG_RD %rax +#define REG_RE %rdx +#define REG_RTA %r12 +#define REG_RTB %rbx +#define REG_T1 %ebp +#define xmm_mov vmovups +#define avx2_zeroupper vzeroupper +#define RND_F1 1 +#define RND_F2 2 +#define RND_F3 3 + +.macro REGALLOC + .set A, REG_A + .set B, REG_B + .set C, REG_C + .set D, REG_D + .set E, REG_E + .set TB, REG_TB + .set TA, REG_TA + + .set RA, REG_RA + .set RB, REG_RB + .set RC, REG_RC + .set RD, REG_RD + .set RE, REG_RE + + .set RTA, REG_RTA + .set RTB, REG_RTB + + .set T1, REG_T1 +.endm + +#define K_BASE %r8 +#define HASH_PTR %r9 +#define BUFFER_PTR %r10 +#define BUFFER_PTR2 %r13 +#define BUFFER_END %r11 + +#define PRECALC_BUF %r14 +#define WK_BUF %r15 + +#define W_TMP %xmm0 +#define WY_TMP %ymm0 +#define WY_TMP2 %ymm9 + +# AVX2 variables +#define WY0 %ymm3 +#define WY4 %ymm5 +#define WY08 %ymm7 +#define WY12 %ymm8 +#define WY16 %ymm12 +#define WY20 %ymm13 +#define WY24 %ymm14 +#define WY28 %ymm15 + +#define YMM_SHUFB_BSWAP %ymm10 + +/* + * Keep 2 iterations precalculated at a time: + * - 80 DWORDs per iteration * 2 + */ +#define W_SIZE (80*2*2 +16) + +#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF) +#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF) + + +.macro UPDATE_HASH hash, val + add \hash, \val + mov \val, \hash +.endm + +.macro PRECALC_RESET_WY + .set WY_00, WY0 + .set WY_04, WY4 + .set WY_08, WY08 + .set WY_12, WY12 + .set WY_16, WY16 + .set WY_20, WY20 + .set WY_24, WY24 + .set WY_28, WY28 + .set WY_32, WY_00 +.endm + +.macro PRECALC_ROTATE_WY + /* Rotate macros */ + .set WY_32, WY_28 + .set WY_28, WY_24 + .set WY_24, WY_20 + .set WY_20, WY_16 + .set WY_16, WY_12 + .set WY_12, WY_08 + .set WY_08, WY_04 + .set WY_04, WY_00 + .set WY_00, WY_32 + + /* Define register aliases */ + .set WY, WY_00 + .set WY_minus_04, WY_04 + .set WY_minus_08, WY_08 + .set WY_minus_12, WY_12 + .set WY_minus_16, WY_16 + .set WY_minus_20, WY_20 + .set WY_minus_24, WY_24 + .set WY_minus_28, WY_28 + .set WY_minus_32, WY +.endm + +.macro PRECALC_00_15 + .if (i == 0) # Initialize and rotate registers + PRECALC_RESET_WY + PRECALC_ROTATE_WY + .endif + + /* message scheduling pre-compute for rounds 0-15 */ + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP + .elseif ((i & 7) == 1) + vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\ + WY_TMP, WY_TMP + .elseif ((i & 7) == 2) + vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY + .elseif ((i & 7) == 4) + vpaddd K_XMM(K_BASE), WY, WY_TMP + .elseif ((i & 7) == 7) + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC_16_31 + /* + * message scheduling pre-compute for rounds 16-31 + * calculating last 32 w[i] values in 8 XMM registers + * pre-calculate K+w[i] values and store to mem + * for later load by ALU add instruction + * + * "brute force" vectorization for rounds 16-31 only + * due to w[i]->w[i-3] dependency + */ + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + /* w[i-14] */ + vpalignr $8, WY_minus_16, WY_minus_12, WY + vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */ + .elseif ((i & 7) == 1) + vpxor WY_minus_08, WY, WY + vpxor WY_minus_16, WY_TMP, WY_TMP + .elseif ((i & 7) == 2) + vpxor WY_TMP, WY, WY + vpslldq $12, WY, WY_TMP2 + .elseif ((i & 7) == 3) + vpslld $1, WY, WY_TMP + vpsrld $31, WY, WY + .elseif ((i & 7) == 4) + vpor WY, WY_TMP, WY_TMP + vpslld $2, WY_TMP2, WY + .elseif ((i & 7) == 5) + vpsrld $30, WY_TMP2, WY_TMP2 + vpxor WY, WY_TMP, WY_TMP + .elseif ((i & 7) == 7) + vpxor WY_TMP2, WY_TMP, WY + vpaddd K_XMM(K_BASE), WY, WY_TMP + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC_32_79 + /* + * in SHA-1 specification: + * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 + * instead we do equal: + * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 + * allows more efficient vectorization + * since w[i]=>w[i-3] dependency is broken + */ + + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP + .elseif ((i & 7) == 1) + /* W is W_minus_32 before xor */ + vpxor WY_minus_28, WY, WY + .elseif ((i & 7) == 2) + vpxor WY_minus_16, WY_TMP, WY_TMP + .elseif ((i & 7) == 3) + vpxor WY_TMP, WY, WY + .elseif ((i & 7) == 4) + vpslld $2, WY, WY_TMP + .elseif ((i & 7) == 5) + vpsrld $30, WY, WY + vpor WY, WY_TMP, WY + .elseif ((i & 7) == 7) + vpaddd K_XMM(K_BASE), WY, WY_TMP + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC r, s + .set i, \r + + .if (i < 40) + .set K_XMM, 32*0 + .elseif (i < 80) + .set K_XMM, 32*1 + .elseif (i < 120) + .set K_XMM, 32*2 + .else + .set K_XMM, 32*3 + .endif + + .if (i<32) + PRECALC_00_15 \s + .elseif (i<64) + PRECALC_16_31 \s + .elseif (i < 160) + PRECALC_32_79 \s + .endif +.endm + +.macro ROTATE_STATE + .set T_REG, E + .set E, D + .set D, C + .set C, B + .set B, TB + .set TB, A + .set A, T_REG + + .set T_REG, RE + .set RE, RD + .set RD, RC + .set RC, RB + .set RB, RTB + .set RTB, RA + .set RA, T_REG +.endm + +/* Macro relies on saved ROUND_Fx */ + +.macro RND_FUN f, r + .if (\f == RND_F1) + ROUND_F1 \r + .elseif (\f == RND_F2) + ROUND_F2 \r + .elseif (\f == RND_F3) + ROUND_F3 \r + .endif +.endm + +.macro RR r + .set round_id, (\r % 80) + + .if (round_id == 0) /* Precalculate F for first round */ + .set ROUND_FUNC, RND_F1 + mov B, TB + + rorx $(32-30), B, B /* b>>>2 */ + andn D, TB, T1 + and C, TB + xor T1, TB + .endif + + RND_FUN ROUND_FUNC, \r + ROTATE_STATE + + .if (round_id == 18) + .set ROUND_FUNC, RND_F2 + .elseif (round_id == 38) + .set ROUND_FUNC, RND_F3 + .elseif (round_id == 58) + .set ROUND_FUNC, RND_F2 + .endif + + .set round_id, ( (\r+1) % 80) + + RND_FUN ROUND_FUNC, (\r+1) + ROTATE_STATE +.endm + +.macro ROUND_F1 r + add WK(\r), E + + andn C, A, T1 /* ~b&d */ + lea (RE,RTB), E /* Add F from the previous round */ + + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + rorx $(32-30),A, TB /* b>>>2 for next round */ + + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + /* + * Calculate F for the next round + * (b & c) ^ andn[b, d] + */ + and B, A /* b&c */ + xor T1, A /* F1 = (b&c) ^ (~b&d) */ + + lea (RE,RTA), E /* E += A >>> 5 */ +.endm + +.macro ROUND_F2 r + add WK(\r), E + lea (RE,RTB), E /* Add F from the previous round */ + + /* Calculate F for the next round */ + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + .if ((round_id) < 79) + rorx $(32-30), A, TB /* b>>>2 for next round */ + .endif + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + .if ((round_id) < 79) + xor B, A + .endif + + add TA, E /* E += A >>> 5 */ + + .if ((round_id) < 79) + xor C, A + .endif +.endm + +.macro ROUND_F3 r + add WK(\r), E + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + lea (RE,RTB), E /* Add F from the previous round */ + + mov B, T1 + or A, T1 + + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + rorx $(32-30), A, TB /* b>>>2 for next round */ + + /* Calculate F for the next round + * (b and c) or (d and (b or c)) + */ + and C, T1 + and B, A + or T1, A + + add TA, E /* E += A >>> 5 */ + +.endm + +/* + * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining + */ +.macro SHA1_PIPELINED_MAIN_BODY + + REGALLOC + + mov (HASH_PTR), A + mov 4(HASH_PTR), B + mov 8(HASH_PTR), C + mov 12(HASH_PTR), D + mov 16(HASH_PTR), E + + mov %rsp, PRECALC_BUF + lea (2*4*80+32)(%rsp), WK_BUF + + # Precalc WK for first 2 blocks + PRECALC_OFFSET = 0 + .set i, 0 + .rept 160 + PRECALC i + .set i, i + 1 + .endr + PRECALC_OFFSET = 128 + xchg WK_BUF, PRECALC_BUF + + .align 32 +_loop: + /* + * code loops through more than one block + * we use K_BASE value as a signal of a last block, + * it is set below by: cmovae BUFFER_PTR, K_BASE + */ + cmp K_BASE, BUFFER_PTR + jne _begin + .align 32 + jmp _end + .align 32 +_begin: + + /* + * Do first block + * rounds: 0,2,4,6,8 + */ + .set j, 0 + .rept 5 + RR j + .set j, j+2 + .endr + + jmp _loop0 +_loop0: + + /* + * rounds: + * 10,12,14,16,18 + * 20,22,24,26,28 + * 30,32,34,36,38 + * 40,42,44,46,48 + * 50,52,54,56,58 + */ + .rept 25 + RR j + .set j, j+2 + .endr + + add $(2*64), BUFFER_PTR /* move to next odd-64-byte block */ + cmp BUFFER_END, BUFFER_PTR /* is current block the last one? */ + cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ + + /* + * rounds + * 60,62,64,66,68 + * 70,72,74,76,78 + */ + .rept 10 + RR j + .set j, j+2 + .endr + + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), TB + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E + + cmp K_BASE, BUFFER_PTR /* is current block the last one? */ + je _loop + + mov TB, B + + /* Process second block */ + /* + * rounds + * 0+80, 2+80, 4+80, 6+80, 8+80 + * 10+80,12+80,14+80,16+80,18+80 + */ + + .set j, 0 + .rept 10 + RR j+80 + .set j, j+2 + .endr + + jmp _loop1 +_loop1: + /* + * rounds + * 20+80,22+80,24+80,26+80,28+80 + * 30+80,32+80,34+80,36+80,38+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + jmp _loop2 +_loop2: + + /* + * rounds + * 40+80,42+80,44+80,46+80,48+80 + * 50+80,52+80,54+80,56+80,58+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + add $(2*64), BUFFER_PTR2 /* move to next even-64-byte block */ + + cmp BUFFER_END, BUFFER_PTR2 /* is current block the last one */ + cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ + + jmp _loop3 +_loop3: + + /* + * rounds + * 60+80,62+80,64+80,66+80,68+80 + * 70+80,72+80,74+80,76+80,78+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), TB + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E + + /* Reset state for AVX2 reg permutation */ + mov A, TA + mov TB, A + mov C, TB + mov E, C + mov D, B + mov TA, D + + REGALLOC + + xchg WK_BUF, PRECALC_BUF + + jmp _loop + + .align 32 + _end: + +.endm +/* + * macro implements SHA-1 function's body for several 64-byte blocks + * param: function's name + */ +.macro SHA1_VECTOR_ASM name + ENTRY(\name) + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + RESERVE_STACK = (W_SIZE*4 + 8+24) + + /* Align stack */ + mov %rsp, %rbx + and $~(0x20-1), %rsp + push %rbx + sub $RESERVE_STACK, %rsp + + avx2_zeroupper + + lea K_XMM_AR(%rip), K_BASE + + mov CTX, HASH_PTR + mov BUF, BUFFER_PTR + lea 64(BUF), BUFFER_PTR2 + + shl $6, CNT /* mul by 64 */ + add BUF, CNT + add $64, CNT + mov CNT, BUFFER_END + + cmp BUFFER_END, BUFFER_PTR2 + cmovae K_BASE, BUFFER_PTR2 + + xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP + + SHA1_PIPELINED_MAIN_BODY + + avx2_zeroupper + + add $RESERVE_STACK, %rsp + pop %rsp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret + + ENDPROC(\name) +.endm + +.section .rodata + +#define K1 0x5a827999 +#define K2 0x6ed9eba1 +#define K3 0x8f1bbcdc +#define K4 0xca62c1d6 + +.align 128 +K_XMM_AR: + .long K1, K1, K1, K1 + .long K1, K1, K1, K1 + .long K2, K2, K2, K2 + .long K2, K2, K2, K2 + .long K3, K3, K3, K3 + .long K3, K3, K3, K3 + .long K4, K4, K4, K4 + .long K4, K4, K4, K4 + +BSWAP_SHUFB_CTL: + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f +.text + +SHA1_VECTOR_ASM sha1_transform_avx2 diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 4a11a9d7245..74d16ef707c 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -10,6 +10,7 @@ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> * Copyright (c) Mathias Krause <minipli@googlemail.com> + * Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -39,6 +40,12 @@ asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, asmlinkage void sha1_transform_avx(u32 *digest, const char *data, unsigned int rounds); #endif +#ifdef CONFIG_AS_AVX2 +#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ + +asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, + unsigned int rounds); +#endif static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int); @@ -165,6 +172,18 @@ static int sha1_ssse3_import(struct shash_desc *desc, const void *in) return 0; } +#ifdef CONFIG_AS_AVX2 +static void sha1_apply_transform_avx2(u32 *digest, const char *data, + unsigned int rounds) +{ + /* Select the optimal transform based on data block size */ + if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE) + sha1_transform_avx2(digest, data, rounds); + else + sha1_transform_avx(digest, data, rounds); +} +#endif + static struct shash_alg alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_ssse3_init, @@ -201,27 +220,49 @@ static bool __init avx_usable(void) return true; } + +#ifdef CONFIG_AS_AVX2 +static bool __init avx2_usable(void) +{ + if (avx_usable() && cpu_has_avx2 && boot_cpu_has(X86_FEATURE_BMI1) && + boot_cpu_has(X86_FEATURE_BMI2)) + return true; + + return false; +} +#endif #endif static int __init sha1_ssse3_mod_init(void) { + char *algo_name; + /* test for SSSE3 first */ - if (cpu_has_ssse3) + if (cpu_has_ssse3) { sha1_transform_asm = sha1_transform_ssse3; + algo_name = "SSSE3"; + } #ifdef CONFIG_AS_AVX /* allow AVX to override SSSE3, it's a little faster */ - if (avx_usable()) + if (avx_usable()) { sha1_transform_asm = sha1_transform_avx; + algo_name = "AVX"; +#ifdef CONFIG_AS_AVX2 + /* allow AVX2 to override AVX, it's a little faster */ + if (avx2_usable()) { + sha1_transform_asm = sha1_apply_transform_avx2; + algo_name = "AVX2"; + } +#endif + } #endif if (sha1_transform_asm) { - pr_info("Using %s optimized SHA-1 implementation\n", - sha1_transform_asm == sha1_transform_ssse3 ? "SSSE3" - : "AVX"); + pr_info("Using %s optimized SHA-1 implementation\n", algo_name); return crypto_register_shash(&alg); } - pr_info("Neither AVX nor SSSE3 is available/usable.\n"); + pr_info("Neither AVX nor AVX2 nor SSSE3 is available/usable.\n"); return -ENODEV; } diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index a8fee078b92..3ca9762e164 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -5,4 +5,6 @@ genhdr-y += unistd_64.h genhdr-y += unistd_x32.h generic-y += clkdev.h +generic-y += early_ioremap.h +generic-y += cputime.h generic-y += mcs_spinlock.h diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index e6a92455740..69f1366f1aa 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -1,7 +1,7 @@ /* * This file is part of the Linux kernel. * - * Copyright (c) 2011, Intel Corporation + * Copyright (c) 2011-2014, Intel Corporation * Authors: Fenghua Yu <fenghua.yu@intel.com>, * H. Peter Anvin <hpa@linux.intel.com> * @@ -31,10 +31,13 @@ #define RDRAND_RETRY_LOOPS 10 #define RDRAND_INT ".byte 0x0f,0xc7,0xf0" +#define RDSEED_INT ".byte 0x0f,0xc7,0xf8" #ifdef CONFIG_X86_64 # define RDRAND_LONG ".byte 0x48,0x0f,0xc7,0xf0" +# define RDSEED_LONG ".byte 0x48,0x0f,0xc7,0xf8" #else # define RDRAND_LONG RDRAND_INT +# define RDSEED_LONG RDSEED_INT #endif #ifdef CONFIG_ARCH_RANDOM @@ -53,6 +56,16 @@ static inline int rdrand_long(unsigned long *v) return ok; } +/* A single attempt at RDSEED */ +static inline bool rdseed_long(unsigned long *v) +{ + unsigned char ok; + asm volatile(RDSEED_LONG "\n\t" + "setc %0" + : "=qm" (ok), "=a" (*v)); + return ok; +} + #define GET_RANDOM(name, type, rdrand, nop) \ static inline int name(type *v) \ { \ @@ -70,18 +83,40 @@ static inline int name(type *v) \ return ok; \ } +#define GET_SEED(name, type, rdseed, nop) \ +static inline int name(type *v) \ +{ \ + unsigned char ok; \ + alternative_io("movb $0, %0\n\t" \ + nop, \ + rdseed "\n\t" \ + "setc %0", \ + X86_FEATURE_RDSEED, \ + ASM_OUTPUT2("=q" (ok), "=a" (*v))); \ + return ok; \ +} + #ifdef CONFIG_X86_64 GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP5); GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP4); +GET_SEED(arch_get_random_seed_long, unsigned long, RDSEED_LONG, ASM_NOP5); +GET_SEED(arch_get_random_seed_int, unsigned int, RDSEED_INT, ASM_NOP4); + #else GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP3); GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP3); +GET_SEED(arch_get_random_seed_long, unsigned long, RDSEED_LONG, ASM_NOP4); +GET_SEED(arch_get_random_seed_int, unsigned int, RDSEED_INT, ASM_NOP4); + #endif /* CONFIG_X86_64 */ +#define arch_has_random() static_cpu_has(X86_FEATURE_RDRAND) +#define arch_has_random_seed() static_cpu_has(X86_FEATURE_RDSEED) + #else static inline int rdrand_long(unsigned long *v) @@ -89,6 +124,11 @@ static inline int rdrand_long(unsigned long *v) return 0; } +static inline bool rdseed_long(unsigned long *v) +{ + return 0; +} + #endif /* CONFIG_ARCH_RANDOM */ extern void x86_init_rdrand(struct cpuinfo_x86 *c); diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index b17f4f48ecd..6dd1c7dd047 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -7,6 +7,7 @@ #include <asm/alternative.h> #include <asm/cmpxchg.h> #include <asm/rmwcc.h> +#include <asm/barrier.h> /* * Atomic operations that C can't guarantee us. Useful for @@ -243,12 +244,6 @@ static inline void atomic_or_long(unsigned long *v1, unsigned long v2) : : "r" ((unsigned)(mask)), "m" (*(addr)) \ : "memory") -/* Atomic operations are already serializing on x86 */ -#define smp_mb__before_atomic_dec() barrier() -#define smp_mb__after_atomic_dec() barrier() -#define smp_mb__before_atomic_inc() barrier() -#define smp_mb__after_atomic_inc() barrier() - #ifdef CONFIG_X86_32 # include <asm/atomic64_32.h> #else diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 69bbb484502..5c7198cca5e 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -137,6 +137,10 @@ do { \ #endif +/* Atomic operations are already serializing on x86 */ +#define smp_mb__before_atomic() barrier() +#define smp_mb__after_atomic() barrier() + /* * Stop RDTSC speculation. This is needed when you need to use RDTSC * (or get_cycles or vread that possibly accesses the TSC) in a defined diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 9fc1af74dc8..afcd35d331d 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -15,6 +15,7 @@ #include <linux/compiler.h> #include <asm/alternative.h> #include <asm/rmwcc.h> +#include <asm/barrier.h> #if BITS_PER_LONG == 32 # define _BITOPS_LONG_SHIFT 5 @@ -102,7 +103,7 @@ static inline void __set_bit(long nr, volatile unsigned long *addr) * * clear_bit() is atomic and may not be reordered. However, it does * not contain a memory barrier, so if it is used for locking purposes, - * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() + * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic() * in order to ensure changes are visible on other processors. */ static __always_inline void @@ -156,9 +157,6 @@ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) __clear_bit(nr, addr); } -#define smp_mb__before_clear_bit() barrier() -#define smp_mb__after_clear_bit() barrier() - /** * __change_bit - Toggle a bit in memory * @nr: the bit to change diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 2f03ff018d3..ba38ebbaced 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -1,7 +1,6 @@ #ifndef _ASM_X86_BUG_H #define _ASM_X86_BUG_H -#ifdef CONFIG_BUG #define HAVE_ARCH_BUG #ifdef CONFIG_DEBUG_BUGVERBOSE @@ -33,8 +32,6 @@ do { \ } while (0) #endif -#endif /* !CONFIG_BUG */ - #include <asm-generic/bug.h> #endif /* _ASM_X86_BUG_H */ diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h index 16a57f4ed64..eda81dc0f4a 100644 --- a/arch/x86/include/asm/clocksource.h +++ b/arch/x86/include/asm/clocksource.h @@ -3,8 +3,6 @@ #ifndef _ASM_X86_CLOCKSOURCE_H #define _ASM_X86_CLOCKSOURCE_H -#ifdef CONFIG_X86_64 - #define VCLOCK_NONE 0 /* No vDSO clock available. */ #define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ #define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ @@ -14,6 +12,4 @@ struct arch_clocksource_data { int vclock_mode; }; -#endif /* CONFIG_X86_64 */ - #endif /* _ASM_X86_CLOCKSOURCE_H */ diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h new file mode 100644 index 00000000000..e01f7f7ccb0 --- /dev/null +++ b/arch/x86/include/asm/cmdline.h @@ -0,0 +1,6 @@ +#ifndef _ASM_X86_CMDLINE_H +#define _ASM_X86_CMDLINE_H + +int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); + +#endif /* _ASM_X86_CMDLINE_H */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index bc507d7640f..e265ff95d16 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -37,7 +37,7 @@ #define X86_FEATURE_PAT (0*32+16) /* Page Attribute Table */ #define X86_FEATURE_PSE36 (0*32+17) /* 36-bit PSEs */ #define X86_FEATURE_PN (0*32+18) /* Processor serial number */ -#define X86_FEATURE_CLFLSH (0*32+19) /* "clflush" CLFLUSH instruction */ +#define X86_FEATURE_CLFLUSH (0*32+19) /* CLFLUSH instruction */ #define X86_FEATURE_DS (0*32+21) /* "dts" Debug Store */ #define X86_FEATURE_ACPI (0*32+22) /* ACPI via MSR */ #define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */ @@ -318,7 +318,7 @@ extern const char * const x86_power_flags[32]; #define cpu_has_pmm_enabled boot_cpu_has(X86_FEATURE_PMM_EN) #define cpu_has_ds boot_cpu_has(X86_FEATURE_DS) #define cpu_has_pebs boot_cpu_has(X86_FEATURE_PEBS) -#define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLSH) +#define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLUSH) #define cpu_has_bts boot_cpu_has(X86_FEATURE_BTS) #define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES) #define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON) @@ -546,6 +546,13 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) #define static_cpu_has_bug(bit) static_cpu_has((bit)) #define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) +#define MAX_CPU_FEATURES (NCAPINTS * 32) +#define cpu_have_feature boot_cpu_has + +#define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X" +#define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \ + boot_cpu_data.x86_model + #endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */ #endif /* _ASM_X86_CPUFEATURE_H */ diff --git a/arch/x86/include/asm/cputime.h b/arch/x86/include/asm/cputime.h deleted file mode 100644 index 6d68ad7e0ea..00000000000 --- a/arch/x86/include/asm/cputime.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/cputime.h> diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 9c999c1674f..2c71182d30e 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -281,16 +281,12 @@ do { \ #define STACK_RND_MASK (0x7ff) -#define VDSO_HIGH_BASE (__fix_to_virt(FIX_VDSO)) - #define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled) /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ #else /* CONFIG_X86_32 */ -#define VDSO_HIGH_BASE 0xffffe000U /* CONFIG_COMPAT_VDSO address */ - /* 1GB for 64bit, 8MB for 32bit */ #define STACK_RND_MASK (test_thread_flag(TIF_ADDR32) ? 0x7ff : 0x3fffff) diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 7252cd33917..43f482a0db3 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -40,15 +40,8 @@ */ extern unsigned long __FIXADDR_TOP; #define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP) - -#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO) -#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1) #else #define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE) - -/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */ -#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL) -#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE) #endif @@ -74,7 +67,6 @@ extern unsigned long __FIXADDR_TOP; enum fixed_addresses { #ifdef CONFIG_X86_32 FIX_HOLE, - FIX_VDSO, #else VSYSCALL_LAST_PAGE, VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE @@ -98,12 +90,6 @@ enum fixed_addresses { FIX_IO_APIC_BASE_0, FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, #endif -#ifdef CONFIG_X86_VISWS_APIC - FIX_CO_CPU, /* Cobalt timer */ - FIX_CO_APIC, /* Cobalt APIC Redirection Table */ - FIX_LI_PCIA, /* Lithium PCI Bridge A */ - FIX_LI_PCIB, /* Lithium PCI Bridge B */ -#endif FIX_RO_IDT, /* Virtual mapping for read-only IDT */ #ifdef CONFIG_X86_32 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ @@ -177,5 +163,11 @@ static inline void __set_fixmap(enum fixed_addresses idx, #include <asm-generic/fixmap.h> +#define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags) +#define __late_clear_fixmap(idx) __set_fixmap(idx, 0, __pgprot(0)) + +void __early_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags); + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_FIXMAP_H */ diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h index d3d74698dce..1c7eefe3250 100644 --- a/arch/x86/include/asm/floppy.h +++ b/arch/x86/include/asm/floppy.h @@ -145,10 +145,10 @@ static int fd_request_irq(void) { if (can_use_virtual_dma) return request_irq(FLOPPY_IRQ, floppy_hardint, - IRQF_DISABLED, "floppy", NULL); + 0, "floppy", NULL); else return request_irq(FLOPPY_IRQ, floppy_interrupt, - IRQF_DISABLED, "floppy", NULL); + 0, "floppy", NULL); } static unsigned long dma_mem_alloc(unsigned long size) diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index ab0ae1aa6d0..230853da4ec 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -33,6 +33,9 @@ typedef struct { #ifdef CONFIG_X86_MCE_THRESHOLD unsigned int irq_threshold_count; #endif +#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) + unsigned int irq_hv_callback_count; +#endif } ____cacheline_aligned irq_cpustat_t; DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index b18df579c0e..36f7125945e 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -63,6 +63,7 @@ /* hpet memory map physical address */ extern unsigned long hpet_address; extern unsigned long force_hpet_address; +extern int boot_hpet_disable; extern u8 hpet_blockid; extern int hpet_force_user; extern u8 hpet_msi_disable; diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index a8091216963..68c05398bba 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -52,6 +52,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { + ptep_clear_flush(vma, addr, ptep); } static inline int huge_pte_none(pte_t pte) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 67d69b8e2d2..4615906d83d 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -98,7 +98,6 @@ extern void trace_call_function_single_interrupt(void); #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs)) extern unsigned long io_apic_irqs; -extern void init_VISWS_APIC_irqs(void); extern void setup_IO_APIC(void); extern void disable_IO_APIC(void); @@ -191,8 +190,8 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); #define trace_interrupt interrupt #endif -#define VECTOR_UNDEFINED -1 -#define VECTOR_RETRIGGERED -2 +#define VECTOR_UNDEFINED (-1) +#define VECTOR_RETRIGGERED (-2) typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 91d9c69a629..b8237d8a1e0 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -39,6 +39,7 @@ #include <linux/string.h> #include <linux/compiler.h> #include <asm/page.h> +#include <asm/early_ioremap.h> #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -316,19 +317,6 @@ extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, unsigned long prot_val); extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); -/* - * early_ioremap() and early_iounmap() are for temporary early boot-time - * mappings, before the real ioremap() is functional. - * A boot-time mapping is currently limited to at most 16 pages. - */ -extern void early_ioremap_init(void); -extern void early_ioremap_reset(void); -extern void __iomem *early_ioremap(resource_size_t phys_addr, - unsigned long size); -extern void __iomem *early_memremap(resource_size_t phys_addr, - unsigned long size); -extern void early_iounmap(void __iomem *addr, unsigned long size); -extern void fixup_early_ioremap(void); extern bool is_early_ioremap_ptep(pte_t *ptep); #ifdef CONFIG_XEN diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h index 8e71c794176..57995f0596a 100644 --- a/arch/x86/include/asm/iosf_mbi.h +++ b/arch/x86/include/asm/iosf_mbi.h @@ -50,6 +50,32 @@ #define BT_MBI_PCIE_READ 0x00 #define BT_MBI_PCIE_WRITE 0x01 +/* Quark available units */ +#define QRK_MBI_UNIT_HBA 0x00 +#define QRK_MBI_UNIT_HB 0x03 +#define QRK_MBI_UNIT_RMU 0x04 +#define QRK_MBI_UNIT_MM 0x05 +#define QRK_MBI_UNIT_MMESRAM 0x05 +#define QRK_MBI_UNIT_SOC 0x31 + +/* Quark read/write opcodes */ +#define QRK_MBI_HBA_READ 0x10 +#define QRK_MBI_HBA_WRITE 0x11 +#define QRK_MBI_HB_READ 0x10 +#define QRK_MBI_HB_WRITE 0x11 +#define QRK_MBI_RMU_READ 0x10 +#define QRK_MBI_RMU_WRITE 0x11 +#define QRK_MBI_MM_READ 0x10 +#define QRK_MBI_MM_WRITE 0x11 +#define QRK_MBI_MMESRAM_READ 0x12 +#define QRK_MBI_MMESRAM_WRITE 0x13 +#define QRK_MBI_SOC_READ 0x06 +#define QRK_MBI_SOC_WRITE 0x07 + +#if IS_ENABLED(CONFIG_IOSF_MBI) + +bool iosf_mbi_available(void); + /** * iosf_mbi_read() - MailBox Interface read command * @port: port indicating subunit being accessed @@ -87,4 +113,33 @@ int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr); */ int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask); +#else /* CONFIG_IOSF_MBI is not enabled */ +static inline +bool iosf_mbi_available(void) +{ + return false; +} + +static inline +int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr) +{ + WARN(1, "IOSF_MBI driver not available"); + return -EPERM; +} + +static inline +int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr) +{ + WARN(1, "IOSF_MBI driver not available"); + return -EPERM; +} + +static inline +int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask) +{ + WARN(1, "IOSF_MBI driver not available"); + return -EPERM; +} +#endif /* CONFIG_IOSF_MBI */ + #endif /* IOSF_MBI_SYMS_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fdf83afbb7d..7de069afb38 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -60,7 +60,7 @@ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ - | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) + | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) @@ -337,6 +337,11 @@ struct kvm_pmu { u64 reprogram_pmi; }; +enum { + KVM_DEBUGREG_BP_ENABLED = 1, + KVM_DEBUGREG_WONT_EXIT = 2, +}; + struct kvm_vcpu_arch { /* * rip and regs accesses must go through @@ -444,7 +449,6 @@ struct kvm_vcpu_arch { } st; u64 last_guest_tsc; - u64 last_kernel_ns; u64 last_host_tsc; u64 tsc_offset_adjustment; u64 this_tsc_nsec; @@ -464,7 +468,7 @@ struct kvm_vcpu_arch { struct mtrr_state_type mtrr_state; u32 pat; - int switch_db_regs; + unsigned switch_db_regs; unsigned long db[KVM_NR_DB_REGS]; unsigned long dr6; unsigned long dr7; @@ -599,6 +603,8 @@ struct kvm_arch { bool use_master_clock; u64 master_kernel_ns; cycle_t master_cycle_now; + struct delayed_work kvmclock_update_work; + struct delayed_work kvmclock_sync_work; struct kvm_xen_hvm_config xen_hvm_config; @@ -702,6 +708,7 @@ struct kvm_x86_ops { void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); u64 (*get_dr6)(struct kvm_vcpu *vcpu); void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value); + void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu); void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); @@ -728,8 +735,8 @@ struct kvm_x86_ops { int (*nmi_allowed)(struct kvm_vcpu *vcpu); bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); - int (*enable_nmi_window)(struct kvm_vcpu *vcpu); - int (*enable_irq_window)(struct kvm_vcpu *vcpu); + void (*enable_nmi_window)(struct kvm_vcpu *vcpu); + void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); int (*vm_has_apicv)(struct kvm *kvm); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); @@ -765,6 +772,9 @@ struct kvm_x86_ops { struct x86_instruction_info *info, enum x86_intercept_stage stage); void (*handle_external_intr)(struct kvm_vcpu *vcpu); + bool (*mpx_supported)(void); + + int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); }; struct kvm_arch_async_pf { diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index b59827e7652..64dc362506b 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -25,6 +25,7 @@ struct cpu_signature { struct device; enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; +extern bool dis_ucode_ldr; struct microcode_ops { enum ucode_state (*request_microcode_user) (int cpu, diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 8a9b3e288cb..1ec990bd7dc 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -11,9 +11,6 @@ #ifdef CONFIG_NUMA extern struct pglist_data *node_data[]; #define NODE_DATA(nid) (node_data[nid]) - -#include <asm/numaq.h> - #endif /* CONFIG_NUMA */ #ifdef CONFIG_DISCONTIGMEM diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 3e6b4920ef5..f5a61795673 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -25,12 +25,6 @@ extern int pic_mode; extern unsigned int def_to_bigsmp; -#ifdef CONFIG_X86_NUMAQ -extern int mp_bus_id_to_node[MAX_MP_BUSSES]; -extern int mp_bus_id_to_local[MAX_MP_BUSSES]; -extern int quad_local_to_mp_bus_id [NR_CPUS/4][4]; -#endif - #else /* CONFIG_X86_64: */ #define MAX_MP_BUSSES 256 diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index cd9c41938b8..c163215abb9 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -2,6 +2,7 @@ #define _ASM_X86_MSHYPER_H #include <linux/types.h> +#include <linux/interrupt.h> #include <asm/hyperv.h> struct ms_hyperv_info { @@ -16,6 +17,7 @@ void hyperv_callback_vector(void); #define trace_hyperv_callback_vector hyperv_callback_vector #endif void hyperv_vector_handler(struct pt_regs *regs); -void hv_register_vmbus_handler(int irq, irq_handler_t handler); +void hv_setup_vmbus_irq(void (*handler)(void)); +void hv_remove_vmbus_irq(void); #endif diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h deleted file mode 100644 index c3b3c322fd8..00000000000 --- a/arch/x86/include/asm/numaq.h +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Written by: Patricia Gaughen, IBM Corporation - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send feedback to <gone@us.ibm.com> - */ - -#ifndef _ASM_X86_NUMAQ_H -#define _ASM_X86_NUMAQ_H - -#ifdef CONFIG_X86_NUMAQ - -extern int found_numaq; -extern int numaq_numa_init(void); -extern int pci_numaq_init(void); - -extern void *xquad_portio; - -#define XQUAD_PORTIO_BASE 0xfe400000 -#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ -#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port) - -/* - * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the - */ -#define SYS_CFG_DATA_PRIV_ADDR 0x0009d000 /* place for scd in private - quad space */ - -/* - * Communication area for each processor on lynxer-processor tests. - * - * NOTE: If you change the size of this eachproc structure you need - * to change the definition for EACH_QUAD_SIZE. - */ -struct eachquadmem { - unsigned int priv_mem_start; /* Starting address of this */ - /* quad's private memory. */ - /* This is always 0. */ - /* In MB. */ - unsigned int priv_mem_size; /* Size of this quad's */ - /* private memory. */ - /* In MB. */ - unsigned int low_shrd_mem_strp_start;/* Starting address of this */ - /* quad's low shared block */ - /* (untranslated). */ - /* In MB. */ - unsigned int low_shrd_mem_start; /* Starting address of this */ - /* quad's low shared memory */ - /* (untranslated). */ - /* In MB. */ - unsigned int low_shrd_mem_size; /* Size of this quad's low */ - /* shared memory. */ - /* In MB. */ - unsigned int lmmio_copb_start; /* Starting address of this */ - /* quad's local memory */ - /* mapped I/O in the */ - /* compatibility OPB. */ - /* In MB. */ - unsigned int lmmio_copb_size; /* Size of this quad's local */ - /* memory mapped I/O in the */ - /* compatibility OPB. */ - /* In MB. */ - unsigned int lmmio_nopb_start; /* Starting address of this */ - /* quad's local memory */ - /* mapped I/O in the */ - /* non-compatibility OPB. */ - /* In MB. */ - unsigned int lmmio_nopb_size; /* Size of this quad's local */ - /* memory mapped I/O in the */ - /* non-compatibility OPB. */ - /* In MB. */ - unsigned int io_apic_0_start; /* Starting address of I/O */ - /* APIC 0. */ - unsigned int io_apic_0_sz; /* Size I/O APIC 0. */ - unsigned int io_apic_1_start; /* Starting address of I/O */ - /* APIC 1. */ - unsigned int io_apic_1_sz; /* Size I/O APIC 1. */ - unsigned int hi_shrd_mem_start; /* Starting address of this */ - /* quad's high shared memory.*/ - /* In MB. */ - unsigned int hi_shrd_mem_size; /* Size of this quad's high */ - /* shared memory. */ - /* In MB. */ - unsigned int mps_table_addr; /* Address of this quad's */ - /* MPS tables from BIOS, */ - /* in system space.*/ - unsigned int lcl_MDC_pio_addr; /* Port-I/O address for */ - /* local access of MDC. */ - unsigned int rmt_MDC_mmpio_addr; /* MM-Port-I/O address for */ - /* remote access of MDC. */ - unsigned int mm_port_io_start; /* Starting address of this */ - /* quad's memory mapped Port */ - /* I/O space. */ - unsigned int mm_port_io_size; /* Size of this quad's memory*/ - /* mapped Port I/O space. */ - unsigned int mm_rmt_io_apic_start; /* Starting address of this */ - /* quad's memory mapped */ - /* remote I/O APIC space. */ - unsigned int mm_rmt_io_apic_size; /* Size of this quad's memory*/ - /* mapped remote I/O APIC */ - /* space. */ - unsigned int mm_isa_start; /* Starting address of this */ - /* quad's memory mapped ISA */ - /* space (contains MDC */ - /* memory space). */ - unsigned int mm_isa_size; /* Size of this quad's memory*/ - /* mapped ISA space (contains*/ - /* MDC memory space). */ - unsigned int rmt_qmi_addr; /* Remote addr to access QMI.*/ - unsigned int lcl_qmi_addr; /* Local addr to access QMI. */ -}; - -/* - * Note: This structure must be NOT be changed unless the multiproc and - * OS are changed to reflect the new structure. - */ -struct sys_cfg_data { - unsigned int quad_id; - unsigned int bsp_proc_id; /* Boot Strap Processor in this quad. */ - unsigned int scd_version; /* Version number of this table. */ - unsigned int first_quad_id; - unsigned int quads_present31_0; /* 1 bit for each quad */ - unsigned int quads_present63_32; /* 1 bit for each quad */ - unsigned int config_flags; - unsigned int boot_flags; - unsigned int csr_start_addr; /* Absolute value (not in MB) */ - unsigned int csr_size; /* Absolute value (not in MB) */ - unsigned int lcl_apic_start_addr; /* Absolute value (not in MB) */ - unsigned int lcl_apic_size; /* Absolute value (not in MB) */ - unsigned int low_shrd_mem_base; /* 0 or 512MB or 1GB */ - unsigned int low_shrd_mem_quad_offset; /* 0,128M,256M,512M,1G */ - /* may not be totally populated */ - unsigned int split_mem_enbl; /* 0 for no low shared memory */ - unsigned int mmio_sz; /* Size of total system memory mapped I/O */ - /* (in MB). */ - unsigned int quad_spin_lock; /* Spare location used for quad */ - /* bringup. */ - unsigned int nonzero55; /* For checksumming. */ - unsigned int nonzeroaa; /* For checksumming. */ - unsigned int scd_magic_number; - unsigned int system_type; - unsigned int checksum; - /* - * memory configuration area for each quad - */ - struct eachquadmem eq[MAX_NUMNODES]; /* indexed by quad id */ -}; - -void numaq_tsc_disable(void); - -#endif /* CONFIG_X86_NUMAQ */ -#endif /* _ASM_X86_NUMAQ_H */ - diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 8de6d9cf3b9..678205195ae 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_PAGE_64_DEFS_H #define _ASM_X86_PAGE_64_DEFS_H -#define THREAD_SIZE_ORDER 1 +#define THREAD_SIZE_ORDER 2 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) #define CURRENT_MASK (~(THREAD_SIZE - 1)) diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 1ac6114c9ea..0892ea0e683 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -26,11 +26,6 @@ extern int pci_routeirq; extern int noioapicquirk; extern int noioapicreroute; -/* scan a bus after allocating a pci_sysdata for it */ -extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, - int node); -extern struct pci_bus *pci_scan_bus_with_sysdata(int busno); - #ifdef CONFIG_PCI #ifdef CONFIG_PCI_DOMAINS @@ -70,10 +65,9 @@ extern unsigned long pci_mem_start; extern int pcibios_enabled; void pcibios_config_init(void); -struct pci_bus *pcibios_scan_root(int bus); +void pcibios_scan_root(int bus); void pcibios_set_master(struct pci_dev *dev); -void pcibios_penalize_isa_irq(int irq, int active); struct irq_routing_table *pcibios_get_irq_routing_table(void); int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 94220d14d5c..851bcdc5db0 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -52,7 +52,7 @@ * Compared to the generic __my_cpu_offset version, the following * saves one instruction and avoids clobbering a temp register. */ -#define __this_cpu_ptr(ptr) \ +#define raw_cpu_ptr(ptr) \ ({ \ unsigned long tcp_ptr__; \ __verify_pcpu_ptr(ptr); \ @@ -362,25 +362,25 @@ do { \ */ #define this_cpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var))) -#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define __this_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) - -#define __this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) -#define __this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) -#define __this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) -#define __this_cpu_add_1(pcp, val) percpu_add_op((pcp), val) -#define __this_cpu_add_2(pcp, val) percpu_add_op((pcp), val) -#define __this_cpu_add_4(pcp, val) percpu_add_op((pcp), val) -#define __this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) -#define __this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) -#define __this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) -#define __this_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val) -#define __this_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val) -#define __this_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val) -#define __this_cpu_xchg_1(pcp, val) percpu_xchg_op(pcp, val) -#define __this_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val) -#define __this_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val) +#define raw_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define raw_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define raw_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) + +#define raw_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) +#define raw_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) +#define raw_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) +#define raw_cpu_add_1(pcp, val) percpu_add_op((pcp), val) +#define raw_cpu_add_2(pcp, val) percpu_add_op((pcp), val) +#define raw_cpu_add_4(pcp, val) percpu_add_op((pcp), val) +#define raw_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) +#define raw_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) +#define raw_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) +#define raw_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val) +#define raw_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val) +#define raw_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val) +#define raw_cpu_xchg_1(pcp, val) percpu_xchg_op(pcp, val) +#define raw_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val) +#define raw_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val) #define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) #define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) @@ -401,16 +401,16 @@ do { \ #define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) #define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) -#define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) -#define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) -#define __this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) -#define __this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define __this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define __this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) +#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) +#define raw_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) +#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) +#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) +#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) +#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) #define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) @@ -427,7 +427,7 @@ do { \ __ret; \ }) -#define __this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double +#define raw_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double #define this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double #endif /* CONFIG_X86_CMPXCHG64 */ @@ -436,22 +436,22 @@ do { \ * 32 bit must fall back to generic operations. */ #ifdef CONFIG_X86_64 -#define __this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define __this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) -#define __this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) -#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) -#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) -#define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) -#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) -#define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) - -#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) -#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) -#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) -#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) -#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) +#define raw_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define raw_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) +#define raw_cpu_add_8(pcp, val) percpu_add_op((pcp), val) +#define raw_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) +#define raw_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) +#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) +#define raw_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) +#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) + +#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) +#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) +#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) +#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) +#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) +#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) #define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) /* @@ -474,7 +474,7 @@ do { \ __ret; \ }) -#define __this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double +#define raw_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double #define this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double #endif @@ -495,9 +495,9 @@ static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr, unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG; #ifdef CONFIG_X86_64 - return ((1UL << (nr % BITS_PER_LONG)) & __this_cpu_read_8(*a)) != 0; + return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_8(*a)) != 0; #else - return ((1UL << (nr % BITS_PER_LONG)) & __this_cpu_read_4(*a)) != 0; + return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_4(*a)) != 0; #endif } diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 708f19fb4fc..eb3d4494513 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -214,13 +214,8 @@ #ifdef CONFIG_X86_64 #define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC #else -/* - * For PDE_IDENT_ATTR include USER bit. As the PDE and PTE protection - * bits are combined, this will alow user to access the high address mapped - * VDSO in the presence of CONFIG_COMPAT_VDSO - */ #define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */ -#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */ +#define PDE_IDENT_ATTR 0x063 /* PRESENT+RW+DIRTY+ACCESSED */ #define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ #endif diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index c8b051933b1..7024c12f7bf 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -19,12 +19,12 @@ DECLARE_PER_CPU(int, __preempt_count); */ static __always_inline int preempt_count(void) { - return __this_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED; + return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED; } static __always_inline void preempt_count_set(int pc) { - __this_cpu_write_4(__preempt_count, pc); + raw_cpu_write_4(__preempt_count, pc); } /* @@ -53,17 +53,17 @@ static __always_inline void preempt_count_set(int pc) static __always_inline void set_preempt_need_resched(void) { - __this_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED); + raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED); } static __always_inline void clear_preempt_need_resched(void) { - __this_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED); + raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED); } static __always_inline bool test_preempt_need_resched(void) { - return !(__this_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED); + return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED); } /* @@ -72,12 +72,12 @@ static __always_inline bool test_preempt_need_resched(void) static __always_inline void __preempt_count_add(int val) { - __this_cpu_add_4(__preempt_count, val); + raw_cpu_add_4(__preempt_count, val); } static __always_inline void __preempt_count_sub(int val) { - __this_cpu_add_4(__preempt_count, -val); + raw_cpu_add_4(__preempt_count, -val); } /* @@ -95,7 +95,7 @@ static __always_inline bool __preempt_count_dec_and_test(void) */ static __always_inline bool should_resched(void) { - return unlikely(!__this_cpu_read_4(__preempt_count)); + return unlikely(!raw_cpu_read_4(__preempt_count)); } #ifdef CONFIG_PREEMPT diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index fdedd38fd0f..a4ea02351f4 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -449,6 +449,15 @@ struct stack_canary { }; DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif +/* + * per-CPU IRQ handling stacks + */ +struct irq_stack { + u32 stack[THREAD_SIZE/sizeof(u32)]; +} __aligned(THREAD_SIZE); + +DECLARE_PER_CPU(struct irq_stack *, hardirq_stack); +DECLARE_PER_CPU(struct irq_stack *, softirq_stack); #endif /* X86_64 */ extern unsigned int xstate_size; diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index d62c9f809bc..9264f04a4c5 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -39,12 +39,6 @@ static inline void vsmp_init(void) { } void setup_bios_corruption_check(void); -#ifdef CONFIG_X86_VISWS -extern void visws_early_detect(void); -#else -static inline void visws_early_detect(void) { } -#endif - extern unsigned long saved_video_mode; extern void reserve_standard_io_resources(void); diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h index 05af3b31d52..f28a24b51dc 100644 --- a/arch/x86/include/asm/sync_bitops.h +++ b/arch/x86/include/asm/sync_bitops.h @@ -41,7 +41,7 @@ static inline void sync_set_bit(long nr, volatile unsigned long *addr) * * sync_clear_bit() is atomic and may not be reordered. However, it does * not contain a memory barrier, so if it is used for locking purposes, - * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() + * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic() * in order to ensure changes are visible on other processors. */ static inline void sync_clear_bit(long nr, volatile unsigned long *addr) diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index aea284b4131..d6a756ae04c 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -13,7 +13,7 @@ #ifndef _ASM_X86_SYSCALL_H #define _ASM_X86_SYSCALL_H -#include <linux/audit.h> +#include <uapi/linux/audit.h> #include <linux/sched.h> #include <linux/err.h> #include <asm/asm-offsets.h> /* For NR_syscalls */ @@ -91,8 +91,7 @@ static inline void syscall_set_arguments(struct task_struct *task, memcpy(®s->bx + i, args, n * sizeof(args[0])); } -static inline int syscall_get_arch(struct task_struct *task, - struct pt_regs *regs) +static inline int syscall_get_arch(void) { return AUDIT_ARCH_I386; } @@ -221,8 +220,7 @@ static inline void syscall_set_arguments(struct task_struct *task, } } -static inline int syscall_get_arch(struct task_struct *task, - struct pt_regs *regs) +static inline int syscall_get_arch(void) { #ifdef CONFIG_IA32_EMULATION /* @@ -234,7 +232,7 @@ static inline int syscall_get_arch(struct task_struct *task, * * x32 tasks should be considered AUDIT_ARCH_X86_64. */ - if (task_thread_info(task)->status & TS_COMPAT) + if (task_thread_info(current)->status & TS_COMPAT) return AUDIT_ARCH_I386; #endif /* Both x32 and x86_64 are considered "64-bit". */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e1940c06ed0..854053889d4 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -9,6 +9,7 @@ #include <linux/compiler.h> #include <asm/page.h> +#include <asm/percpu.h> #include <asm/types.h> /* @@ -32,12 +33,6 @@ struct thread_info { mm_segment_t addr_limit; struct restart_block restart_block; void __user *sysenter_return; -#ifdef CONFIG_X86_32 - unsigned long previous_esp; /* ESP of the previous stack in - case of nested (IRQ) stacks - */ - __u8 supervisor_stack[0]; -#endif unsigned int sig_on_uaccess_error:1; unsigned int uaccess_err:1; /* uaccess failed */ }; @@ -88,6 +83,7 @@ struct thread_info { #define TIF_FORK 18 /* ret_from_fork */ #define TIF_NOHZ 19 /* in adaptive nohz mode */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ +#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ @@ -111,6 +107,7 @@ struct thread_info { #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) #define _TIF_NOHZ (1 << TIF_NOHZ) +#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) @@ -153,9 +150,9 @@ struct thread_info { #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) -#ifdef CONFIG_X86_32 +#define STACK_WARN (THREAD_SIZE/8) +#define KERNEL_STACK_OFFSET (5*(BITS_PER_LONG/8)) -#define STACK_WARN (THREAD_SIZE/8) /* * macros/functions for gaining access to the thread information structure * @@ -163,42 +160,6 @@ struct thread_info { */ #ifndef __ASSEMBLY__ -#define current_stack_pointer ({ \ - unsigned long sp; \ - asm("mov %%esp,%0" : "=g" (sp)); \ - sp; \ -}) - -/* how to get the thread information struct from C */ -static inline struct thread_info *current_thread_info(void) -{ - return (struct thread_info *) - (current_stack_pointer & ~(THREAD_SIZE - 1)); -} - -#else /* !__ASSEMBLY__ */ - -/* how to get the thread information struct from ASM */ -#define GET_THREAD_INFO(reg) \ - movl $-THREAD_SIZE, reg; \ - andl %esp, reg - -/* use this one if reg already contains %esp */ -#define GET_THREAD_INFO_WITH_ESP(reg) \ - andl $-THREAD_SIZE, reg - -#endif - -#else /* X86_32 */ - -#include <asm/percpu.h> -#define KERNEL_STACK_OFFSET (5*8) - -/* - * macros/functions for gaining access to the thread information structure - * preempt_count needs to be 1 initially, until the scheduler is functional. - */ -#ifndef __ASSEMBLY__ DECLARE_PER_CPU(unsigned long, kernel_stack); static inline struct thread_info *current_thread_info(void) @@ -213,8 +174,8 @@ static inline struct thread_info *current_thread_info(void) /* how to get the thread information struct from ASM */ #define GET_THREAD_INFO(reg) \ - movq PER_CPU_VAR(kernel_stack),reg ; \ - subq $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg + _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \ + _ASM_SUB $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg ; /* * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in @@ -224,8 +185,6 @@ static inline struct thread_info *current_thread_info(void) #endif -#endif /* !X86_32 */ - /* * Thread-synchronous status. * @@ -234,8 +193,6 @@ static inline struct thread_info *current_thread_info(void) * have to worry about atomic accesses. */ #define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ -#define TS_POLLING 0x0004 /* idle task polling need_resched, - skip sending interrupt */ #define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */ #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index b28097e4c8c..0e8f04f2c26 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -132,19 +132,7 @@ static inline void arch_fix_phys_package_id(int num, u32 slot) } struct pci_bus; +int x86_pci_root_bus_node(int bus); void x86_pci_root_bus_resources(int bus, struct list_head *resources); -#ifdef CONFIG_NUMA -extern int get_mp_bus_to_node(int busnum); -extern void set_mp_bus_to_node(int busnum, int node); -#else -static inline int get_mp_bus_to_node(int busnum) -{ - return 0; -} -static inline void set_mp_bus_to_node(int busnum, int node) -{ -} -#endif - #endif /* _ASM_X86_TOPOLOGY_H */ diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index c2a48139c34..3f556c6a015 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -23,6 +23,9 @@ # include <asm/unistd_64.h> # include <asm/unistd_64_x32.h> # define __ARCH_WANT_COMPAT_SYS_TIME +# define __ARCH_WANT_COMPAT_SYS_GETDENTS64 +# define __ARCH_WANT_COMPAT_SYS_PREADV64 +# define __ARCH_WANT_COMPAT_SYS_PWRITEV64 # endif diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 3087ea9c5f2..93bee7b9385 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -33,15 +33,27 @@ typedef u8 uprobe_opcode_t; #define UPROBE_SWBP_INSN 0xcc #define UPROBE_SWBP_INSN_SIZE 1 +struct uprobe_xol_ops; + struct arch_uprobe { - u16 fixups; union { u8 insn[MAX_UINSN_BYTES]; u8 ixol[MAX_UINSN_BYTES]; }; + + u16 fixups; + const struct uprobe_xol_ops *ops; + + union { #ifdef CONFIG_X86_64 - unsigned long rip_rela_target_address; + unsigned long rip_rela_target_address; #endif + struct { + s32 offs; + u8 ilen; + u8 opc1; + } branch; + }; }; struct arch_uprobe_task { diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index fddb53d6391..d1dc55404ff 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -1,8 +1,45 @@ #ifndef _ASM_X86_VDSO_H #define _ASM_X86_VDSO_H +#include <asm/page_types.h> +#include <linux/linkage.h> + +#ifdef __ASSEMBLER__ + +#define DEFINE_VDSO_IMAGE(symname, filename) \ +__PAGE_ALIGNED_DATA ; \ + .globl symname##_start, symname##_end ; \ + .align PAGE_SIZE ; \ + symname##_start: ; \ + .incbin filename ; \ + symname##_end: ; \ + .align PAGE_SIZE /* extra data here leaks to userspace. */ ; \ + \ +.previous ; \ + \ + .globl symname##_pages ; \ + .bss ; \ + .align 8 ; \ + .type symname##_pages, @object ; \ + symname##_pages: ; \ + .zero (symname##_end - symname##_start + PAGE_SIZE - 1) / PAGE_SIZE * (BITS_PER_LONG / 8) ; \ + .size symname##_pages, .-symname##_pages + +#else + +#define DECLARE_VDSO_IMAGE(symname) \ + extern char symname##_start[], symname##_end[]; \ + extern struct page *symname##_pages[] + #if defined CONFIG_X86_32 || defined CONFIG_COMPAT -extern const char VDSO32_PRELINK[]; + +#include <asm/vdso32.h> + +DECLARE_VDSO_IMAGE(vdso32_int80); +#ifdef CONFIG_COMPAT +DECLARE_VDSO_IMAGE(vdso32_syscall); +#endif +DECLARE_VDSO_IMAGE(vdso32_sysenter); /* * Given a pointer to the vDSO image, find the pointer to VDSO32_name @@ -11,8 +48,7 @@ extern const char VDSO32_PRELINK[]; #define VDSO32_SYMBOL(base, name) \ ({ \ extern const char VDSO32_##name[]; \ - (void __user *)(VDSO32_##name - VDSO32_PRELINK + \ - (unsigned long)(base)); \ + (void __user *)(VDSO32_##name + (unsigned long)(base)); \ }) #endif @@ -23,12 +59,8 @@ extern const char VDSO32_PRELINK[]; extern void __user __kernel_sigreturn; extern void __user __kernel_rt_sigreturn; -/* - * These symbols are defined by vdso32.S to mark the bounds - * of the ELF DSO images included therein. - */ -extern const char vdso32_int80_start, vdso32_int80_end; -extern const char vdso32_syscall_start, vdso32_syscall_end; -extern const char vdso32_sysenter_start, vdso32_sysenter_end; +void __init patch_vdso32(void *vdso, size_t len); + +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_VDSO_H */ diff --git a/arch/x86/include/asm/vdso32.h b/arch/x86/include/asm/vdso32.h new file mode 100644 index 00000000000..7efb7018406 --- /dev/null +++ b/arch/x86/include/asm/vdso32.h @@ -0,0 +1,11 @@ +#ifndef _ASM_X86_VDSO32_H +#define _ASM_X86_VDSO32_H + +#define VDSO_BASE_PAGE 0 +#define VDSO_VVAR_PAGE 1 +#define VDSO_HPET_PAGE 2 +#define VDSO_PAGES 3 +#define VDSO_PREV_PAGES 2 +#define VDSO_OFFSET(x) ((x) * PAGE_SIZE) + +#endif diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 46e24d36b7d..3c3366c2e37 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -1,30 +1,73 @@ #ifndef _ASM_X86_VGTOD_H #define _ASM_X86_VGTOD_H -#include <asm/vsyscall.h> +#include <linux/compiler.h> #include <linux/clocksource.h> +#ifdef BUILD_VDSO32_64 +typedef u64 gtod_long_t; +#else +typedef unsigned long gtod_long_t; +#endif +/* + * vsyscall_gtod_data will be accessed by 32 and 64 bit code at the same time + * so be carefull by modifying this structure. + */ struct vsyscall_gtod_data { - seqcount_t seq; + unsigned seq; - struct { /* extract of a clocksource struct */ - int vclock_mode; - cycle_t cycle_last; - cycle_t mask; - u32 mult; - u32 shift; - } clock; + int vclock_mode; + cycle_t cycle_last; + cycle_t mask; + u32 mult; + u32 shift; /* open coded 'struct timespec' */ - time_t wall_time_sec; u64 wall_time_snsec; + gtod_long_t wall_time_sec; + gtod_long_t monotonic_time_sec; u64 monotonic_time_snsec; - time_t monotonic_time_sec; + gtod_long_t wall_time_coarse_sec; + gtod_long_t wall_time_coarse_nsec; + gtod_long_t monotonic_time_coarse_sec; + gtod_long_t monotonic_time_coarse_nsec; - struct timezone sys_tz; - struct timespec wall_time_coarse; - struct timespec monotonic_time_coarse; + int tz_minuteswest; + int tz_dsttime; }; extern struct vsyscall_gtod_data vsyscall_gtod_data; +static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s) +{ + unsigned ret; + +repeat: + ret = ACCESS_ONCE(s->seq); + if (unlikely(ret & 1)) { + cpu_relax(); + goto repeat; + } + smp_rmb(); + return ret; +} + +static inline int gtod_read_retry(const struct vsyscall_gtod_data *s, + unsigned start) +{ + smp_rmb(); + return unlikely(s->seq != start); +} + +static inline void gtod_write_begin(struct vsyscall_gtod_data *s) +{ + ++s->seq; + smp_wmb(); +} + +static inline void gtod_write_end(struct vsyscall_gtod_data *s) +{ + smp_wmb(); + ++s->seq; +} + #endif /* _ASM_X86_VGTOD_H */ diff --git a/arch/x86/include/asm/visws/cobalt.h b/arch/x86/include/asm/visws/cobalt.h deleted file mode 100644 index 2edb37637ea..00000000000 --- a/arch/x86/include/asm/visws/cobalt.h +++ /dev/null @@ -1,127 +0,0 @@ -#ifndef _ASM_X86_VISWS_COBALT_H -#define _ASM_X86_VISWS_COBALT_H - -#include <asm/fixmap.h> - -/* - * Cobalt SGI Visual Workstation system ASIC - */ - -#define CO_CPU_NUM_PHYS 0x1e00 -#define CO_CPU_TAB_PHYS (CO_CPU_NUM_PHYS + 2) - -#define CO_CPU_MAX 4 - -#define CO_CPU_PHYS 0xc2000000 -#define CO_APIC_PHYS 0xc4000000 - -/* see set_fixmap() and asm/fixmap.h */ -#define CO_CPU_VADDR (fix_to_virt(FIX_CO_CPU)) -#define CO_APIC_VADDR (fix_to_virt(FIX_CO_APIC)) - -/* Cobalt CPU registers -- relative to CO_CPU_VADDR, use co_cpu_*() */ -#define CO_CPU_REV 0x08 -#define CO_CPU_CTRL 0x10 -#define CO_CPU_STAT 0x20 -#define CO_CPU_TIMEVAL 0x30 - -/* CO_CPU_CTRL bits */ -#define CO_CTRL_TIMERUN 0x04 /* 0 == disabled */ -#define CO_CTRL_TIMEMASK 0x08 /* 0 == unmasked */ - -/* CO_CPU_STATUS bits */ -#define CO_STAT_TIMEINTR 0x02 /* (r) 1 == int pend, (w) 0 == clear */ - -/* CO_CPU_TIMEVAL value */ -#define CO_TIME_HZ 100000000 /* Cobalt core rate */ - -/* Cobalt APIC registers -- relative to CO_APIC_VADDR, use co_apic_*() */ -#define CO_APIC_HI(n) (((n) * 0x10) + 4) -#define CO_APIC_LO(n) ((n) * 0x10) -#define CO_APIC_ID 0x0ffc - -/* CO_APIC_ID bits */ -#define CO_APIC_ENABLE 0x00000100 - -/* CO_APIC_LO bits */ -#define CO_APIC_MASK 0x00010000 /* 0 = enabled */ -#define CO_APIC_LEVEL 0x00008000 /* 0 = edge */ - -/* - * Where things are physically wired to Cobalt - * #defines with no board _<type>_<rev>_ are common to all (thus far) - */ -#define CO_APIC_IDE0 4 -#define CO_APIC_IDE1 2 /* Only on 320 */ - -#define CO_APIC_8259 12 /* serial, floppy, par-l-l */ - -/* Lithium PCI Bridge A -- "the one with 82557 Ethernet" */ -#define CO_APIC_PCIA_BASE0 0 /* and 1 */ /* slot 0, line 0 */ -#define CO_APIC_PCIA_BASE123 5 /* and 6 */ /* slot 0, line 1 */ - -#define CO_APIC_PIIX4_USB 7 /* this one is weird */ - -/* Lithium PCI Bridge B -- "the one with PIIX4" */ -#define CO_APIC_PCIB_BASE0 8 /* and 9-12 *//* slot 0, line 0 */ -#define CO_APIC_PCIB_BASE123 13 /* 14.15 */ /* slot 0, line 1 */ - -#define CO_APIC_VIDOUT0 16 -#define CO_APIC_VIDOUT1 17 -#define CO_APIC_VIDIN0 18 -#define CO_APIC_VIDIN1 19 - -#define CO_APIC_LI_AUDIO 22 - -#define CO_APIC_AS 24 -#define CO_APIC_RE 25 - -#define CO_APIC_CPU 28 /* Timer and Cache interrupt */ -#define CO_APIC_NMI 29 -#define CO_APIC_LAST CO_APIC_NMI - -/* - * This is how irqs are assigned on the Visual Workstation. - * Legacy devices get irq's 1-15 (system clock is 0 and is CO_APIC_CPU). - * All other devices (including PCI) go to Cobalt and are irq's 16 on up. - */ -#define CO_IRQ_APIC0 16 /* irq of apic entry 0 */ -#define IS_CO_APIC(irq) ((irq) >= CO_IRQ_APIC0) -#define CO_IRQ(apic) (CO_IRQ_APIC0 + (apic)) /* apic ent to irq */ -#define CO_APIC(irq) ((irq) - CO_IRQ_APIC0) /* irq to apic ent */ -#define CO_IRQ_IDE0 14 /* knowledge of... */ -#define CO_IRQ_IDE1 15 /* ... ide driver defaults! */ -#define CO_IRQ_8259 CO_IRQ(CO_APIC_8259) - -#ifdef CONFIG_X86_VISWS_APIC -static inline void co_cpu_write(unsigned long reg, unsigned long v) -{ - *((volatile unsigned long *)(CO_CPU_VADDR+reg))=v; -} - -static inline unsigned long co_cpu_read(unsigned long reg) -{ - return *((volatile unsigned long *)(CO_CPU_VADDR+reg)); -} - -static inline void co_apic_write(unsigned long reg, unsigned long v) -{ - *((volatile unsigned long *)(CO_APIC_VADDR+reg))=v; -} - -static inline unsigned long co_apic_read(unsigned long reg) -{ - return *((volatile unsigned long *)(CO_APIC_VADDR+reg)); -} -#endif - -extern char visws_board_type; - -#define VISWS_320 0 -#define VISWS_540 1 - -extern char visws_board_rev; - -extern int pci_visws_init(void); - -#endif /* _ASM_X86_VISWS_COBALT_H */ diff --git a/arch/x86/include/asm/visws/lithium.h b/arch/x86/include/asm/visws/lithium.h deleted file mode 100644 index a10d89bc127..00000000000 --- a/arch/x86/include/asm/visws/lithium.h +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef _ASM_X86_VISWS_LITHIUM_H -#define _ASM_X86_VISWS_LITHIUM_H - -#include <asm/fixmap.h> - -/* - * Lithium is the SGI Visual Workstation I/O ASIC - */ - -#define LI_PCI_A_PHYS 0xfc000000 /* Enet is dev 3 */ -#define LI_PCI_B_PHYS 0xfd000000 /* PIIX4 is here */ - -/* see set_fixmap() and asm/fixmap.h */ -#define LI_PCIA_VADDR (fix_to_virt(FIX_LI_PCIA)) -#define LI_PCIB_VADDR (fix_to_virt(FIX_LI_PCIB)) - -/* Not a standard PCI? (not in linux/pci.h) */ -#define LI_PCI_BUSNUM 0x44 /* lo8: primary, hi8: sub */ -#define LI_PCI_INTEN 0x46 - -/* LI_PCI_INTENT bits */ -#define LI_INTA_0 0x0001 -#define LI_INTA_1 0x0002 -#define LI_INTA_2 0x0004 -#define LI_INTA_3 0x0008 -#define LI_INTA_4 0x0010 -#define LI_INTB 0x0020 -#define LI_INTC 0x0040 -#define LI_INTD 0x0080 - -/* More special purpose macros... */ -static inline void li_pcia_write16(unsigned long reg, unsigned short v) -{ - *((volatile unsigned short *)(LI_PCIA_VADDR+reg))=v; -} - -static inline unsigned short li_pcia_read16(unsigned long reg) -{ - return *((volatile unsigned short *)(LI_PCIA_VADDR+reg)); -} - -static inline void li_pcib_write16(unsigned long reg, unsigned short v) -{ - *((volatile unsigned short *)(LI_PCIB_VADDR+reg))=v; -} - -static inline unsigned short li_pcib_read16(unsigned long reg) -{ - return *((volatile unsigned short *)(LI_PCIB_VADDR+reg)); -} - -#endif /* _ASM_X86_VISWS_LITHIUM_H */ - diff --git a/arch/x86/include/asm/visws/piix4.h b/arch/x86/include/asm/visws/piix4.h deleted file mode 100644 index d0af4d338e7..00000000000 --- a/arch/x86/include/asm/visws/piix4.h +++ /dev/null @@ -1,107 +0,0 @@ -#ifndef _ASM_X86_VISWS_PIIX4_H -#define _ASM_X86_VISWS_PIIX4_H - -/* - * PIIX4 as used on SGI Visual Workstations - */ - -#define PIIX_PM_START 0x0F80 - -#define SIO_GPIO_START 0x0FC0 - -#define SIO_PM_START 0x0FC8 - -#define PMBASE PIIX_PM_START -#define GPIREG0 (PMBASE+0x30) -#define GPIREG(x) (GPIREG0+((x)/8)) -#define GPIBIT(x) (1 << ((x)%8)) - -#define PIIX_GPI_BD_ID1 18 -#define PIIX_GPI_BD_ID2 19 -#define PIIX_GPI_BD_ID3 20 -#define PIIX_GPI_BD_ID4 21 -#define PIIX_GPI_BD_REG GPIREG(PIIX_GPI_BD_ID1) -#define PIIX_GPI_BD_MASK (GPIBIT(PIIX_GPI_BD_ID1) | \ - GPIBIT(PIIX_GPI_BD_ID2) | \ - GPIBIT(PIIX_GPI_BD_ID3) | \ - GPIBIT(PIIX_GPI_BD_ID4) ) - -#define PIIX_GPI_BD_SHIFT (PIIX_GPI_BD_ID1 % 8) - -#define SIO_INDEX 0x2e -#define SIO_DATA 0x2f - -#define SIO_DEV_SEL 0x7 -#define SIO_DEV_ENB 0x30 -#define SIO_DEV_MSB 0x60 -#define SIO_DEV_LSB 0x61 - -#define SIO_GP_DEV 0x7 - -#define SIO_GP_BASE SIO_GPIO_START -#define SIO_GP_MSB (SIO_GP_BASE>>8) -#define SIO_GP_LSB (SIO_GP_BASE&0xff) - -#define SIO_GP_DATA1 (SIO_GP_BASE+0) - -#define SIO_PM_DEV 0x8 - -#define SIO_PM_BASE SIO_PM_START -#define SIO_PM_MSB (SIO_PM_BASE>>8) -#define SIO_PM_LSB (SIO_PM_BASE&0xff) -#define SIO_PM_INDEX (SIO_PM_BASE+0) -#define SIO_PM_DATA (SIO_PM_BASE+1) - -#define SIO_PM_FER2 0x1 - -#define SIO_PM_GP_EN 0x80 - - - -/* - * This is the dev/reg where generating a config cycle will - * result in a PCI special cycle. - */ -#define SPECIAL_DEV 0xff -#define SPECIAL_REG 0x00 - -/* - * PIIX4 needs to see a special cycle with the following data - * to be convinced the processor has gone into the stop grant - * state. PIIX4 insists on seeing this before it will power - * down a system. - */ -#define PIIX_SPECIAL_STOP 0x00120002 - -#define PIIX4_RESET_PORT 0xcf9 -#define PIIX4_RESET_VAL 0x6 - -#define PMSTS_PORT 0xf80 // 2 bytes PM Status -#define PMEN_PORT 0xf82 // 2 bytes PM Enable -#define PMCNTRL_PORT 0xf84 // 2 bytes PM Control - -#define PM_SUSPEND_ENABLE 0x2000 // start sequence to suspend state - -/* - * PMSTS and PMEN I/O bit definitions. - * (Bits are the same in both registers) - */ -#define PM_STS_RSM (1<<15) // Resume Status -#define PM_STS_PWRBTNOR (1<<11) // Power Button Override -#define PM_STS_RTC (1<<10) // RTC status -#define PM_STS_PWRBTN (1<<8) // Power Button Pressed? -#define PM_STS_GBL (1<<5) // Global Status -#define PM_STS_BM (1<<4) // Bus Master Status -#define PM_STS_TMROF (1<<0) // Timer Overflow Status. - -/* - * Stop clock GPI register - */ -#define PIIX_GPIREG0 (0xf80 + 0x30) - -/* - * Stop clock GPI bit in GPIREG0 - */ -#define PIIX_GPI_STPCLK 0x4 // STPCLK signal routed back in - -#endif /* _ASM_X86_VISWS_PIIX4_H */ diff --git a/arch/x86/include/asm/visws/sgivw.h b/arch/x86/include/asm/visws/sgivw.h deleted file mode 100644 index 5fbf63e1003..00000000000 --- a/arch/x86/include/asm/visws/sgivw.h +++ /dev/null @@ -1,5 +0,0 @@ -/* - * Frame buffer position and size: - */ -extern unsigned long sgivwfb_mem_phys; -extern unsigned long sgivwfb_mem_size; diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 2067264fb7f..7004d21e621 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -85,6 +85,7 @@ #define VM_EXIT_SAVE_IA32_EFER 0x00100000 #define VM_EXIT_LOAD_IA32_EFER 0x00200000 #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000 +#define VM_EXIT_CLEAR_BNDCFGS 0x00800000 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff @@ -95,6 +96,7 @@ #define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000 #define VM_ENTRY_LOAD_IA32_PAT 0x00004000 #define VM_ENTRY_LOAD_IA32_EFER 0x00008000 +#define VM_ENTRY_LOAD_BNDCFGS 0x00010000 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff @@ -174,6 +176,8 @@ enum vmcs_field { GUEST_PDPTR2_HIGH = 0x0000280f, GUEST_PDPTR3 = 0x00002810, GUEST_PDPTR3_HIGH = 0x00002811, + GUEST_BNDCFGS = 0x00002812, + GUEST_BNDCFGS_HIGH = 0x00002813, HOST_IA32_PAT = 0x00002c00, HOST_IA32_PAT_HIGH = 0x00002c01, HOST_IA32_EFER = 0x00002c02, diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index d76ac40da20..081d909bc49 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -16,8 +16,8 @@ * you mess up, the linker will catch it.) */ -/* Base address of vvars. This is not ABI. */ -#define VVAR_ADDRESS (-10*1024*1024 - 4096) +#ifndef _ASM_X86_VVAR_H +#define _ASM_X86_VVAR_H #if defined(__VVAR_KERNEL_LDS) @@ -29,16 +29,35 @@ #else +#ifdef BUILD_VDSO32 + +#define DECLARE_VVAR(offset, type, name) \ + extern type vvar_ ## name __attribute__((visibility("hidden"))); + +#define VVAR(name) (vvar_ ## name) + +#else + +extern char __vvar_page; + +/* Base address of vvars. This is not ABI. */ +#ifdef CONFIG_X86_64 +#define VVAR_ADDRESS (-10*1024*1024 - 4096) +#else +#define VVAR_ADDRESS (&__vvar_page) +#endif + #define DECLARE_VVAR(offset, type, name) \ static type const * const vvaraddr_ ## name = \ (void *)(VVAR_ADDRESS + (offset)); +#define VVAR(name) (*vvaraddr_ ## name) +#endif + #define DEFINE_VVAR(type, name) \ type name \ __attribute__((section(".vvar_" #name), aligned(16))) __visible -#define VVAR(name) (*vvaraddr_ ## name) - #endif /* DECLARE_VVAR(offset, type, name) */ @@ -48,3 +67,5 @@ DECLARE_VVAR(16, int, vgetcpu_mode) DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data) #undef DECLARE_VVAR + +#endif diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index e709884d0ef..ca08a27b90b 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -343,7 +343,7 @@ HYPERVISOR_memory_op(unsigned int cmd, void *arg) } static inline int -HYPERVISOR_multicall(void *call_list, int nr_calls) +HYPERVISOR_multicall(void *call_list, uint32_t nr_calls) { return _hypercall2(int, multicall, call_list, nr_calls); } diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index fd9cb7695b5..3400dbaec3c 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -54,6 +54,9 @@ typedef unsigned long xen_pfn_t; #define PRI_xen_pfn "lx" typedef unsigned long xen_ulong_t; #define PRI_xen_ulong "lx" +typedef long xen_long_t; +#define PRI_xen_long "lx" + /* Guest handles for primitive C types. */ __DEFINE_GUEST_HANDLE(uchar, unsigned char); __DEFINE_GUEST_HANDLE(uint, unsigned int); diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 3e276eb23d1..c949923a566 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -49,10 +49,17 @@ extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern unsigned long set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e); +extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, + struct gnttab_map_grant_ref *kmap_ops, + struct page **pages, unsigned int count); extern int m2p_add_override(unsigned long mfn, struct page *page, struct gnttab_map_grant_ref *kmap_op); +extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, + struct gnttab_map_grant_ref *kmap_ops, + struct page **pages, unsigned int count); extern int m2p_remove_override(struct page *page, - struct gnttab_map_grant_ref *kmap_op); + struct gnttab_map_grant_ref *kmap_op, + unsigned long mfn); extern struct page *m2p_find_override(unsigned long mfn); extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); @@ -121,7 +128,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) pfn = m2p_find_override_pfn(mfn, ~0); } - /* + /* * pfn is ~0 if there are no entries in the m2p for mfn or if the * entry doesn't map back to the mfn and m2p_override doesn't have a * valid entry for it. diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 6c1d7411eb0..d949ef28c48 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -16,6 +16,8 @@ #define XSTATE_Hi16_ZMM 0x80 #define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) +/* Bit 63 of XCR0 is reserved for future expansion */ +#define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63))) #define FXSAVE_SIZE 512 diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index 4924f4be2b9..fcf2b3ae1bf 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -295,6 +295,7 @@ #define MSR_SMI_COUNT 0x00000034 #define MSR_IA32_FEATURE_CONTROL 0x0000003a #define MSR_IA32_TSC_ADJUST 0x0000003b +#define MSR_IA32_BNDCFGS 0x00000d90 #define FEATURE_CONTROL_LOCKED (1<<0) #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) @@ -383,7 +384,7 @@ #define MSR_IA32_MISC_ENABLE_MWAIT_BIT 18 #define MSR_IA32_MISC_ENABLE_MWAIT (1ULL << MSR_IA32_MISC_ENABLE_MWAIT_BIT) #define MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT 22 -#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1ULL << MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT); +#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1ULL << MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) #define MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT 23 #define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT) #define MSR_IA32_MISC_ENABLE_XD_DISABLE_BIT 34 diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index cb648c84b32..f4d96000d33 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -26,7 +26,7 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o obj-$(CONFIG_X86_32) += i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o -obj-y += syscall_$(BITS).o +obj-y += syscall_$(BITS).o vsyscall_gtod.o obj-$(CONFIG_X86_64) += vsyscall_64.o obj-$(CONFIG_X86_64) += vsyscall_emu_64.o obj-$(CONFIG_SYSFS) += ksysfs.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 9f46f2b1cfc..86281ffb96d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -53,10 +53,6 @@ EXPORT_SYMBOL(acpi_disabled); # include <asm/proto.h> #endif /* X86 */ -#define BAD_MADT_ENTRY(entry, end) ( \ - (!entry) || (unsigned long)entry + sizeof(*entry) > end || \ - ((struct acpi_subtable_header *)entry)->length < sizeof(*entry)) - #define PREFIX "ACPI: " int acpi_noirq; /* skip ACPI IRQ initialization */ @@ -907,10 +903,6 @@ static int __init acpi_parse_madt_lapic_entries(void) #ifdef CONFIG_X86_IO_APIC #define MP_ISA_BUS 0 -#ifdef CONFIG_X86_ES7000 -extern int es7000_plat; -#endif - void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) { int ioapic; @@ -960,14 +952,6 @@ void __init mp_config_acpi_legacy_irqs(void) set_bit(MP_ISA_BUS, mp_bus_not_pci); pr_debug("Bus #%d is ISA\n", MP_ISA_BUS); -#ifdef CONFIG_X86_ES7000 - /* - * Older generations of ES7000 have no legacy identity mappings - */ - if (es7000_plat == 1) - return; -#endif - /* * Use the default configuration for the IRQs 0-15. Unless * overridden by (MADT) interrupt source override entries. diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index e69182fd01c..4b28159e042 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -87,7 +87,9 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK; retval = 0; - if (num_cstate_subtype < (cx->address & MWAIT_SUBSTATE_MASK)) { + /* If the HW does not support any sub-states in this C-state */ + if (num_cstate_subtype == 0) { + pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n", cx->address, edx_part); retval = -1; goto out; } diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 3a2ae4c8894..31368207837 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -31,7 +31,7 @@ static char temp_stack[4096]; * * Wrapper around acpi_enter_sleep_state() to be called by assmebly. */ -acpi_status asmlinkage x86_acpi_enter_sleep_state(u8 state) +acpi_status asmlinkage __visible x86_acpi_enter_sleep_state(u8 state) { return acpi_enter_sleep_state(state); } diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index dec8de4e166..f04dbb3069b 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -22,6 +22,7 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, {} }; EXPORT_SYMBOL(amd_nb_misc_ids); @@ -30,6 +31,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, {} }; diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 9fa8aa051f5..76164e173a2 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -10,6 +10,8 @@ * * Copyright 2002 Andi Kleen, SuSE Labs. */ +#define pr_fmt(fmt) "AGP: " fmt + #include <linux/kernel.h> #include <linux/types.h> #include <linux/init.h> @@ -75,14 +77,13 @@ static u32 __init allocate_aperture(void) addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, aper_size, aper_size); if (!addr) { - printk(KERN_ERR - "Cannot allocate aperture memory hole (%lx,%uK)\n", - addr, aper_size>>10); + pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n", + addr, addr + aper_size - 1, aper_size >> 10); return 0; } memblock_reserve(addr, aper_size); - printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", - aper_size >> 10, addr); + pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n", + addr, addr + aper_size - 1, aper_size >> 10); register_nosave_region(addr >> PAGE_SHIFT, (addr+aper_size) >> PAGE_SHIFT); @@ -126,10 +127,11 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) u64 aper; u32 old_order; - printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", bus, slot, func); + pr_info("pci 0000:%02x:%02x:%02x: AGP bridge\n", bus, slot, func); apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14); if (apsizereg == 0xffffffff) { - printk(KERN_ERR "APSIZE in AGP bridge unreadable\n"); + pr_err("pci 0000:%02x:%02x.%d: APSIZE unreadable\n", + bus, slot, func); return 0; } @@ -153,16 +155,18 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) * On some sick chips, APSIZE is 0. It means it wants 4G * so let double check that order, and lets trust AMD NB settings: */ - printk(KERN_INFO "Aperture from AGP @ %Lx old size %u MB\n", - aper, 32 << old_order); + pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (old size %uMB)\n", + bus, slot, func, aper, aper + (32ULL << (old_order + 20)) - 1, + 32 << old_order); if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) { - printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n", - 32 << *order, apsizereg); + pr_info("pci 0000:%02x:%02x.%d: AGP aperture size %uMB (APSIZE %#x) is not right, using settings from NB\n", + bus, slot, func, 32 << *order, apsizereg); *order = old_order; } - printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", - aper, 32 << *order, apsizereg); + pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (%uMB, APSIZE %#x)\n", + bus, slot, func, aper, aper + (32ULL << (*order + 20)) - 1, + 32 << *order, apsizereg); if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20)) return 0; @@ -218,7 +222,7 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp) } } } - printk(KERN_INFO "No AGP bridge found\n"); + pr_info("No AGP bridge found\n"); return 0; } @@ -310,7 +314,8 @@ void __init early_gart_iommu_check(void) if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { /* reserve it, so we can reuse it in second kernel */ - printk(KERN_INFO "update e820 for GART\n"); + pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n", + aper_base, aper_base + aper_size - 1); e820_add_region(aper_base, aper_size, E820_RESERVED); update_e820(); } @@ -354,7 +359,7 @@ int __init gart_iommu_hole_init(void) !early_pci_allowed()) return -ENODEV; - printk(KERN_INFO "Checking aperture...\n"); + pr_info("Checking aperture...\n"); if (!fallback_aper_force) agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); @@ -395,8 +400,9 @@ int __init gart_iommu_hole_init(void) aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; aper_base <<= 25; - printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n", - node, aper_base, aper_size >> 20); + pr_info("Node %d: aperture [bus addr %#010Lx-%#010Lx] (%uMB)\n", + node, aper_base, aper_base + aper_size - 1, + aper_size >> 20); node++; if (!aperture_valid(aper_base, aper_size, 64<<20)) { @@ -407,9 +413,9 @@ int __init gart_iommu_hole_init(void) if (!no_iommu && max_pfn > MAX_DMA32_PFN && !printed_gart_size_msg) { - printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n"); - printk(KERN_ERR "please increase GART size in your BIOS setup\n"); - printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n"); + pr_err("you are using iommu with agp, but GART size is less than 64MB\n"); + pr_err("please increase GART size in your BIOS setup\n"); + pr_err("if BIOS doesn't have that option, contact your HW vendor!\n"); printed_gart_size_msg = 1; } } else { @@ -446,13 +452,10 @@ out: force_iommu || valid_agp || fallback_aper_force) { - printk(KERN_INFO - "Your BIOS doesn't leave a aperture memory hole\n"); - printk(KERN_INFO - "Please enable the IOMMU option in the BIOS setup\n"); - printk(KERN_INFO - "This costs you %d MB of RAM\n", - 32 << fallback_aper_order); + pr_info("Your BIOS doesn't leave a aperture memory hole\n"); + pr_info("Please enable the IOMMU option in the BIOS setup\n"); + pr_info("This costs you %dMB of RAM\n", + 32 << fallback_aper_order); aper_order = fallback_aper_order; aper_alloc = allocate_aperture(); diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 0ae0323b1f9..dcb5b15401c 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -18,10 +18,7 @@ obj-y += apic_flat_64.o endif # APIC probe will depend on the listing order here -obj-$(CONFIG_X86_NUMAQ) += numaq_32.o -obj-$(CONFIG_X86_SUMMIT) += summit_32.o obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o -obj-$(CONFIG_X86_ES7000) += es7000_32.o # For 32bit, probe_32 need to be listed last obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 53e20531470..ad28db7e6bd 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1996,7 +1996,8 @@ static inline void __smp_error_interrupt(struct pt_regs *regs) }; /* First tickle the hardware, only then report what went on. -- REW */ - apic_write(APIC_ESR, 0); + if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); v = apic_read(APIC_ESR); ack_APIC_irq(); atomic_inc(&irq_err_count); @@ -2136,7 +2137,6 @@ int generic_processor_info(int apicid, int version) * * - arch/x86/kernel/mpparse.c: MP_processor_info() * - arch/x86/mm/amdtopology.c: amd_numa_init() - * - arch/x86/platform/visws/visws_quirks.c: MP_processor_info() * * This function is executed with the modified * boot_cpu_physical_apicid. So, disabled_cpu_apicid kernel diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c deleted file mode 100644 index 6f8f8b348a3..00000000000 --- a/arch/x86/kernel/apic/es7000_32.c +++ /dev/null @@ -1,738 +0,0 @@ -/* - * Written by: Garry Forsgren, Unisys Corporation - * Natalie Protasevich, Unisys Corporation - * - * This file contains the code to configure and interface - * with Unisys ES7000 series hardware system manager. - * - * Copyright (c) 2003 Unisys Corporation. - * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar - * - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston MA 02111-1307, USA. - * - * Contact information: Unisys Corporation, Township Line & Union Meeting - * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or: - * - * http://www.unisys.com - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/notifier.h> -#include <linux/spinlock.h> -#include <linux/cpumask.h> -#include <linux/threads.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/reboot.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/errno.h> -#include <linux/acpi.h> -#include <linux/init.h> -#include <linux/gfp.h> -#include <linux/nmi.h> -#include <linux/smp.h> -#include <linux/io.h> - -#include <asm/apicdef.h> -#include <linux/atomic.h> -#include <asm/fixmap.h> -#include <asm/mpspec.h> -#include <asm/setup.h> -#include <asm/apic.h> -#include <asm/ipi.h> - -/* - * ES7000 chipsets - */ - -#define NON_UNISYS 0 -#define ES7000_CLASSIC 1 -#define ES7000_ZORRO 2 - -#define MIP_REG 1 -#define MIP_PSAI_REG 4 - -#define MIP_BUSY 1 -#define MIP_SPIN 0xf0000 -#define MIP_VALID 0x0100000000000000ULL -#define MIP_SW_APIC 0x1020b - -#define MIP_PORT(val) ((val >> 32) & 0xffff) - -#define MIP_RD_LO(val) (val & 0xffffffff) - -struct mip_reg { - unsigned long long off_0x00; - unsigned long long off_0x08; - unsigned long long off_0x10; - unsigned long long off_0x18; - unsigned long long off_0x20; - unsigned long long off_0x28; - unsigned long long off_0x30; - unsigned long long off_0x38; -}; - -struct mip_reg_info { - unsigned long long mip_info; - unsigned long long delivery_info; - unsigned long long host_reg; - unsigned long long mip_reg; -}; - -struct psai { - unsigned long long entry_type; - unsigned long long addr; - unsigned long long bep_addr; -}; - -#ifdef CONFIG_ACPI - -struct es7000_oem_table { - struct acpi_table_header Header; - u32 OEMTableAddr; - u32 OEMTableSize; -}; - -static unsigned long oem_addrX; -static unsigned long oem_size; - -#endif - -/* - * ES7000 Globals - */ - -static volatile unsigned long *psai; -static struct mip_reg *mip_reg; -static struct mip_reg *host_reg; -static int mip_port; -static unsigned long mip_addr; -static unsigned long host_addr; - -int es7000_plat; - -/* - * GSI override for ES7000 platforms. - */ - - -static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) -{ - unsigned long vect = 0, psaival = 0; - - if (psai == NULL) - return -1; - - vect = ((unsigned long)__pa(eip)/0x1000) << 16; - psaival = (0x1000000 | vect | cpu); - - while (*psai & 0x1000000) - ; - - *psai = psaival; - - return 0; -} - -static int es7000_apic_is_cluster(void) -{ - /* MPENTIUMIII */ - if (boot_cpu_data.x86 == 6 && - (boot_cpu_data.x86_model >= 7 && boot_cpu_data.x86_model <= 11)) - return 1; - - return 0; -} - -static void setup_unisys(void) -{ - /* - * Determine the generation of the ES7000 currently running. - * - * es7000_plat = 1 if the machine is a 5xx ES7000 box - * es7000_plat = 2 if the machine is a x86_64 ES7000 box - * - */ - if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2)) - es7000_plat = ES7000_ZORRO; - else - es7000_plat = ES7000_CLASSIC; -} - -/* - * Parse the OEM Table: - */ -static int parse_unisys_oem(char *oemptr) -{ - int i; - int success = 0; - unsigned char type, size; - unsigned long val; - char *tp = NULL; - struct psai *psaip = NULL; - struct mip_reg_info *mi; - struct mip_reg *host, *mip; - - tp = oemptr; - - tp += 8; - - for (i = 0; i <= 6; i++) { - type = *tp++; - size = *tp++; - tp -= 2; - switch (type) { - case MIP_REG: - mi = (struct mip_reg_info *)tp; - val = MIP_RD_LO(mi->host_reg); - host_addr = val; - host = (struct mip_reg *)val; - host_reg = __va(host); - val = MIP_RD_LO(mi->mip_reg); - mip_port = MIP_PORT(mi->mip_info); - mip_addr = val; - mip = (struct mip_reg *)val; - mip_reg = __va(mip); - pr_debug("host_reg = 0x%lx\n", - (unsigned long)host_reg); - pr_debug("mip_reg = 0x%lx\n", - (unsigned long)mip_reg); - success++; - break; - case MIP_PSAI_REG: - psaip = (struct psai *)tp; - if (tp != NULL) { - if (psaip->addr) - psai = __va(psaip->addr); - else - psai = NULL; - success++; - } - break; - default: - break; - } - tp += size; - } - - if (success < 2) - es7000_plat = NON_UNISYS; - else - setup_unisys(); - - return es7000_plat; -} - -#ifdef CONFIG_ACPI -static int __init find_unisys_acpi_oem_table(unsigned long *oem_addr) -{ - struct acpi_table_header *header = NULL; - struct es7000_oem_table *table; - acpi_size tbl_size; - acpi_status ret; - int i = 0; - - for (;;) { - ret = acpi_get_table_with_size("OEM1", i++, &header, &tbl_size); - if (!ACPI_SUCCESS(ret)) - return -1; - - if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) - break; - - early_acpi_os_unmap_memory(header, tbl_size); - } - - table = (void *)header; - - oem_addrX = table->OEMTableAddr; - oem_size = table->OEMTableSize; - - early_acpi_os_unmap_memory(header, tbl_size); - - *oem_addr = (unsigned long)__acpi_map_table(oem_addrX, oem_size); - - return 0; -} - -static void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr) -{ - if (!oem_addr) - return; - - __acpi_unmap_table((char *)oem_addr, oem_size); -} - -static int es7000_check_dsdt(void) -{ - struct acpi_table_header header; - - if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, &header)) && - !strncmp(header.oem_id, "UNISYS", 6)) - return 1; - return 0; -} - -static int es7000_acpi_ret; - -/* Hook from generic ACPI tables.c */ -static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id) -{ - unsigned long oem_addr = 0; - int check_dsdt; - int ret = 0; - - /* check dsdt at first to avoid clear fix_map for oem_addr */ - check_dsdt = es7000_check_dsdt(); - - if (!find_unisys_acpi_oem_table(&oem_addr)) { - if (check_dsdt) { - ret = parse_unisys_oem((char *)oem_addr); - } else { - setup_unisys(); - ret = 1; - } - /* - * we need to unmap it - */ - unmap_unisys_acpi_oem_table(oem_addr); - } - - es7000_acpi_ret = ret; - - return ret && !es7000_apic_is_cluster(); -} - -static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id) -{ - int ret = es7000_acpi_ret; - - return ret && es7000_apic_is_cluster(); -} - -#else /* !CONFIG_ACPI: */ -static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id) -{ - return 0; -} - -static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id) -{ - return 0; -} -#endif /* !CONFIG_ACPI */ - -static void es7000_spin(int n) -{ - int i = 0; - - while (i++ < n) - rep_nop(); -} - -static int es7000_mip_write(struct mip_reg *mip_reg) -{ - int status = 0; - int spin; - - spin = MIP_SPIN; - while ((host_reg->off_0x38 & MIP_VALID) != 0) { - if (--spin <= 0) { - WARN(1, "Timeout waiting for Host Valid Flag\n"); - return -1; - } - es7000_spin(MIP_SPIN); - } - - memcpy(host_reg, mip_reg, sizeof(struct mip_reg)); - outb(1, mip_port); - - spin = MIP_SPIN; - - while ((mip_reg->off_0x38 & MIP_VALID) == 0) { - if (--spin <= 0) { - WARN(1, "Timeout waiting for MIP Valid Flag\n"); - return -1; - } - es7000_spin(MIP_SPIN); - } - - status = (mip_reg->off_0x00 & 0xffff0000000000ULL) >> 48; - mip_reg->off_0x38 &= ~MIP_VALID; - - return status; -} - -static void es7000_enable_apic_mode(void) -{ - struct mip_reg es7000_mip_reg; - int mip_status; - - if (!es7000_plat) - return; - - pr_info("Enabling APIC mode.\n"); - memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); - es7000_mip_reg.off_0x00 = MIP_SW_APIC; - es7000_mip_reg.off_0x38 = MIP_VALID; - - while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0) - WARN(1, "Command failed, status = %x\n", mip_status); -} - -static unsigned int es7000_get_apic_id(unsigned long x) -{ - return (x >> 24) & 0xFF; -} - -static void es7000_send_IPI_mask(const struct cpumask *mask, int vector) -{ - default_send_IPI_mask_sequence_phys(mask, vector); -} - -static void es7000_send_IPI_allbutself(int vector) -{ - default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); -} - -static void es7000_send_IPI_all(int vector) -{ - es7000_send_IPI_mask(cpu_online_mask, vector); -} - -static int es7000_apic_id_registered(void) -{ - return 1; -} - -static const struct cpumask *target_cpus_cluster(void) -{ - return cpu_all_mask; -} - -static const struct cpumask *es7000_target_cpus(void) -{ - return cpumask_of(smp_processor_id()); -} - -static unsigned long es7000_check_apicid_used(physid_mask_t *map, int apicid) -{ - return 0; -} - -static unsigned long es7000_check_apicid_present(int bit) -{ - return physid_isset(bit, phys_cpu_present_map); -} - -static int es7000_early_logical_apicid(int cpu) -{ - /* on es7000, logical apicid is the same as physical */ - return early_per_cpu(x86_bios_cpu_apicid, cpu); -} - -static unsigned long calculate_ldr(int cpu) -{ - unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu); - - return SET_APIC_LOGICAL_ID(id); -} - -/* - * Set up the logical destination ID. - * - * Intel recommends to set DFR, LdR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... - */ -static void es7000_init_apic_ldr_cluster(void) -{ - unsigned long val; - int cpu = smp_processor_id(); - - apic_write(APIC_DFR, APIC_DFR_CLUSTER); - val = calculate_ldr(cpu); - apic_write(APIC_LDR, val); -} - -static void es7000_init_apic_ldr(void) -{ - unsigned long val; - int cpu = smp_processor_id(); - - apic_write(APIC_DFR, APIC_DFR_FLAT); - val = calculate_ldr(cpu); - apic_write(APIC_LDR, val); -} - -static void es7000_setup_apic_routing(void) -{ - int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); - - pr_info("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", - (apic_version[apic] == 0x14) ? - "Physical Cluster" : "Logical Cluster", - nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); -} - -static int es7000_cpu_present_to_apicid(int mps_cpu) -{ - if (!mps_cpu) - return boot_cpu_physical_apicid; - else if (mps_cpu < nr_cpu_ids) - return per_cpu(x86_bios_cpu_apicid, mps_cpu); - else - return BAD_APICID; -} - -static int cpu_id; - -static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap) -{ - physid_set_mask_of_physid(cpu_id, retmap); - ++cpu_id; -} - -static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) -{ - /* For clustered we don't have a good way to do this yet - hack */ - physids_promote(0xFFL, retmap); -} - -static int es7000_check_phys_apicid_present(int cpu_physical_apicid) -{ - boot_cpu_physical_apicid = read_apic_id(); - return 1; -} - -static inline int -es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) -{ - unsigned int round = 0; - unsigned int cpu, uninitialized_var(apicid); - - /* - * The cpus in the mask must all be on the apic cluster. - */ - for_each_cpu_and(cpu, cpumask, cpu_online_mask) { - int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); - - if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { - WARN(1, "Not a valid mask!"); - - return -EINVAL; - } - apicid |= new_apicid; - round++; - } - if (!round) - return -EINVAL; - *dest_id = apicid; - return 0; -} - -static int -es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, - const struct cpumask *andmask, - unsigned int *apicid) -{ - cpumask_var_t cpumask; - *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); - - if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) - return 0; - - cpumask_and(cpumask, inmask, andmask); - es7000_cpu_mask_to_apicid(cpumask, apicid); - - free_cpumask_var(cpumask); - - return 0; -} - -static int es7000_phys_pkg_id(int cpuid_apic, int index_msb) -{ - return cpuid_apic >> index_msb; -} - -static int probe_es7000(void) -{ - /* probed later in mptable/ACPI hooks */ - return 0; -} - -static int es7000_mps_ret; -static int es7000_mps_oem_check(struct mpc_table *mpc, char *oem, - char *productid) -{ - int ret = 0; - - if (mpc->oemptr) { - struct mpc_oemtable *oem_table = - (struct mpc_oemtable *)mpc->oemptr; - - if (!strncmp(oem, "UNISYS", 6)) - ret = parse_unisys_oem((char *)oem_table); - } - - es7000_mps_ret = ret; - - return ret && !es7000_apic_is_cluster(); -} - -static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem, - char *productid) -{ - int ret = es7000_mps_ret; - - return ret && es7000_apic_is_cluster(); -} - -/* We've been warned by a false positive warning.Use __refdata to keep calm. */ -static struct apic __refdata apic_es7000_cluster = { - - .name = "es7000", - .probe = probe_es7000, - .acpi_madt_oem_check = es7000_acpi_madt_oem_check_cluster, - .apic_id_valid = default_apic_id_valid, - .apic_id_registered = es7000_apic_id_registered, - - .irq_delivery_mode = dest_LowestPrio, - /* logical delivery broadcast to all procs: */ - .irq_dest_mode = 1, - - .target_cpus = target_cpus_cluster, - .disable_esr = 1, - .dest_logical = 0, - .check_apicid_used = es7000_check_apicid_used, - .check_apicid_present = es7000_check_apicid_present, - - .vector_allocation_domain = flat_vector_allocation_domain, - .init_apic_ldr = es7000_init_apic_ldr_cluster, - - .ioapic_phys_id_map = es7000_ioapic_phys_id_map, - .setup_apic_routing = es7000_setup_apic_routing, - .multi_timer_check = NULL, - .cpu_present_to_apicid = es7000_cpu_present_to_apicid, - .apicid_to_cpu_present = es7000_apicid_to_cpu_present, - .setup_portio_remap = NULL, - .check_phys_apicid_present = es7000_check_phys_apicid_present, - .enable_apic_mode = es7000_enable_apic_mode, - .phys_pkg_id = es7000_phys_pkg_id, - .mps_oem_check = es7000_mps_oem_check_cluster, - - .get_apic_id = es7000_get_apic_id, - .set_apic_id = NULL, - .apic_id_mask = 0xFF << 24, - - .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and, - - .send_IPI_mask = es7000_send_IPI_mask, - .send_IPI_mask_allbutself = NULL, - .send_IPI_allbutself = es7000_send_IPI_allbutself, - .send_IPI_all = es7000_send_IPI_all, - .send_IPI_self = default_send_IPI_self, - - .wakeup_secondary_cpu = wakeup_secondary_cpu_via_mip, - - .trampoline_phys_low = 0x467, - .trampoline_phys_high = 0x469, - - .wait_for_init_deassert = false, - /* Nothing to do for most platforms, since cleared by the INIT cycle: */ - .smp_callin_clear_local_apic = NULL, - .inquire_remote_apic = default_inquire_remote_apic, - - .read = native_apic_mem_read, - .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, - .icr_read = native_apic_icr_read, - .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, - - .x86_32_early_logical_apicid = es7000_early_logical_apicid, -}; - -static struct apic __refdata apic_es7000 = { - - .name = "es7000", - .probe = probe_es7000, - .acpi_madt_oem_check = es7000_acpi_madt_oem_check, - .apic_id_valid = default_apic_id_valid, - .apic_id_registered = es7000_apic_id_registered, - - .irq_delivery_mode = dest_Fixed, - /* phys delivery to target CPUs: */ - .irq_dest_mode = 0, - - .target_cpus = es7000_target_cpus, - .disable_esr = 1, - .dest_logical = 0, - .check_apicid_used = es7000_check_apicid_used, - .check_apicid_present = es7000_check_apicid_present, - - .vector_allocation_domain = flat_vector_allocation_domain, - .init_apic_ldr = es7000_init_apic_ldr, - - .ioapic_phys_id_map = es7000_ioapic_phys_id_map, - .setup_apic_routing = es7000_setup_apic_routing, - .multi_timer_check = NULL, - .cpu_present_to_apicid = es7000_cpu_present_to_apicid, - .apicid_to_cpu_present = es7000_apicid_to_cpu_present, - .setup_portio_remap = NULL, - .check_phys_apicid_present = es7000_check_phys_apicid_present, - .enable_apic_mode = es7000_enable_apic_mode, - .phys_pkg_id = es7000_phys_pkg_id, - .mps_oem_check = es7000_mps_oem_check, - - .get_apic_id = es7000_get_apic_id, - .set_apic_id = NULL, - .apic_id_mask = 0xFF << 24, - - .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and, - - .send_IPI_mask = es7000_send_IPI_mask, - .send_IPI_mask_allbutself = NULL, - .send_IPI_allbutself = es7000_send_IPI_allbutself, - .send_IPI_all = es7000_send_IPI_all, - .send_IPI_self = default_send_IPI_self, - - .trampoline_phys_low = 0x467, - .trampoline_phys_high = 0x469, - - .wait_for_init_deassert = true, - /* Nothing to do for most platforms, since cleared by the INIT cycle: */ - .smp_callin_clear_local_apic = NULL, - .inquire_remote_apic = default_inquire_remote_apic, - - .read = native_apic_mem_read, - .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, - .icr_read = native_apic_icr_read, - .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, - - .x86_32_early_logical_apicid = es7000_early_logical_apicid, -}; - -/* - * Need to check for es7000 followed by es7000_cluster, so this order - * in apic_drivers is important. - */ -apic_drivers(apic_es7000, apic_es7000_cluster); diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index a698d7165c9..eab67047dec 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -57,7 +57,7 @@ void arch_trigger_all_cpu_backtrace(void) } clear_bit(0, &backtrace_flag); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static int __kprobes diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 6ad4658de70..992060e0989 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2189,7 +2189,7 @@ void send_cleanup_vector(struct irq_cfg *cfg) cfg->move_in_progress = 0; } -asmlinkage void smp_irq_move_cleanup_interrupt(void) +asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; @@ -3425,6 +3425,11 @@ int get_nr_irqs_gsi(void) return nr_irqs_gsi; } +unsigned int arch_dynirq_lower_bound(unsigned int from) +{ + return from < nr_irqs_gsi ? nr_irqs_gsi : from; +} + int __init arch_probe_nr_irqs(void) { int nr; diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c deleted file mode 100644 index 030ea1c04f7..00000000000 --- a/arch/x86/kernel/apic/numaq_32.c +++ /dev/null @@ -1,524 +0,0 @@ -/* - * Written by: Patricia Gaughen, IBM Corporation - * - * Copyright (C) 2002, IBM Corp. - * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send feedback to <gone@us.ibm.com> - */ -#include <linux/nodemask.h> -#include <linux/topology.h> -#include <linux/bootmem.h> -#include <linux/memblock.h> -#include <linux/threads.h> -#include <linux/cpumask.h> -#include <linux/kernel.h> -#include <linux/mmzone.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/init.h> -#include <linux/numa.h> -#include <linux/smp.h> -#include <linux/io.h> -#include <linux/mm.h> - -#include <asm/processor.h> -#include <asm/fixmap.h> -#include <asm/mpspec.h> -#include <asm/numaq.h> -#include <asm/setup.h> -#include <asm/apic.h> -#include <asm/e820.h> -#include <asm/ipi.h> - -int found_numaq; - -/* - * Have to match translation table entries to main table entries by counter - * hence the mpc_record variable .... can't see a less disgusting way of - * doing this .... - */ -struct mpc_trans { - unsigned char mpc_type; - unsigned char trans_len; - unsigned char trans_type; - unsigned char trans_quad; - unsigned char trans_global; - unsigned char trans_local; - unsigned short trans_reserved; -}; - -static int mpc_record; - -static struct mpc_trans *translation_table[MAX_MPC_ENTRY]; - -int mp_bus_id_to_node[MAX_MP_BUSSES]; -int mp_bus_id_to_local[MAX_MP_BUSSES]; -int quad_local_to_mp_bus_id[NR_CPUS/4][4]; - - -static inline void numaq_register_node(int node, struct sys_cfg_data *scd) -{ - struct eachquadmem *eq = scd->eq + node; - u64 start = (u64)(eq->hi_shrd_mem_start - eq->priv_mem_size) << 20; - u64 end = (u64)(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size) << 20; - int ret; - - node_set(node, numa_nodes_parsed); - ret = numa_add_memblk(node, start, end); - BUG_ON(ret < 0); -} - -/* - * Function: smp_dump_qct() - * - * Description: gets memory layout from the quad config table. This - * function also updates numa_nodes_parsed with the nodes (quads) present. - */ -static void __init smp_dump_qct(void) -{ - struct sys_cfg_data *scd; - int node; - - scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR); - - for_each_node(node) { - if (scd->quads_present31_0 & (1 << node)) - numaq_register_node(node, scd); - } -} - -void numaq_tsc_disable(void) -{ - if (!found_numaq) - return; - - if (num_online_nodes() > 1) { - printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); - setup_clear_cpu_cap(X86_FEATURE_TSC); - } -} - -static void __init numaq_tsc_init(void) -{ - numaq_tsc_disable(); -} - -static inline int generate_logical_apicid(int quad, int phys_apicid) -{ - return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1); -} - -/* x86_quirks member */ -static int mpc_apic_id(struct mpc_cpu *m) -{ - int quad = translation_table[mpc_record]->trans_quad; - int logical_apicid = generate_logical_apicid(quad, m->apicid); - - printk(KERN_DEBUG - "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n", - m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8, - (m->cpufeature & CPU_MODEL_MASK) >> 4, - m->apicver, quad, logical_apicid); - - return logical_apicid; -} - -/* x86_quirks member */ -static void mpc_oem_bus_info(struct mpc_bus *m, char *name) -{ - int quad = translation_table[mpc_record]->trans_quad; - int local = translation_table[mpc_record]->trans_local; - - mp_bus_id_to_node[m->busid] = quad; - mp_bus_id_to_local[m->busid] = local; - - printk(KERN_INFO "Bus #%d is %s (node %d)\n", m->busid, name, quad); -} - -/* x86_quirks member */ -static void mpc_oem_pci_bus(struct mpc_bus *m) -{ - int quad = translation_table[mpc_record]->trans_quad; - int local = translation_table[mpc_record]->trans_local; - - quad_local_to_mp_bus_id[quad][local] = m->busid; -} - -/* - * Called from mpparse code. - * mode = 0: prescan - * mode = 1: one mpc entry scanned - */ -static void numaq_mpc_record(unsigned int mode) -{ - if (!mode) - mpc_record = 0; - else - mpc_record++; -} - -static void __init MP_translation_info(struct mpc_trans *m) -{ - printk(KERN_INFO - "Translation: record %d, type %d, quad %d, global %d, local %d\n", - mpc_record, m->trans_type, m->trans_quad, m->trans_global, - m->trans_local); - - if (mpc_record >= MAX_MPC_ENTRY) - printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); - else - translation_table[mpc_record] = m; /* stash this for later */ - - if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) - node_set_online(m->trans_quad); -} - -static int __init mpf_checksum(unsigned char *mp, int len) -{ - int sum = 0; - - while (len--) - sum += *mp++; - - return sum & 0xFF; -} - -/* - * Read/parse the MPC oem tables - */ -static void __init smp_read_mpc_oem(struct mpc_table *mpc) -{ - struct mpc_oemtable *oemtable = (void *)(long)mpc->oemptr; - int count = sizeof(*oemtable); /* the header size */ - unsigned char *oemptr = ((unsigned char *)oemtable) + count; - - mpc_record = 0; - printk(KERN_INFO - "Found an OEM MPC table at %8p - parsing it...\n", oemtable); - - if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) { - printk(KERN_WARNING - "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", - oemtable->signature[0], oemtable->signature[1], - oemtable->signature[2], oemtable->signature[3]); - return; - } - - if (mpf_checksum((unsigned char *)oemtable, oemtable->length)) { - printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); - return; - } - - while (count < oemtable->length) { - switch (*oemptr) { - case MP_TRANSLATION: - { - struct mpc_trans *m = (void *)oemptr; - - MP_translation_info(m); - oemptr += sizeof(*m); - count += sizeof(*m); - ++mpc_record; - break; - } - default: - printk(KERN_WARNING - "Unrecognised OEM table entry type! - %d\n", - (int)*oemptr); - return; - } - } -} - -static __init void early_check_numaq(void) -{ - /* - * get boot-time SMP configuration: - */ - if (smp_found_config) - early_get_smp_config(); - - if (found_numaq) { - x86_init.mpparse.mpc_record = numaq_mpc_record; - x86_init.mpparse.setup_ioapic_ids = x86_init_noop; - x86_init.mpparse.mpc_apic_id = mpc_apic_id; - x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem; - x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; - x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; - x86_init.timers.tsc_pre_init = numaq_tsc_init; - x86_init.pci.init = pci_numaq_init; - } -} - -int __init numaq_numa_init(void) -{ - early_check_numaq(); - if (!found_numaq) - return -ENOENT; - smp_dump_qct(); - - return 0; -} - -#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER) - -static inline unsigned int numaq_get_apic_id(unsigned long x) -{ - return (x >> 24) & 0x0F; -} - -static inline void numaq_send_IPI_mask(const struct cpumask *mask, int vector) -{ - default_send_IPI_mask_sequence_logical(mask, vector); -} - -static inline void numaq_send_IPI_allbutself(int vector) -{ - default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector); -} - -static inline void numaq_send_IPI_all(int vector) -{ - numaq_send_IPI_mask(cpu_online_mask, vector); -} - -#define NUMAQ_TRAMPOLINE_PHYS_LOW (0x8) -#define NUMAQ_TRAMPOLINE_PHYS_HIGH (0xa) - -/* - * Because we use NMIs rather than the INIT-STARTUP sequence to - * bootstrap the CPUs, the APIC may be in a weird state. Kick it: - */ -static inline void numaq_smp_callin_clear_local_apic(void) -{ - clear_local_APIC(); -} - -static inline const struct cpumask *numaq_target_cpus(void) -{ - return cpu_all_mask; -} - -static unsigned long numaq_check_apicid_used(physid_mask_t *map, int apicid) -{ - return physid_isset(apicid, *map); -} - -static inline unsigned long numaq_check_apicid_present(int bit) -{ - return physid_isset(bit, phys_cpu_present_map); -} - -static inline int numaq_apic_id_registered(void) -{ - return 1; -} - -static inline void numaq_init_apic_ldr(void) -{ - /* Already done in NUMA-Q firmware */ -} - -static inline void numaq_setup_apic_routing(void) -{ - printk(KERN_INFO - "Enabling APIC mode: NUMA-Q. Using %d I/O APICs\n", - nr_ioapics); -} - -/* - * Skip adding the timer int on secondary nodes, which causes - * a small but painful rift in the time-space continuum. - */ -static inline int numaq_multi_timer_check(int apic, int irq) -{ - return apic != 0 && irq == 0; -} - -static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) -{ - /* We don't have a good way to do this yet - hack */ - return physids_promote(0xFUL, retmap); -} - -/* - * Supporting over 60 cpus on NUMA-Q requires a locality-dependent - * cpu to APIC ID relation to properly interact with the intelligent - * mode of the cluster controller. - */ -static inline int numaq_cpu_present_to_apicid(int mps_cpu) -{ - if (mps_cpu < 60) - return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3)); - else - return BAD_APICID; -} - -static inline int numaq_apicid_to_node(int logical_apicid) -{ - return logical_apicid >> 4; -} - -static int numaq_numa_cpu_node(int cpu) -{ - int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); - - if (logical_apicid != BAD_APICID) - return numaq_apicid_to_node(logical_apicid); - return NUMA_NO_NODE; -} - -static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap) -{ - int node = numaq_apicid_to_node(logical_apicid); - int cpu = __ffs(logical_apicid & 0xf); - - physid_set_mask_of_physid(cpu + 4*node, retmap); -} - -/* Where the IO area was mapped on multiquad, always 0 otherwise */ -void *xquad_portio; - -static inline int numaq_check_phys_apicid_present(int phys_apicid) -{ - return 1; -} - -/* - * We use physical apicids here, not logical, so just return the default - * physical broadcast to stop people from breaking us - */ -static int -numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask, - unsigned int *apicid) -{ - *apicid = 0x0F; - return 0; -} - -/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */ -static inline int numaq_phys_pkg_id(int cpuid_apic, int index_msb) -{ - return cpuid_apic >> index_msb; -} - -static int -numaq_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) -{ - if (strncmp(oem, "IBM NUMA", 8)) - printk(KERN_ERR "Warning! Not a NUMA-Q system!\n"); - else - found_numaq = 1; - - return found_numaq; -} - -static int probe_numaq(void) -{ - /* already know from get_memcfg_numaq() */ - return found_numaq; -} - -static void numaq_setup_portio_remap(void) -{ - int num_quads = num_online_nodes(); - - if (num_quads <= 1) - return; - - printk(KERN_INFO - "Remapping cross-quad port I/O for %d quads\n", num_quads); - - xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD); - - printk(KERN_INFO - "xquad_portio vaddr 0x%08lx, len %08lx\n", - (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); -} - -/* Use __refdata to keep false positive warning calm. */ -static struct apic __refdata apic_numaq = { - - .name = "NUMAQ", - .probe = probe_numaq, - .acpi_madt_oem_check = NULL, - .apic_id_valid = default_apic_id_valid, - .apic_id_registered = numaq_apic_id_registered, - - .irq_delivery_mode = dest_LowestPrio, - /* physical delivery on LOCAL quad: */ - .irq_dest_mode = 0, - - .target_cpus = numaq_target_cpus, - .disable_esr = 1, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = numaq_check_apicid_used, - .check_apicid_present = numaq_check_apicid_present, - - .vector_allocation_domain = flat_vector_allocation_domain, - .init_apic_ldr = numaq_init_apic_ldr, - - .ioapic_phys_id_map = numaq_ioapic_phys_id_map, - .setup_apic_routing = numaq_setup_apic_routing, - .multi_timer_check = numaq_multi_timer_check, - .cpu_present_to_apicid = numaq_cpu_present_to_apicid, - .apicid_to_cpu_present = numaq_apicid_to_cpu_present, - .setup_portio_remap = numaq_setup_portio_remap, - .check_phys_apicid_present = numaq_check_phys_apicid_present, - .enable_apic_mode = NULL, - .phys_pkg_id = numaq_phys_pkg_id, - .mps_oem_check = numaq_mps_oem_check, - - .get_apic_id = numaq_get_apic_id, - .set_apic_id = NULL, - .apic_id_mask = 0x0F << 24, - - .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and, - - .send_IPI_mask = numaq_send_IPI_mask, - .send_IPI_mask_allbutself = NULL, - .send_IPI_allbutself = numaq_send_IPI_allbutself, - .send_IPI_all = numaq_send_IPI_all, - .send_IPI_self = default_send_IPI_self, - - .wakeup_secondary_cpu = wakeup_secondary_cpu_via_nmi, - .trampoline_phys_low = NUMAQ_TRAMPOLINE_PHYS_LOW, - .trampoline_phys_high = NUMAQ_TRAMPOLINE_PHYS_HIGH, - - /* We don't do anything here because we use NMI's to boot instead */ - .wait_for_init_deassert = false, - .smp_callin_clear_local_apic = numaq_smp_callin_clear_local_apic, - .inquire_remote_apic = NULL, - - .read = native_apic_mem_read, - .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, - .icr_read = native_apic_icr_read, - .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, - - .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, - .x86_32_numa_cpu_node = numaq_numa_cpu_node, -}; - -apic_driver(apic_numaq); diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c deleted file mode 100644 index b656128611c..00000000000 --- a/arch/x86/kernel/apic/summit_32.c +++ /dev/null @@ -1,550 +0,0 @@ -/* - * IBM Summit-Specific Code - * - * Written By: Matthew Dobson, IBM Corporation - * - * Copyright (c) 2003 IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send feedback to <colpatch@us.ibm.com> - * - */ - -#define pr_fmt(fmt) "summit: %s: " fmt, __func__ - -#include <linux/mm.h> -#include <asm/io.h> -#include <asm/bios_ebda.h> - -/* - * APIC driver for the IBM "Summit" chipset. - */ -#include <linux/threads.h> -#include <linux/cpumask.h> -#include <asm/mpspec.h> -#include <asm/apic.h> -#include <asm/smp.h> -#include <asm/fixmap.h> -#include <asm/apicdef.h> -#include <asm/ipi.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/gfp.h> -#include <linux/smp.h> - -static unsigned summit_get_apic_id(unsigned long x) -{ - return (x >> 24) & 0xFF; -} - -static inline void summit_send_IPI_mask(const struct cpumask *mask, int vector) -{ - default_send_IPI_mask_sequence_logical(mask, vector); -} - -static void summit_send_IPI_allbutself(int vector) -{ - default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector); -} - -static void summit_send_IPI_all(int vector) -{ - summit_send_IPI_mask(cpu_online_mask, vector); -} - -#include <asm/tsc.h> - -extern int use_cyclone; - -#ifdef CONFIG_X86_SUMMIT_NUMA -static void setup_summit(void); -#else -static inline void setup_summit(void) {} -#endif - -static int summit_mps_oem_check(struct mpc_table *mpc, char *oem, - char *productid) -{ - if (!strncmp(oem, "IBM ENSW", 8) && - (!strncmp(productid, "VIGIL SMP", 9) - || !strncmp(productid, "EXA", 3) - || !strncmp(productid, "RUTHLESS SMP", 12))){ - mark_tsc_unstable("Summit based system"); - use_cyclone = 1; /*enable cyclone-timer*/ - setup_summit(); - return 1; - } - return 0; -} - -/* Hook from generic ACPI tables.c */ -static int summit_acpi_madt_oem_check(char *oem_id, char *oem_table_id) -{ - if (!strncmp(oem_id, "IBM", 3) && - (!strncmp(oem_table_id, "SERVIGIL", 8) - || !strncmp(oem_table_id, "EXA", 3))){ - mark_tsc_unstable("Summit based system"); - use_cyclone = 1; /*enable cyclone-timer*/ - setup_summit(); - return 1; - } - return 0; -} - -struct rio_table_hdr { - unsigned char version; /* Version number of this data structure */ - /* Version 3 adds chassis_num & WP_index */ - unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */ - unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */ -} __attribute__((packed)); - -struct scal_detail { - unsigned char node_id; /* Scalability Node ID */ - unsigned long CBAR; /* Address of 1MB register space */ - unsigned char port0node; /* Node ID port connected to: 0xFF=None */ - unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */ - unsigned char port1node; /* Node ID port connected to: 0xFF = None */ - unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */ - unsigned char port2node; /* Node ID port connected to: 0xFF = None */ - unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */ - unsigned char chassis_num; /* 1 based Chassis number (1 = boot node) */ -} __attribute__((packed)); - -struct rio_detail { - unsigned char node_id; /* RIO Node ID */ - unsigned long BBAR; /* Address of 1MB register space */ - unsigned char type; /* Type of device */ - unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/ - /* For CYC: Node ID of Twister that owns this CYC */ - unsigned char port0node; /* Node ID port connected to: 0xFF=None */ - unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */ - unsigned char port1node; /* Node ID port connected to: 0xFF=None */ - unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */ - unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */ - /* For CYC: 0 */ - unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */ - /* = 0 : the XAPIC is not used, ie:*/ - /* ints fwded to another XAPIC */ - /* Bits1:7 Reserved */ - /* For CYC: Bits0:7 Reserved */ - unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */ - /* lower slot numbers/PCI bus numbers */ - /* For CYC: No meaning */ - unsigned char chassis_num; /* 1 based Chassis number */ - /* For LookOut WPEGs this field indicates the */ - /* Expansion Chassis #, enumerated from Boot */ - /* Node WPEG external port, then Boot Node CYC */ - /* external port, then Next Vigil chassis WPEG */ - /* external port, etc. */ - /* Shared Lookouts have only 1 chassis number (the */ - /* first one assigned) */ -} __attribute__((packed)); - - -typedef enum { - CompatTwister = 0, /* Compatibility Twister */ - AltTwister = 1, /* Alternate Twister of internal 8-way */ - CompatCyclone = 2, /* Compatibility Cyclone */ - AltCyclone = 3, /* Alternate Cyclone of internal 8-way */ - CompatWPEG = 4, /* Compatibility WPEG */ - AltWPEG = 5, /* Second Planar WPEG */ - LookOutAWPEG = 6, /* LookOut WPEG */ - LookOutBWPEG = 7, /* LookOut WPEG */ -} node_type; - -static inline int is_WPEG(struct rio_detail *rio){ - return (rio->type == CompatWPEG || rio->type == AltWPEG || - rio->type == LookOutAWPEG || rio->type == LookOutBWPEG); -} - -#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER) - -static const struct cpumask *summit_target_cpus(void) -{ - /* CPU_MASK_ALL (0xff) has undefined behaviour with - * dest_LowestPrio mode logical clustered apic interrupt routing - * Just start on cpu 0. IRQ balancing will spread load - */ - return cpumask_of(0); -} - -static unsigned long summit_check_apicid_used(physid_mask_t *map, int apicid) -{ - return 0; -} - -/* we don't use the phys_cpu_present_map to indicate apicid presence */ -static unsigned long summit_check_apicid_present(int bit) -{ - return 1; -} - -static int summit_early_logical_apicid(int cpu) -{ - int count = 0; - u8 my_id = early_per_cpu(x86_cpu_to_apicid, cpu); - u8 my_cluster = APIC_CLUSTER(my_id); -#ifdef CONFIG_SMP - u8 lid; - int i; - - /* Create logical APIC IDs by counting CPUs already in cluster. */ - for (count = 0, i = nr_cpu_ids; --i >= 0; ) { - lid = early_per_cpu(x86_cpu_to_logical_apicid, i); - if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) - ++count; - } -#endif - /* We only have a 4 wide bitmap in cluster mode. If a deranged - * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */ - BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT); - return my_cluster | (1UL << count); -} - -static void summit_init_apic_ldr(void) -{ - int cpu = smp_processor_id(); - unsigned long id = early_per_cpu(x86_cpu_to_logical_apicid, cpu); - unsigned long val; - - apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE); - val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; - val |= SET_APIC_LOGICAL_ID(id); - apic_write(APIC_LDR, val); -} - -static int summit_apic_id_registered(void) -{ - return 1; -} - -static void summit_setup_apic_routing(void) -{ - pr_info("Enabling APIC mode: Summit. Using %d I/O APICs\n", - nr_ioapics); -} - -static int summit_cpu_present_to_apicid(int mps_cpu) -{ - if (mps_cpu < nr_cpu_ids) - return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); - else - return BAD_APICID; -} - -static void summit_ioapic_phys_id_map(physid_mask_t *phys_id_map, physid_mask_t *retmap) -{ - /* For clustered we don't have a good way to do this yet - hack */ - physids_promote(0x0FL, retmap); -} - -static void summit_apicid_to_cpu_present(int apicid, physid_mask_t *retmap) -{ - physid_set_mask_of_physid(0, retmap); -} - -static int summit_check_phys_apicid_present(int physical_apicid) -{ - return 1; -} - -static inline int -summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) -{ - unsigned int round = 0; - unsigned int cpu, apicid = 0; - - /* - * The cpus in the mask must all be on the apic cluster. - */ - for_each_cpu_and(cpu, cpumask, cpu_online_mask) { - int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); - - if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { - pr_err("Not a valid mask!\n"); - return -EINVAL; - } - apicid |= new_apicid; - round++; - } - if (!round) - return -EINVAL; - *dest_id = apicid; - return 0; -} - -static int -summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, - const struct cpumask *andmask, - unsigned int *apicid) -{ - cpumask_var_t cpumask; - *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); - - if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) - return 0; - - cpumask_and(cpumask, inmask, andmask); - summit_cpu_mask_to_apicid(cpumask, apicid); - - free_cpumask_var(cpumask); - - return 0; -} - -/* - * cpuid returns the value latched in the HW at reset, not the APIC ID - * register's value. For any box whose BIOS changes APIC IDs, like - * clustered APIC systems, we must use hard_smp_processor_id. - * - * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID. - */ -static int summit_phys_pkg_id(int cpuid_apic, int index_msb) -{ - return hard_smp_processor_id() >> index_msb; -} - -static int probe_summit(void) -{ - /* probed later in mptable/ACPI hooks */ - return 0; -} - -#ifdef CONFIG_X86_SUMMIT_NUMA -static struct rio_table_hdr *rio_table_hdr; -static struct scal_detail *scal_devs[MAX_NUMNODES]; -static struct rio_detail *rio_devs[MAX_NUMNODES*4]; - -#ifndef CONFIG_X86_NUMAQ -static int mp_bus_id_to_node[MAX_MP_BUSSES]; -#endif - -static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) -{ - int twister = 0, node = 0; - int i, bus, num_buses; - - for (i = 0; i < rio_table_hdr->num_rio_dev; i++) { - if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id) { - twister = rio_devs[i]->owner_id; - break; - } - } - if (i == rio_table_hdr->num_rio_dev) { - pr_err("Couldn't find owner Cyclone for Winnipeg!\n"); - return last_bus; - } - - for (i = 0; i < rio_table_hdr->num_scal_dev; i++) { - if (scal_devs[i]->node_id == twister) { - node = scal_devs[i]->node_id; - break; - } - } - if (i == rio_table_hdr->num_scal_dev) { - pr_err("Couldn't find owner Twister for Cyclone!\n"); - return last_bus; - } - - switch (rio_devs[wpeg_num]->type) { - case CompatWPEG: - /* - * The Compatibility Winnipeg controls the 2 legacy buses, - * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case - * a PCI-PCI bridge card is used in either slot: total 5 buses. - */ - num_buses = 5; - break; - case AltWPEG: - /* - * The Alternate Winnipeg controls the 2 133MHz buses [1 slot - * each], their 2 "extra" buses, the 100MHz bus [2 slots] and - * the "extra" buses for each of those slots: total 7 buses. - */ - num_buses = 7; - break; - case LookOutAWPEG: - case LookOutBWPEG: - /* - * A Lookout Winnipeg controls 3 100MHz buses [2 slots each] - * & the "extra" buses for each of those slots: total 9 buses. - */ - num_buses = 9; - break; - default: - pr_info("Unsupported Winnipeg type!\n"); - return last_bus; - } - - for (bus = last_bus; bus < last_bus + num_buses; bus++) - mp_bus_id_to_node[bus] = node; - return bus; -} - -static int build_detail_arrays(void) -{ - unsigned long ptr; - int i, scal_detail_size, rio_detail_size; - - if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) { - pr_warn("MAX_NUMNODES too low! Defined as %d, but system has %d nodes\n", - MAX_NUMNODES, rio_table_hdr->num_scal_dev); - return 0; - } - - switch (rio_table_hdr->version) { - default: - pr_warn("Invalid Rio Grande Table Version: %d\n", - rio_table_hdr->version); - return 0; - case 2: - scal_detail_size = 11; - rio_detail_size = 13; - break; - case 3: - scal_detail_size = 12; - rio_detail_size = 15; - break; - } - - ptr = (unsigned long)rio_table_hdr + 3; - for (i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size) - scal_devs[i] = (struct scal_detail *)ptr; - - for (i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size) - rio_devs[i] = (struct rio_detail *)ptr; - - return 1; -} - -void setup_summit(void) -{ - unsigned long ptr; - unsigned short offset; - int i, next_wpeg, next_bus = 0; - - /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */ - ptr = get_bios_ebda(); - ptr = (unsigned long)phys_to_virt(ptr); - - rio_table_hdr = NULL; - offset = 0x180; - while (offset) { - /* The block id is stored in the 2nd word */ - if (*((unsigned short *)(ptr + offset + 2)) == 0x4752) { - /* set the pointer past the offset & block id */ - rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4); - break; - } - /* The next offset is stored in the 1st word. 0 means no more */ - offset = *((unsigned short *)(ptr + offset)); - } - if (!rio_table_hdr) { - pr_err("Unable to locate Rio Grande Table in EBDA - bailing!\n"); - return; - } - - if (!build_detail_arrays()) - return; - - /* The first Winnipeg we're looking for has an index of 0 */ - next_wpeg = 0; - do { - for (i = 0; i < rio_table_hdr->num_rio_dev; i++) { - if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg) { - /* It's the Winnipeg we're looking for! */ - next_bus = setup_pci_node_map_for_wpeg(i, next_bus); - next_wpeg++; - break; - } - } - /* - * If we go through all Rio devices and don't find one with - * the next index, it means we've found all the Winnipegs, - * and thus all the PCI buses. - */ - if (i == rio_table_hdr->num_rio_dev) - next_wpeg = 0; - } while (next_wpeg != 0); -} -#endif - -static struct apic apic_summit = { - - .name = "summit", - .probe = probe_summit, - .acpi_madt_oem_check = summit_acpi_madt_oem_check, - .apic_id_valid = default_apic_id_valid, - .apic_id_registered = summit_apic_id_registered, - - .irq_delivery_mode = dest_LowestPrio, - /* logical delivery broadcast to all CPUs: */ - .irq_dest_mode = 1, - - .target_cpus = summit_target_cpus, - .disable_esr = 1, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = summit_check_apicid_used, - .check_apicid_present = summit_check_apicid_present, - - .vector_allocation_domain = flat_vector_allocation_domain, - .init_apic_ldr = summit_init_apic_ldr, - - .ioapic_phys_id_map = summit_ioapic_phys_id_map, - .setup_apic_routing = summit_setup_apic_routing, - .multi_timer_check = NULL, - .cpu_present_to_apicid = summit_cpu_present_to_apicid, - .apicid_to_cpu_present = summit_apicid_to_cpu_present, - .setup_portio_remap = NULL, - .check_phys_apicid_present = summit_check_phys_apicid_present, - .enable_apic_mode = NULL, - .phys_pkg_id = summit_phys_pkg_id, - .mps_oem_check = summit_mps_oem_check, - - .get_apic_id = summit_get_apic_id, - .set_apic_id = NULL, - .apic_id_mask = 0xFF << 24, - - .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and, - - .send_IPI_mask = summit_send_IPI_mask, - .send_IPI_mask_allbutself = NULL, - .send_IPI_allbutself = summit_send_IPI_allbutself, - .send_IPI_all = summit_send_IPI_all, - .send_IPI_self = default_send_IPI_self, - - .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, - .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - - .wait_for_init_deassert = true, - .smp_callin_clear_local_apic = NULL, - .inquire_remote_apic = default_inquire_remote_apic, - - .read = native_apic_mem_read, - .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, - .icr_read = native_apic_icr_read, - .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, - - .x86_32_early_logical_apicid = summit_early_logical_apicid, -}; - -apic_driver(apic_summit); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 3ab03430211..f3a1f04ed4c 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -844,21 +844,10 @@ static int apm_do_idle(void) int polling; int err = 0; - polling = !!(current_thread_info()->status & TS_POLLING); - if (polling) { - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we - * test NEED_RESCHED: - */ - smp_mb(); - } if (!need_resched()) { idled = 1; ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err); } - if (polling) - current_thread_info()->status |= TS_POLLING; if (!idled) return 0; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8e28bf2fc3e..a135239badb 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1025,7 +1025,8 @@ __setup("show_msr=", setup_show_msr); static __init int setup_noclflush(char *arg) { - setup_clear_cpu_cap(X86_FEATURE_CLFLSH); + setup_clear_cpu_cap(X86_FEATURE_CLFLUSH); + setup_clear_cpu_cap(X86_FEATURE_CLFLUSHOPT); return 1; } __setup("noclflush", setup_noclflush); @@ -1078,6 +1079,10 @@ static __init int setup_disablecpuid(char *arg) } __setup("clearcpuid=", setup_disablecpuid); +DEFINE_PER_CPU(unsigned long, kernel_stack) = + (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; +EXPORT_PER_CPU_SYMBOL(kernel_stack); + #ifdef CONFIG_X86_64 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1, @@ -1094,10 +1099,6 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(unsigned long, kernel_stack) = - (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; -EXPORT_PER_CPU_SYMBOL(kernel_stack); - DEFINE_PER_CPU(char *, irq_stack_ptr) = init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 897d6201ef1..a80029035bf 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -274,10 +274,6 @@ static void intel_workarounds(struct cpuinfo_x86 *c) } #endif -#ifdef CONFIG_X86_NUMAQ - numaq_tsc_disable(); -#endif - intel_smp_check(c); } #else diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 0641113e296..a952e9c85b6 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -1225,21 +1225,24 @@ static struct notifier_block cacheinfo_cpu_notifier = { static int __init cache_sysfs_init(void) { - int i; + int i, err = 0; if (num_cache_leaves == 0) return 0; + cpu_notifier_register_begin(); for_each_online_cpu(i) { - int err; struct device *dev = get_cpu_device(i); err = cache_add_dev(dev); if (err) - return err; + goto out; } - register_hotcpu_notifier(&cacheinfo_cpu_notifier); - return 0; + __register_hotcpu_notifier(&cacheinfo_cpu_notifier); + +out: + cpu_notifier_register_done(); + return err; } device_initcall(cache_sysfs_init); diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 36565373af8..afa9f0d487e 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -47,45 +47,3 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) return NULL; } EXPORT_SYMBOL(x86_match_cpu); - -ssize_t arch_print_cpu_modalias(struct device *dev, - struct device_attribute *attr, - char *bufptr) -{ - int size = PAGE_SIZE; - int i, n; - char *buf = bufptr; - - n = snprintf(buf, size, "x86cpu:vendor:%04X:family:%04X:" - "model:%04X:feature:", - boot_cpu_data.x86_vendor, - boot_cpu_data.x86, - boot_cpu_data.x86_model); - size -= n; - buf += n; - size -= 1; - for (i = 0; i < NCAPINTS*32; i++) { - if (boot_cpu_has(i)) { - n = snprintf(buf, size, ",%04X", i); - if (n >= size) { - WARN(1, "x86 features overflow page\n"); - break; - } - size -= n; - buf += n; - } - } - *buf++ = '\n'; - return buf - bufptr; -} - -int arch_cpu_uevent(struct device *dev, struct kobj_uevent_env *env) -{ - char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (buf) { - arch_print_cpu_modalias(NULL, NULL, buf); - add_uevent_var(env, "MODALIAS=%s", buf); - kfree(buf); - } - return 0; -} diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 4d5419b249d..6cc800381d1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -89,6 +89,9 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; +/* CMCI storm detection filter */ +static DEFINE_PER_CPU(unsigned long, mce_polled_error); + /* * MCA banks polled by the period polling timer for corrected events. * With Intel CMCI, this only has MCA banks which do not support CMCI (if any). @@ -614,6 +617,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!(m.status & MCI_STATUS_VAL)) continue; + this_cpu_write(mce_polled_error, 1); /* * Uncorrected or signalled events are handled by the exception * handler when it is enabled, so don't process those here. @@ -700,8 +704,7 @@ static int mce_timed_out(u64 *t) if (!mca_cfg.monarch_timeout) goto out; if ((s64)*t < SPINUNIT) { - /* CHECKME: Make panic default for 1 too? */ - if (mca_cfg.tolerant < 1) + if (mca_cfg.tolerant <= 1) mce_panic("Timeout synchronizing machine check over CPUs", NULL, NULL); cpu_missing = 1; @@ -1278,10 +1281,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval) static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; +static int cmc_error_seen(void) +{ + unsigned long *v = &__get_cpu_var(mce_polled_error); + + return test_and_clear_bit(0, v); +} + static void mce_timer_fn(unsigned long data) { struct timer_list *t = &__get_cpu_var(mce_timer); unsigned long iv; + int notify; WARN_ON(smp_processor_id() != data); @@ -1296,7 +1307,9 @@ static void mce_timer_fn(unsigned long data) * polling interval, otherwise increase the polling interval. */ iv = __this_cpu_read(mce_next_interval); - if (mce_notify_irq()) { + notify = mce_notify_irq(); + notify |= cmc_error_seen(); + if (notify) { iv = max(iv / 2, (unsigned long) HZ/100); } else { iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); @@ -2423,28 +2436,65 @@ static __init int mcheck_init_device(void) int err; int i = 0; - if (!mce_available(&boot_cpu_data)) - return -EIO; + if (!mce_available(&boot_cpu_data)) { + err = -EIO; + goto err_out; + } - zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); + if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) { + err = -ENOMEM; + goto err_out; + } mce_init_banks(); err = subsys_system_register(&mce_subsys, NULL); if (err) - return err; + goto err_out_mem; + cpu_notifier_register_begin(); for_each_online_cpu(i) { err = mce_device_create(i); - if (err) - return err; + if (err) { + cpu_notifier_register_done(); + goto err_device_create; + } } + __register_hotcpu_notifier(&mce_cpu_notifier); + cpu_notifier_register_done(); + register_syscore_ops(&mce_syscore_ops); - register_hotcpu_notifier(&mce_cpu_notifier); /* register character device /dev/mcelog */ - misc_register(&mce_chrdev_device); + err = misc_register(&mce_chrdev_device); + if (err) + goto err_register; + + return 0; + +err_register: + unregister_syscore_ops(&mce_syscore_ops); + + cpu_notifier_register_begin(); + __unregister_hotcpu_notifier(&mce_cpu_notifier); + cpu_notifier_register_done(); + +err_device_create: + /* + * We didn't keep track of which devices were created above, but + * even if we had, the set of online cpus might have changed. + * Play safe and remove for every possible cpu, since + * mce_device_remove() will do the right thing. + */ + for_each_possible_cpu(i) + mce_device_remove(i); + +err_out_mem: + free_cpumask_var(mce_device_initialized); + +err_out: + pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); return err; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index fb6156fee6f..9a316b21df8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -9,6 +9,7 @@ #include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/sched.h> +#include <linux/cpumask.h> #include <asm/apic.h> #include <asm/processor.h> #include <asm/msr.h> @@ -41,7 +42,7 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); * cmci_discover_lock protects against parallel discovery attempts * which could race against each other. */ -static DEFINE_RAW_SPINLOCK(cmci_discover_lock); +static DEFINE_SPINLOCK(cmci_discover_lock); #define CMCI_THRESHOLD 1 #define CMCI_POLL_INTERVAL (30 * HZ) @@ -137,6 +138,22 @@ unsigned long mce_intel_adjust_timer(unsigned long interval) } } +static void cmci_storm_disable_banks(void) +{ + unsigned long flags, *owned; + int bank; + u64 val; + + spin_lock_irqsave(&cmci_discover_lock, flags); + owned = __get_cpu_var(mce_banks_owned); + for_each_set_bit(bank, owned, MAX_NR_BANKS) { + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + val &= ~MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + } + spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + static bool cmci_storm_detect(void) { unsigned int cnt = __this_cpu_read(cmci_storm_cnt); @@ -158,7 +175,7 @@ static bool cmci_storm_detect(void) if (cnt <= CMCI_STORM_THRESHOLD) return false; - cmci_clear(); + cmci_storm_disable_banks(); __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); r = atomic_add_return(1, &cmci_storm_on_cpus); mce_timer_kick(CMCI_POLL_INTERVAL); @@ -194,7 +211,7 @@ static void cmci_discover(int banks) int i; int bios_wrong_thresh = 0; - raw_spin_lock_irqsave(&cmci_discover_lock, flags); + spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) { u64 val; int bios_zero_thresh = 0; @@ -249,7 +266,7 @@ static void cmci_discover(int banks) WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); } } - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + spin_unlock_irqrestore(&cmci_discover_lock, flags); if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) { pr_info_once( "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); @@ -299,10 +316,10 @@ void cmci_clear(void) if (!cmci_supported(&banks)) return; - raw_spin_lock_irqsave(&cmci_discover_lock, flags); + spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) __cmci_disable_bank(i); - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + spin_unlock_irqrestore(&cmci_discover_lock, flags); } static void cmci_rediscover_work_func(void *arg) @@ -343,9 +360,9 @@ void cmci_disable_bank(int bank) if (!cmci_supported(&banks)) return; - raw_spin_lock_irqsave(&cmci_discover_lock, flags); + spin_lock_irqsave(&cmci_discover_lock, flags); __cmci_disable_bank(bank); - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + spin_unlock_irqrestore(&cmci_discover_lock, flags); } static void intel_init_cmci(void) diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 3eec7de76ef..36a1bb6d1ee 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -271,9 +271,6 @@ static void thermal_throttle_remove_dev(struct device *dev) sysfs_remove_group(&dev->kobj, &thermal_attr_group); } -/* Mutex protecting device creation against CPU hotplug: */ -static DEFINE_MUTEX(therm_cpu_lock); - /* Get notified when a cpu comes on/off. Be hotplug friendly. */ static int thermal_throttle_cpu_callback(struct notifier_block *nfb, @@ -289,18 +286,14 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - mutex_lock(&therm_cpu_lock); err = thermal_throttle_add_dev(dev, cpu); - mutex_unlock(&therm_cpu_lock); WARN_ON(err); break; case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - mutex_lock(&therm_cpu_lock); thermal_throttle_remove_dev(dev); - mutex_unlock(&therm_cpu_lock); break; } return notifier_from_errno(err); @@ -319,19 +312,16 @@ static __init int thermal_throttle_init_device(void) if (!atomic_read(&therm_throt_en)) return 0; - register_hotcpu_notifier(&thermal_throttle_cpu_notifier); + cpu_notifier_register_begin(); -#ifdef CONFIG_HOTPLUG_CPU - mutex_lock(&therm_cpu_lock); -#endif /* connect live CPUs to sysfs */ for_each_online_cpu(cpu) { err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu); WARN_ON(err); } -#ifdef CONFIG_HOTPLUG_CPU - mutex_unlock(&therm_cpu_lock); -#endif + + __register_hotcpu_notifier(&thermal_throttle_cpu_notifier); + cpu_notifier_register_done(); return 0; } @@ -439,14 +429,14 @@ static inline void __smp_thermal_interrupt(void) smp_thermal_vector(); } -asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs) { entering_irq(); __smp_thermal_interrupt(); exiting_ack_irq(); } -asmlinkage void smp_trace_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs) { entering_irq(); trace_thermal_apic_entry(THERMAL_APIC_VECTOR); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index fe6b1c86645..7245980186e 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -24,14 +24,14 @@ static inline void __smp_threshold_interrupt(void) mce_threshold_vector(); } -asmlinkage void smp_threshold_interrupt(void) +asmlinkage __visible void smp_threshold_interrupt(void) { entering_irq(); __smp_threshold_interrupt(); exiting_ack_irq(); } -asmlinkage void smp_trace_threshold_interrupt(void) +asmlinkage __visible void smp_trace_threshold_interrupt(void) { entering_irq(); trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 15c987698b0..dd9d6190b08 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -97,6 +97,9 @@ MODULE_LICENSE("GPL"); static struct microcode_ops *microcode_ops; +bool dis_ucode_ldr; +module_param(dis_ucode_ldr, bool, 0); + /* * Synchronization. * @@ -546,6 +549,9 @@ static int __init microcode_init(void) struct cpuinfo_x86 *c = &cpu_data(0); int error; + if (dis_ucode_ldr) + return 0; + if (c->x86_vendor == X86_VENDOR_INTEL) microcode_ops = init_intel_microcode(); else if (c->x86_vendor == X86_VENDOR_AMD) diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c index be7f8514f57..5f28a64e71e 100644 --- a/arch/x86/kernel/cpu/microcode/core_early.c +++ b/arch/x86/kernel/cpu/microcode/core_early.c @@ -17,9 +17,11 @@ * 2 of the License, or (at your option) any later version. */ #include <linux/module.h> +#include <asm/microcode.h> #include <asm/microcode_intel.h> #include <asm/microcode_amd.h> #include <asm/processor.h> +#include <asm/cmdline.h> #define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) #define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') @@ -72,10 +74,33 @@ static int x86_family(void) return x86; } +static bool __init check_loader_disabled_bsp(void) +{ +#ifdef CONFIG_X86_32 + const char *cmdline = (const char *)__pa_nodebug(boot_command_line); + const char *opt = "dis_ucode_ldr"; + const char *option = (const char *)__pa_nodebug(opt); + bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); + +#else /* CONFIG_X86_64 */ + const char *cmdline = boot_command_line; + const char *option = "dis_ucode_ldr"; + bool *res = &dis_ucode_ldr; +#endif + + if (cmdline_find_option_bool(cmdline, option)) + *res = true; + + return *res; +} + void __init load_ucode_bsp(void) { int vendor, x86; + if (check_loader_disabled_bsp()) + return; + if (!have_cpuid_p()) return; @@ -96,10 +121,22 @@ void __init load_ucode_bsp(void) } } +static bool check_loader_disabled_ap(void) +{ +#ifdef CONFIG_X86_32 + return __pa_nodebug(dis_ucode_ldr); +#else + return dis_ucode_ldr; +#endif +} + void load_ucode_ap(void) { int vendor, x86; + if (check_loader_disabled_ap()) + return; + if (!have_cpuid_p()) return; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 832d05a914b..76f98fe5b35 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -17,6 +17,7 @@ #include <linux/hardirq.h> #include <linux/efi.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <asm/processor.h> #include <asm/hypervisor.h> #include <asm/hyperv.h> @@ -31,6 +32,45 @@ struct ms_hyperv_info ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); +#if IS_ENABLED(CONFIG_HYPERV) +static void (*vmbus_handler)(void); + +void hyperv_vector_handler(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + irq_enter(); + exit_idle(); + + inc_irq_stat(irq_hv_callback_count); + if (vmbus_handler) + vmbus_handler(); + + irq_exit(); + set_irq_regs(old_regs); +} + +void hv_setup_vmbus_irq(void (*handler)(void)) +{ + vmbus_handler = handler; + /* + * Setup the IDT for hypervisor callback. Prevent reallocation + * at module reload. + */ + if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, + hyperv_callback_vector); +} + +void hv_remove_vmbus_irq(void) +{ + /* We have no way to deallocate the interrupt gate */ + vmbus_handler = NULL; +} +EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq); +EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); +#endif + static uint32_t __init ms_hyperv_platform(void) { u32 eax; @@ -119,41 +159,3 @@ const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { .init_platform = ms_hyperv_init_platform, }; EXPORT_SYMBOL(x86_hyper_ms_hyperv); - -#if IS_ENABLED(CONFIG_HYPERV) -static int vmbus_irq = -1; -static irq_handler_t vmbus_isr; - -void hv_register_vmbus_handler(int irq, irq_handler_t handler) -{ - /* - * Setup the IDT for hypervisor callback. - */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector); - - vmbus_irq = irq; - vmbus_isr = handler; -} - -void hyperv_vector_handler(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - struct irq_desc *desc; - - irq_enter(); - exit_idle(); - - desc = irq_to_desc(vmbus_irq); - - if (desc) - generic_handle_irq_desc(vmbus_irq, desc); - - irq_exit(); - set_irq_regs(old_regs); -} -#else -void hv_register_vmbus_handler(int irq, irq_handler_t handler) -{ -} -#endif -EXPORT_SYMBOL_GPL(hv_register_vmbus_handler); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ae407f7226c..89f3b7c1af2 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -721,6 +721,7 @@ int perf_assign_events(struct perf_event **events, int n, return sched.state.unassigned; } +EXPORT_SYMBOL_GPL(perf_assign_events); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 4b8e4d3cd6e..4c36bbe3173 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -926,13 +926,13 @@ static __init int amd_ibs_init(void) goto out; perf_ibs_pm_init(); - get_online_cpus(); + cpu_notifier_register_begin(); ibs_caps = caps; /* make ibs_caps visible to other cpus: */ smp_mb(); - perf_cpu_notifier(perf_ibs_cpu_notifier); smp_call_function(setup_APIC_ibs, NULL, 1); - put_online_cpus(); + __perf_cpu_notifier(perf_ibs_cpu_notifier); + cpu_notifier_register_done(); ret = perf_event_ibs_init(); out: diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c index 754291adec3..3bbdf4cd38b 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c @@ -531,15 +531,16 @@ static int __init amd_uncore_init(void) if (ret) return -ENODEV; - get_online_cpus(); + cpu_notifier_register_begin(); + /* init cpus already online before registering for hotplug notifier */ for_each_online_cpu(cpu) { amd_uncore_cpu_up_prepare(cpu); smp_call_function_single(cpu, init_cpu_already_online, NULL, 1); } - register_cpu_notifier(&amd_uncore_cpu_notifier_block); - put_online_cpus(); + __register_cpu_notifier(&amd_uncore_cpu_notifier_block); + cpu_notifier_register_done(); return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index aa333d96688..adb02aa62af 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -169,7 +169,6 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */ EVENT_CONSTRAINT_END }; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index ae96cfa5edd..980970cb744 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -108,15 +108,31 @@ static u64 precise_store_data(u64 status) return val; } -static u64 precise_store_data_hsw(u64 status) +static u64 precise_store_data_hsw(struct perf_event *event, u64 status) { union perf_mem_data_src dse; + u64 cfg = event->hw.config & INTEL_ARCH_EVENT_MASK; dse.val = 0; dse.mem_op = PERF_MEM_OP_STORE; dse.mem_lvl = PERF_MEM_LVL_NA; + + /* + * L1 info only valid for following events: + * + * MEM_UOPS_RETIRED.STLB_MISS_STORES + * MEM_UOPS_RETIRED.LOCK_STORES + * MEM_UOPS_RETIRED.SPLIT_STORES + * MEM_UOPS_RETIRED.ALL_STORES + */ + if (cfg != 0x12d0 && cfg != 0x22d0 && cfg != 0x42d0 && cfg != 0x82d0) + return dse.mem_lvl; + if (status & 1) - dse.mem_lvl = PERF_MEM_LVL_L1; + dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; + else + dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS; + /* Nothing else supported. Sorry. */ return dse.val; } @@ -887,7 +903,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, data.data_src.val = load_latency_data(pebs->dse); else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) data.data_src.val = - precise_store_data_hsw(pebs->dse); + precise_store_data_hsw(event, pebs->dse); else data.data_src.val = precise_store_data(pebs->dse); } diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index 5ad35ad94d0..619f7699487 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -59,7 +59,7 @@ #define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */ #define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */ #define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ -#define RAPL_IDX_PP1_NRG_STAT 3 /* DRAM */ +#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ #define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ /* Clients have PP0, PKG */ @@ -72,6 +72,12 @@ 1<<RAPL_IDX_PKG_NRG_STAT|\ 1<<RAPL_IDX_RAM_NRG_STAT) +/* Servers have PP0, PKG, RAM, PP1 */ +#define RAPL_IDX_HSW (1<<RAPL_IDX_PP0_NRG_STAT|\ + 1<<RAPL_IDX_PKG_NRG_STAT|\ + 1<<RAPL_IDX_RAM_NRG_STAT|\ + 1<<RAPL_IDX_PP1_NRG_STAT) + /* * event code: LSB 8 bits, passed in attr->config * any other bit is reserved @@ -425,6 +431,24 @@ static struct attribute *rapl_events_cln_attr[] = { NULL, }; +static struct attribute *rapl_events_hsw_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_gpu), + EVENT_PTR(rapl_ram), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_gpu_unit), + EVENT_PTR(rapl_ram_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_gpu_scale), + EVENT_PTR(rapl_ram_scale), + NULL, +}; + static struct attribute_group rapl_pmu_events_group = { .name = "events", .attrs = NULL, /* patched at runtime */ @@ -511,6 +535,7 @@ static int rapl_cpu_prepare(int cpu) struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); int phys_id = topology_physical_package_id(cpu); u64 ms; + u64 msr_rapl_power_unit_bits; if (pmu) return 0; @@ -518,6 +543,10 @@ static int rapl_cpu_prepare(int cpu) if (phys_id < 0) return -1; + /* protect rdmsrl() to handle virtualization */ + if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) + return -1; + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); if (!pmu) return -1; @@ -531,8 +560,7 @@ static int rapl_cpu_prepare(int cpu) * * we cache in local PMU instance */ - rdmsrl(MSR_RAPL_POWER_UNIT, pmu->hw_unit); - pmu->hw_unit = (pmu->hw_unit >> 8) & 0x1FULL; + pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; pmu->pmu = &rapl_pmu_class; /* @@ -631,11 +659,14 @@ static int __init rapl_pmu_init(void) switch (boot_cpu_data.x86_model) { case 42: /* Sandy Bridge */ case 58: /* Ivy Bridge */ - case 60: /* Haswell */ - case 69: /* Haswell-Celeron */ rapl_cntr_mask = RAPL_IDX_CLN; rapl_pmu_events_group.attrs = rapl_events_cln_attr; break; + case 60: /* Haswell */ + case 69: /* Haswell-Celeron */ + rapl_cntr_mask = RAPL_IDX_HSW; + rapl_pmu_events_group.attrs = rapl_events_hsw_attr; + break; case 45: /* Sandy Bridge-EP */ case 62: /* IvyTown */ rapl_cntr_mask = RAPL_IDX_SRV; @@ -646,19 +677,22 @@ static int __init rapl_pmu_init(void) /* unsupported */ return 0; } - get_online_cpus(); + + cpu_notifier_register_begin(); for_each_online_cpu(cpu) { - rapl_cpu_prepare(cpu); + ret = rapl_cpu_prepare(cpu); + if (ret) + goto out; rapl_cpu_init(cpu); } - perf_cpu_notifier(rapl_cpu_notifier); + __perf_cpu_notifier(rapl_cpu_notifier); ret = perf_pmu_register(&rapl_pmu_class, "power", -1); if (WARN_ON(ret)) { pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret); - put_online_cpus(); + cpu_notifier_register_done(); return -1; } @@ -672,7 +706,8 @@ static int __init rapl_pmu_init(void) hweight32(rapl_cntr_mask), ktime_to_ms(pmu->timer_interval)); - put_online_cpus(); +out: + cpu_notifier_register_done(); return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index bd2253d40cf..65bbbea38b9 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -4244,7 +4244,7 @@ static void __init uncore_cpumask_init(void) if (!cpumask_empty(&uncore_cpu_mask)) return; - get_online_cpus(); + cpu_notifier_register_begin(); for_each_online_cpu(cpu) { int i, phys_id = topology_physical_package_id(cpu); @@ -4263,9 +4263,9 @@ static void __init uncore_cpumask_init(void) } on_each_cpu(uncore_cpu_setup, NULL, 1); - register_cpu_notifier(&uncore_cpu_nb); + __register_cpu_notifier(&uncore_cpu_nb); - put_online_cpus(); + cpu_notifier_register_done(); } diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index 384df5105fb..136ac74dee8 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -27,6 +27,7 @@ static int __init x86_rdrand_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_RDRAND); + setup_clear_cpu_cap(X86_FEATURE_RDSEED); return 1; } __setup("nordrand", x86_rdrand_setup); diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 7d9481c743f..3225ae6c518 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -198,14 +198,15 @@ static int __init cpuid_init(void) goto out_chrdev; } cpuid_class->devnode = cpuid_devnode; - get_online_cpus(); + + cpu_notifier_register_begin(); for_each_online_cpu(i) { err = cpuid_device_create(i); if (err != 0) goto out_class; } - register_hotcpu_notifier(&cpuid_class_cpu_notifier); - put_online_cpus(); + __register_hotcpu_notifier(&cpuid_class_cpu_notifier); + cpu_notifier_register_done(); err = 0; goto out; @@ -215,7 +216,7 @@ out_class: for_each_online_cpu(i) { cpuid_device_destroy(i); } - put_online_cpus(); + cpu_notifier_register_done(); class_destroy(cpuid_class); out_chrdev: __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); @@ -227,13 +228,13 @@ static void __exit cpuid_exit(void) { int cpu = 0; - get_online_cpus(); + cpu_notifier_register_begin(); for_each_online_cpu(cpu) cpuid_device_destroy(cpu); class_destroy(cpuid_class); __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); - unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); - put_online_cpus(); + __unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); + cpu_notifier_register_done(); } module_init(cpuid_init); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index a21d49c071d..5abd4cd4230 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -16,12 +16,35 @@ #include <asm/stacktrace.h> +static void *is_irq_stack(void *p, void *irq) +{ + if (p < irq || p >= (irq + THREAD_SIZE)) + return NULL; + return irq + THREAD_SIZE; +} + + +static void *is_hardirq_stack(unsigned long *stack, int cpu) +{ + void *irq = per_cpu(hardirq_stack, cpu); + + return is_irq_stack(stack, irq); +} + +static void *is_softirq_stack(unsigned long *stack, int cpu) +{ + void *irq = per_cpu(softirq_stack, cpu); + + return is_irq_stack(stack, irq); +} void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { + const unsigned cpu = get_cpu(); int graph = 0; + u32 *prev_esp; if (!task) task = current; @@ -39,18 +62,31 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, for (;;) { struct thread_info *context; + void *end_stack; + + end_stack = is_hardirq_stack(stack, cpu); + if (!end_stack) + end_stack = is_softirq_stack(stack, cpu); - context = (struct thread_info *) - ((unsigned long)stack & (~(THREAD_SIZE - 1))); - bp = ops->walk_stack(context, stack, bp, ops, data, NULL, &graph); + context = task_thread_info(task); + bp = ops->walk_stack(context, stack, bp, ops, data, + end_stack, &graph); - stack = (unsigned long *)context->previous_esp; + /* Stop if not on irq stack */ + if (!end_stack) + break; + + /* The previous esp is saved on the bottom of the stack */ + prev_esp = (u32 *)(end_stack - THREAD_SIZE); + stack = (unsigned long *)*prev_esp; if (!stack) break; + if (ops->stack(data, "IRQ") < 0) break; touch_nmi_watchdog(); } + put_cpu(); } EXPORT_SYMBOL(dump_trace); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index addb207dab9..1abcb50b48a 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -104,6 +104,44 @@ in_irq_stack(unsigned long *stack, unsigned long *irq_stack, return (stack >= irq_stack && stack < irq_stack_end); } +static const unsigned long irq_stack_size = + (IRQ_STACK_SIZE - 64) / sizeof(unsigned long); + +enum stack_type { + STACK_IS_UNKNOWN, + STACK_IS_NORMAL, + STACK_IS_EXCEPTION, + STACK_IS_IRQ, +}; + +static enum stack_type +analyze_stack(int cpu, struct task_struct *task, unsigned long *stack, + unsigned long **stack_end, unsigned long *irq_stack, + unsigned *used, char **id) +{ + unsigned long addr; + + addr = ((unsigned long)stack & (~(THREAD_SIZE - 1))); + if ((unsigned long)task_stack_page(task) == addr) + return STACK_IS_NORMAL; + + *stack_end = in_exception_stack(cpu, (unsigned long)stack, + used, id); + if (*stack_end) + return STACK_IS_EXCEPTION; + + if (!irq_stack) + return STACK_IS_NORMAL; + + *stack_end = irq_stack; + irq_stack = irq_stack - irq_stack_size; + + if (in_irq_stack(stack, irq_stack, *stack_end)) + return STACK_IS_IRQ; + + return STACK_IS_UNKNOWN; +} + /* * x86-64 can have up to three kernel stacks: * process stack @@ -116,12 +154,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, const struct stacktrace_ops *ops, void *data) { const unsigned cpu = get_cpu(); - unsigned long *irq_stack_end = - (unsigned long *)per_cpu(irq_stack_ptr, cpu); - unsigned used = 0; struct thread_info *tinfo; - int graph = 0; + unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu); unsigned long dummy; + unsigned used = 0; + int graph = 0; + int done = 0; if (!task) task = current; @@ -143,49 +181,61 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, * exceptions */ tinfo = task_thread_info(task); - for (;;) { + while (!done) { + unsigned long *stack_end; + enum stack_type stype; char *id; - unsigned long *estack_end; - estack_end = in_exception_stack(cpu, (unsigned long)stack, - &used, &id); - if (estack_end) { + stype = analyze_stack(cpu, task, stack, &stack_end, + irq_stack, &used, &id); + + /* Default finish unless specified to continue */ + done = 1; + + switch (stype) { + + /* Break out early if we are on the thread stack */ + case STACK_IS_NORMAL: + break; + + case STACK_IS_EXCEPTION: + if (ops->stack(data, id) < 0) break; bp = ops->walk_stack(tinfo, stack, bp, ops, - data, estack_end, &graph); + data, stack_end, &graph); ops->stack(data, "<EOE>"); /* * We link to the next stack via the * second-to-last pointer (index -2 to end) in the * exception stack: */ - stack = (unsigned long *) estack_end[-2]; - continue; - } - if (irq_stack_end) { - unsigned long *irq_stack; - irq_stack = irq_stack_end - - (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack); - - if (in_irq_stack(stack, irq_stack, irq_stack_end)) { - if (ops->stack(data, "IRQ") < 0) - break; - bp = ops->walk_stack(tinfo, stack, bp, - ops, data, irq_stack_end, &graph); - /* - * We link to the next stack (which would be - * the process stack normally) the last - * pointer (index -1 to end) in the IRQ stack: - */ - stack = (unsigned long *) (irq_stack_end[-1]); - irq_stack_end = NULL; - ops->stack(data, "EOI"); - continue; - } + stack = (unsigned long *) stack_end[-2]; + done = 0; + break; + + case STACK_IS_IRQ: + + if (ops->stack(data, "IRQ") < 0) + break; + bp = ops->walk_stack(tinfo, stack, bp, + ops, data, stack_end, &graph); + /* + * We link to the next stack (which would be + * the process stack normally) the last + * pointer (index -1 to end) in the IRQ stack: + */ + stack = (unsigned long *) (stack_end[-1]); + irq_stack = NULL; + ops->stack(data, "EOI"); + done = 0; + break; + + case STACK_IS_UNKNOWN: + ops->stack(data, "UNK"); + break; } - break; } /* diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index bc4a088f902..6cda0baeac9 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -17,6 +17,7 @@ #include <asm/dma.h> #include <asm/io_apic.h> #include <asm/apic.h> +#include <asm/hpet.h> #include <asm/iommu.h> #include <asm/gart.h> #include <asm/irq_remapping.h> @@ -203,18 +204,15 @@ static void __init intel_remapping_check(int num, int slot, int func) revision = read_pci_config_byte(num, slot, func, PCI_REVISION_ID); /* - * Revision 13 of all triggering devices id in this quirk have - * a problem draining interrupts when irq remapping is enabled, - * and should be flagged as broken. Additionally revisions 0x12 - * and 0x22 of device id 0x3405 has this problem. + * Revision <= 13 of all triggering devices id in this quirk + * have a problem draining interrupts when irq remapping is + * enabled, and should be flagged as broken. Additionally + * revision 0x22 of device id 0x3405 has this problem. */ - if (revision == 0x13) + if (revision <= 0x13) set_irq_remapping_broken(); - else if ((device == 0x3405) && - ((revision == 0x12) || - (revision == 0x22))) + else if (device == 0x3405 && revision == 0x22) set_irq_remapping_broken(); - } /* @@ -228,7 +226,7 @@ static void __init intel_remapping_check(int num, int slot, int func) * * And yes, so far on current devices the base addr is always under 4G. */ -static u32 __init intel_stolen_base(int num, int slot, int func) +static u32 __init intel_stolen_base(int num, int slot, int func, size_t stolen_size) { u32 base; @@ -243,10 +241,118 @@ static u32 __init intel_stolen_base(int num, int slot, int func) return base; } -#define KB(x) ((x) * 1024) +#define KB(x) ((x) * 1024UL) #define MB(x) (KB (KB (x))) #define GB(x) (MB (KB (x))) +static size_t __init i830_tseg_size(void) +{ + u8 tmp = read_pci_config_byte(0, 0, 0, I830_ESMRAMC); + + if (!(tmp & TSEG_ENABLE)) + return 0; + + if (tmp & I830_TSEG_SIZE_1M) + return MB(1); + else + return KB(512); +} + +static size_t __init i845_tseg_size(void) +{ + u8 tmp = read_pci_config_byte(0, 0, 0, I845_ESMRAMC); + + if (!(tmp & TSEG_ENABLE)) + return 0; + + switch (tmp & I845_TSEG_SIZE_MASK) { + case I845_TSEG_SIZE_512K: + return KB(512); + case I845_TSEG_SIZE_1M: + return MB(1); + default: + WARN_ON(1); + return 0; + } +} + +static size_t __init i85x_tseg_size(void) +{ + u8 tmp = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC); + + if (!(tmp & TSEG_ENABLE)) + return 0; + + return MB(1); +} + +static size_t __init i830_mem_size(void) +{ + return read_pci_config_byte(0, 0, 0, I830_DRB3) * MB(32); +} + +static size_t __init i85x_mem_size(void) +{ + return read_pci_config_byte(0, 0, 1, I85X_DRB3) * MB(32); +} + +/* + * On 830/845/85x the stolen memory base isn't available in any + * register. We need to calculate it as TOM-TSEG_SIZE-stolen_size. + */ +static u32 __init i830_stolen_base(int num, int slot, int func, size_t stolen_size) +{ + return i830_mem_size() - i830_tseg_size() - stolen_size; +} + +static u32 __init i845_stolen_base(int num, int slot, int func, size_t stolen_size) +{ + return i830_mem_size() - i845_tseg_size() - stolen_size; +} + +static u32 __init i85x_stolen_base(int num, int slot, int func, size_t stolen_size) +{ + return i85x_mem_size() - i85x_tseg_size() - stolen_size; +} + +static u32 __init i865_stolen_base(int num, int slot, int func, size_t stolen_size) +{ + /* + * FIXME is the graphics stolen memory region + * always at TOUD? Ie. is it always the last + * one to be allocated by the BIOS? + */ + return read_pci_config_16(0, 0, 0, I865_TOUD) << 16; +} + +static size_t __init i830_stolen_size(int num, int slot, int func) +{ + size_t stolen_size; + u16 gmch_ctrl; + + gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL); + + switch (gmch_ctrl & I830_GMCH_GMS_MASK) { + case I830_GMCH_GMS_STOLEN_512: + stolen_size = KB(512); + break; + case I830_GMCH_GMS_STOLEN_1024: + stolen_size = MB(1); + break; + case I830_GMCH_GMS_STOLEN_8192: + stolen_size = MB(8); + break; + case I830_GMCH_GMS_LOCAL: + /* local memory isn't part of the normal address space */ + stolen_size = 0; + break; + default: + return 0; + } + + return stolen_size; +} + static size_t __init gen3_stolen_size(int num, int slot, int func) { size_t stolen_size; @@ -313,7 +419,7 @@ static size_t __init gen6_stolen_size(int num, int slot, int func) return gmch_ctrl << 25; /* 32 MB units */ } -static inline size_t gen8_stolen_size(int num, int slot, int func) +static size_t gen8_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; @@ -323,31 +429,74 @@ static inline size_t gen8_stolen_size(int num, int slot, int func) return gmch_ctrl << 25; /* 32 MB units */ } -typedef size_t (*stolen_size_fn)(int num, int slot, int func); + +struct intel_stolen_funcs { + size_t (*size)(int num, int slot, int func); + u32 (*base)(int num, int slot, int func, size_t size); +}; + +static const struct intel_stolen_funcs i830_stolen_funcs = { + .base = i830_stolen_base, + .size = i830_stolen_size, +}; + +static const struct intel_stolen_funcs i845_stolen_funcs = { + .base = i845_stolen_base, + .size = i830_stolen_size, +}; + +static const struct intel_stolen_funcs i85x_stolen_funcs = { + .base = i85x_stolen_base, + .size = gen3_stolen_size, +}; + +static const struct intel_stolen_funcs i865_stolen_funcs = { + .base = i865_stolen_base, + .size = gen3_stolen_size, +}; + +static const struct intel_stolen_funcs gen3_stolen_funcs = { + .base = intel_stolen_base, + .size = gen3_stolen_size, +}; + +static const struct intel_stolen_funcs gen6_stolen_funcs = { + .base = intel_stolen_base, + .size = gen6_stolen_size, +}; + +static const struct intel_stolen_funcs gen8_stolen_funcs = { + .base = intel_stolen_base, + .size = gen8_stolen_size, +}; static struct pci_device_id intel_stolen_ids[] __initdata = { - INTEL_I915G_IDS(gen3_stolen_size), - INTEL_I915GM_IDS(gen3_stolen_size), - INTEL_I945G_IDS(gen3_stolen_size), - INTEL_I945GM_IDS(gen3_stolen_size), - INTEL_VLV_M_IDS(gen6_stolen_size), - INTEL_VLV_D_IDS(gen6_stolen_size), - INTEL_PINEVIEW_IDS(gen3_stolen_size), - INTEL_I965G_IDS(gen3_stolen_size), - INTEL_G33_IDS(gen3_stolen_size), - INTEL_I965GM_IDS(gen3_stolen_size), - INTEL_GM45_IDS(gen3_stolen_size), - INTEL_G45_IDS(gen3_stolen_size), - INTEL_IRONLAKE_D_IDS(gen3_stolen_size), - INTEL_IRONLAKE_M_IDS(gen3_stolen_size), - INTEL_SNB_D_IDS(gen6_stolen_size), - INTEL_SNB_M_IDS(gen6_stolen_size), - INTEL_IVB_M_IDS(gen6_stolen_size), - INTEL_IVB_D_IDS(gen6_stolen_size), - INTEL_HSW_D_IDS(gen6_stolen_size), - INTEL_HSW_M_IDS(gen6_stolen_size), - INTEL_BDW_M_IDS(gen8_stolen_size), - INTEL_BDW_D_IDS(gen8_stolen_size) + INTEL_I830_IDS(&i830_stolen_funcs), + INTEL_I845G_IDS(&i845_stolen_funcs), + INTEL_I85X_IDS(&i85x_stolen_funcs), + INTEL_I865G_IDS(&i865_stolen_funcs), + INTEL_I915G_IDS(&gen3_stolen_funcs), + INTEL_I915GM_IDS(&gen3_stolen_funcs), + INTEL_I945G_IDS(&gen3_stolen_funcs), + INTEL_I945GM_IDS(&gen3_stolen_funcs), + INTEL_VLV_M_IDS(&gen6_stolen_funcs), + INTEL_VLV_D_IDS(&gen6_stolen_funcs), + INTEL_PINEVIEW_IDS(&gen3_stolen_funcs), + INTEL_I965G_IDS(&gen3_stolen_funcs), + INTEL_G33_IDS(&gen3_stolen_funcs), + INTEL_I965GM_IDS(&gen3_stolen_funcs), + INTEL_GM45_IDS(&gen3_stolen_funcs), + INTEL_G45_IDS(&gen3_stolen_funcs), + INTEL_IRONLAKE_D_IDS(&gen3_stolen_funcs), + INTEL_IRONLAKE_M_IDS(&gen3_stolen_funcs), + INTEL_SNB_D_IDS(&gen6_stolen_funcs), + INTEL_SNB_M_IDS(&gen6_stolen_funcs), + INTEL_IVB_M_IDS(&gen6_stolen_funcs), + INTEL_IVB_D_IDS(&gen6_stolen_funcs), + INTEL_HSW_D_IDS(&gen6_stolen_funcs), + INTEL_HSW_M_IDS(&gen6_stolen_funcs), + INTEL_BDW_M_IDS(&gen8_stolen_funcs), + INTEL_BDW_D_IDS(&gen8_stolen_funcs) }; static void __init intel_graphics_stolen(int num, int slot, int func) @@ -364,11 +513,13 @@ static void __init intel_graphics_stolen(int num, int slot, int func) for (i = 0; i < ARRAY_SIZE(intel_stolen_ids); i++) { if (intel_stolen_ids[i].device == device) { - stolen_size_fn stolen_size = - (stolen_size_fn)intel_stolen_ids[i].driver_data; - size = stolen_size(num, slot, func); - start = intel_stolen_base(num, slot, func); + const struct intel_stolen_funcs *stolen_funcs = + (const struct intel_stolen_funcs *)intel_stolen_ids[i].driver_data; + size = stolen_funcs->size(num, slot, func); + start = stolen_funcs->base(num, slot, func, size); if (size && start) { + printk(KERN_INFO "Reserving Intel graphics stolen memory at 0x%x-0x%x\n", + start, start + (u32)size - 1); /* Mark this space as reserved */ e820_add_region(start, size, E820_RESERVED); sanitize_e820_map(e820.map, @@ -380,6 +531,15 @@ static void __init intel_graphics_stolen(int num, int slot, int func) } } +static void __init force_disable_hpet(int num, int slot, int func) +{ +#ifdef CONFIG_HPET_TIMER + boot_hpet_disable = 1; + pr_info("x86/hpet: Will disable the HPET for this platform because it's not reliable\n"); +#endif +} + + #define QFLAG_APPLY_ONCE 0x1 #define QFLAG_APPLIED 0x2 #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) @@ -417,6 +577,12 @@ static struct chipset early_qrk[] __initdata = { PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, QFLAG_APPLY_ONCE, intel_graphics_stolen }, + /* + * HPET on current version of Baytrail platform has accuracy + * problems, disable it for now: + */ + { PCI_VENDOR_ID_INTEL, 0x0f00, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, {} }; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1e96c3628bf..be846d2468f 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -36,7 +36,7 @@ * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack * frame that is otherwise undefined after a SYSCALL * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. - * - errorentry/paranoidentry/zeroentry - Define exception entry points. + * - idtentry - Define exception entry points. */ #include <linux/linkage.h> @@ -1203,125 +1203,100 @@ apicinterrupt IRQ_WORK_VECTOR \ /* * Exception entry points. */ -.macro zeroentry sym do_sym -ENTRY(\sym) - INTR_FRAME - ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 - call error_entry - DEFAULT_FRAME 0 - movq %rsp,%rdi /* pt_regs pointer */ - xorl %esi,%esi /* no error code */ - call \do_sym - jmp error_exit /* %ebx: no swapgs flag */ - CFI_ENDPROC -END(\sym) -.endm +#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) -.macro paranoidzeroentry sym do_sym +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) - INTR_FRAME - ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 - call save_paranoid - TRACE_IRQS_OFF - movq %rsp,%rdi /* pt_regs pointer */ - xorl %esi,%esi /* no error code */ - call \do_sym - jmp paranoid_exit /* %ebx: no swapgs flag */ - CFI_ENDPROC -END(\sym) -.endm + /* Sanity check */ + .if \shift_ist != -1 && \paranoid == 0 + .error "using shift_ist requires paranoid=1" + .endif -#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) -.macro paranoidzeroentry_ist sym do_sym ist -ENTRY(\sym) + .if \has_error_code + XCPT_FRAME + .else INTR_FRAME - ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 - call save_paranoid - TRACE_IRQS_OFF_DEBUG - movq %rsp,%rdi /* pt_regs pointer */ - xorl %esi,%esi /* no error code */ - subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) - call \do_sym - addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) - jmp paranoid_exit /* %ebx: no swapgs flag */ - CFI_ENDPROC -END(\sym) -.endm + .endif -.macro errorentry sym do_sym -ENTRY(\sym) - XCPT_FRAME ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME + + .ifeq \has_error_code + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + .endif + subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 + + .if \paranoid + call save_paranoid + .else call error_entry + .endif + DEFAULT_FRAME 0 + + .if \paranoid + .if \shift_ist != -1 + TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ + .else + TRACE_IRQS_OFF + .endif + .endif + movq %rsp,%rdi /* pt_regs pointer */ + + .if \has_error_code movq ORIG_RAX(%rsp),%rsi /* get error code */ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + .else + xorl %esi,%esi /* no error code */ + .endif + + .if \shift_ist != -1 + subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) + .endif + call \do_sym + + .if \shift_ist != -1 + addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) + .endif + + .if \paranoid + jmp paranoid_exit /* %ebx: no swapgs flag */ + .else jmp error_exit /* %ebx: no swapgs flag */ + .endif + CFI_ENDPROC END(\sym) .endm #ifdef CONFIG_TRACING -.macro trace_errorentry sym do_sym -errorentry trace(\sym) trace(\do_sym) -errorentry \sym \do_sym +.macro trace_idtentry sym do_sym has_error_code:req +idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code +idtentry \sym \do_sym has_error_code=\has_error_code .endm #else -.macro trace_errorentry sym do_sym -errorentry \sym \do_sym +.macro trace_idtentry sym do_sym has_error_code:req +idtentry \sym \do_sym has_error_code=\has_error_code .endm #endif - /* error code is on the stack already */ -.macro paranoiderrorentry sym do_sym -ENTRY(\sym) - XCPT_FRAME - ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 - call save_paranoid - DEFAULT_FRAME 0 - TRACE_IRQS_OFF - movq %rsp,%rdi /* pt_regs pointer */ - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ - call \do_sym - jmp paranoid_exit /* %ebx: no swapgs flag */ - CFI_ENDPROC -END(\sym) -.endm - -zeroentry divide_error do_divide_error -zeroentry overflow do_overflow -zeroentry bounds do_bounds -zeroentry invalid_op do_invalid_op -zeroentry device_not_available do_device_not_available -paranoiderrorentry double_fault do_double_fault -zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun -errorentry invalid_TSS do_invalid_TSS -errorentry segment_not_present do_segment_not_present -zeroentry spurious_interrupt_bug do_spurious_interrupt_bug -zeroentry coprocessor_error do_coprocessor_error -errorentry alignment_check do_alignment_check -zeroentry simd_coprocessor_error do_simd_coprocessor_error +idtentry divide_error do_divide_error has_error_code=0 +idtentry overflow do_overflow has_error_code=0 +idtentry bounds do_bounds has_error_code=0 +idtentry invalid_op do_invalid_op has_error_code=0 +idtentry device_not_available do_device_not_available has_error_code=0 +idtentry double_fault do_double_fault has_error_code=1 paranoid=1 +idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 +idtentry invalid_TSS do_invalid_TSS has_error_code=1 +idtentry segment_not_present do_segment_not_present has_error_code=1 +idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 +idtentry coprocessor_error do_coprocessor_error has_error_code=0 +idtentry alignment_check do_alignment_check has_error_code=1 +idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 /* Reload gs selector with exception handling */ @@ -1371,7 +1346,7 @@ ENTRY(do_softirq_own_stack) END(do_softirq_own_stack) #ifdef CONFIG_XEN -zeroentry xen_hypervisor_callback xen_do_hypervisor_callback +idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 /* * A note on the "critical region" in our callback handler. @@ -1482,21 +1457,21 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ */ .pushsection .kprobes.text, "ax" -paranoidzeroentry_ist debug do_debug DEBUG_STACK -paranoidzeroentry_ist int3 do_int3 DEBUG_STACK -paranoiderrorentry stack_segment do_stack_segment +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry stack_segment do_stack_segment has_error_code=1 paranoid=1 #ifdef CONFIG_XEN -zeroentry xen_debug do_debug -zeroentry xen_int3 do_int3 -errorentry xen_stack_segment do_stack_segment +idtentry xen_debug do_debug has_error_code=0 +idtentry xen_int3 do_int3 has_error_code=0 +idtentry xen_stack_segment do_stack_segment has_error_code=1 #endif -errorentry general_protection do_general_protection -trace_errorentry page_fault do_page_fault +idtentry general_protection do_general_protection has_error_code=1 +trace_idtentry page_fault do_page_fault has_error_code=1 #ifdef CONFIG_KVM_GUEST -errorentry async_page_fault do_async_page_fault +idtentry async_page_fault do_async_page_fault has_error_code=1 #endif #ifdef CONFIG_X86_MCE -paranoidzeroentry machine_check *machine_check_vector(%rip) +idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) #endif /* diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index e6253195a30..52819e816f8 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -308,7 +308,10 @@ static int ftrace_write(unsigned long ip, const char *val, int size) if (within(ip, (unsigned long)_text, (unsigned long)_etext)) ip = (unsigned long)__va(__pa_symbol(ip)); - return probe_kernel_write((void *)ip, val, size); + if (probe_kernel_write((void *)ip, val, size)) + return -EPERM; + + return 0; } static int add_break(unsigned long ip, const char *old) @@ -323,10 +326,7 @@ static int add_break(unsigned long ip, const char *old) if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0) return -EINVAL; - if (ftrace_write(ip, &brk, 1)) - return -EPERM; - - return 0; + return ftrace_write(ip, &brk, 1); } static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr) @@ -425,7 +425,7 @@ static int remove_breakpoint(struct dyn_ftrace *rec) /* If this does not have a breakpoint, we are done */ if (ins[0] != brk) - return -1; + return 0; nop = ftrace_nop_replace(); @@ -455,7 +455,7 @@ static int remove_breakpoint(struct dyn_ftrace *rec) } update: - return probe_kernel_write((void *)ip, &nop[0], 1); + return ftrace_write(ip, nop, 1); } static int add_update_code(unsigned long ip, unsigned const char *new) @@ -463,9 +463,7 @@ static int add_update_code(unsigned long ip, unsigned const char *new) /* skip breakpoint */ ip++; new++; - if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1)) - return -EPERM; - return 0; + return ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1); } static int add_update_call(struct dyn_ftrace *rec, unsigned long addr) @@ -520,10 +518,7 @@ static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr) new = ftrace_call_replace(ip, addr); - if (ftrace_write(ip, new, 1)) - return -EPERM; - - return 0; + return ftrace_write(ip, new, 1); } static int finish_update_nop(struct dyn_ftrace *rec) @@ -533,9 +528,7 @@ static int finish_update_nop(struct dyn_ftrace *rec) new = ftrace_nop_replace(); - if (ftrace_write(ip, new, 1)) - return -EPERM; - return 0; + return ftrace_write(ip, new, 1); } static int finish_update(struct dyn_ftrace *rec, int enable) @@ -632,8 +625,14 @@ void ftrace_replace_code(int enable) printk(KERN_WARNING "Failed on %s (%d):\n", report, count); for_ftrace_rec_iter(iter) { rec = ftrace_rec_iter_record(iter); - remove_breakpoint(rec); + /* + * Breakpoints are handled only when this function is in + * progress. The system could not work with them. + */ + if (remove_breakpoint(rec)) + BUG(); } + run_sync(); } static int @@ -655,16 +654,19 @@ ftrace_modify_code(unsigned long ip, unsigned const char *old_code, run_sync(); ret = ftrace_write(ip, new_code, 1); - if (ret) { - ret = -EPERM; - goto out; - } - run_sync(); + /* + * The breakpoint is handled only when this function is in progress. + * The system could not work if we could not remove it. + */ + BUG_ON(ret); out: + run_sync(); return ret; fail_update: - probe_kernel_write((void *)ip, &old_code[0], 1); + /* Also here the system could not work with the breakpoint */ + if (ftrace_write(ip, old_code, 1)) + BUG(); goto out; } @@ -678,11 +680,8 @@ void arch_ftrace_update_code(int command) atomic_dec(&modifying_ftrace_code); } -int __init ftrace_dyn_arch_init(void *data) +int __init ftrace_dyn_arch_init(void) { - /* The return code is retured via data */ - *(unsigned long *)data = 0; - return 0; } #endif diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index c61a14a4a31..d6c1b983699 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -29,7 +29,7 @@ static void __init i386_default_early_setup(void) reserve_ebda_region(); } -asmlinkage void __init i386_start_kernel(void) +asmlinkage __visible void __init i386_start_kernel(void) { sanitize_boot_params(&boot_params); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 85126ccbdf6..068054f4bf2 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -137,7 +137,7 @@ static void __init copy_bootdata(char *real_mode_data) } } -asmlinkage void __init x86_64_start_kernel(char * real_mode_data) +asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) { int i; diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index da85a8e830a..4177bfbc80b 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -88,7 +88,7 @@ static inline void hpet_clear_mapping(void) /* * HPET command line enable / disable */ -static int boot_hpet_disable; +int boot_hpet_disable; int hpet_force_user; static int hpet_verbose; @@ -521,7 +521,7 @@ static int hpet_setup_irq(struct hpet_dev *dev) { if (request_irq(dev->irq, hpet_interrupt_handler, - IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, + IRQF_TIMER | IRQF_NOBALANCING, dev->name, dev)) return -1; @@ -699,7 +699,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n, /* FIXME: add schedule_work_on() */ schedule_delayed_work_on(cpu, &work.work, 0); wait_for_completion(&work.complete); - destroy_timer_on_stack(&work.work.timer); + destroy_delayed_work_on_stack(&work.work); break; case CPU_DEAD: if (hdev) { @@ -752,9 +752,7 @@ static struct clocksource clocksource_hpet = { .mask = HPET_MASK, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = hpet_resume_counter, -#ifdef CONFIG_X86_64 .archdata = { .vclock_mode = VCLOCK_HPET }, -#endif }; static int hpet_clocksource_register(void) @@ -943,12 +941,14 @@ static __init int hpet_late_init(void) if (boot_cpu_has(X86_FEATURE_ARAT)) return 0; + cpu_notifier_register_begin(); for_each_online_cpu(cpu) { hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); } /* This notifier should be called after workqueue is ready */ - hotcpu_notifier(hpet_cpuhp_notify, -20); + __hotcpu_notifier(hpet_cpuhp_notify, -20); + cpu_notifier_register_done(); return 0; } diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c index c3aae667284..d30acdc1229 100644 --- a/arch/x86/kernel/iosf_mbi.c +++ b/arch/x86/kernel/iosf_mbi.c @@ -25,6 +25,9 @@ #include <asm/iosf_mbi.h> +#define PCI_DEVICE_ID_BAYTRAIL 0x0F00 +#define PCI_DEVICE_ID_QUARK_X1000 0x0958 + static DEFINE_SPINLOCK(iosf_mbi_lock); static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset) @@ -177,6 +180,13 @@ int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask) } EXPORT_SYMBOL(iosf_mbi_modify); +bool iosf_mbi_available(void) +{ + /* Mbi isn't hot-pluggable. No remove routine is provided */ + return mbi_pdev; +} +EXPORT_SYMBOL(iosf_mbi_available); + static int iosf_mbi_probe(struct pci_dev *pdev, const struct pci_device_id *unused) { @@ -193,7 +203,8 @@ static int iosf_mbi_probe(struct pci_dev *pdev, } static DEFINE_PCI_DEVICE_TABLE(iosf_mbi_pci_ids) = { - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0F00) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BAYTRAIL) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_QUARK_X1000) }, { 0, }, }; MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index d99f31d9a75..283a76a9cc4 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -125,6 +125,12 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); seq_printf(p, " Machine check polls\n"); #endif +#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) + seq_printf(p, "%*s: ", prec, "THR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count); + seq_printf(p, " Hypervisor callback interrupts\n"); +#endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index d7fcbedc9c4..63ce838e5a5 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -55,16 +55,8 @@ static inline int check_stack_overflow(void) { return 0; } static inline void print_stack_overflow(void) { } #endif -/* - * per-CPU IRQ handling contexts (thread information and stack) - */ -union irq_ctx { - struct thread_info tinfo; - u32 stack[THREAD_SIZE/sizeof(u32)]; -} __attribute__((aligned(THREAD_SIZE))); - -static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); -static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); +DEFINE_PER_CPU(struct irq_stack *, hardirq_stack); +DEFINE_PER_CPU(struct irq_stack *, softirq_stack); static void call_on_stack(void *func, void *stack) { @@ -77,14 +69,26 @@ static void call_on_stack(void *func, void *stack) : "memory", "cc", "edx", "ecx", "eax"); } +/* how to get the current stack pointer from C */ +#define current_stack_pointer ({ \ + unsigned long sp; \ + asm("mov %%esp,%0" : "=g" (sp)); \ + sp; \ +}) + +static inline void *current_stack(void) +{ + return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); +} + static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { - union irq_ctx *curctx, *irqctx; - u32 *isp, arg1, arg2; + struct irq_stack *curstk, *irqstk; + u32 *isp, *prev_esp, arg1, arg2; - curctx = (union irq_ctx *) current_thread_info(); - irqctx = __this_cpu_read(hardirq_ctx); + curstk = (struct irq_stack *) current_stack(); + irqstk = __this_cpu_read(hardirq_stack); /* * this is where we switch to the IRQ stack. However, if we are @@ -92,13 +96,14 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) * handler) we can't do that and just have to keep using the * current stack (which is the irq stack already after all) */ - if (unlikely(curctx == irqctx)) + if (unlikely(curstk == irqstk)) return 0; - /* build the stack frame on the IRQ stack */ - isp = (u32 *) ((char *)irqctx + sizeof(*irqctx)); - irqctx->tinfo.task = curctx->tinfo.task; - irqctx->tinfo.previous_esp = current_stack_pointer; + isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); + + /* Save the next esp at the bottom of the stack */ + prev_esp = (u32 *)irqstk; + *prev_esp = current_stack_pointer; if (unlikely(overflow)) call_on_stack(print_stack_overflow, isp); @@ -118,46 +123,40 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) */ void irq_ctx_init(int cpu) { - union irq_ctx *irqctx; + struct irq_stack *irqstk; - if (per_cpu(hardirq_ctx, cpu)) + if (per_cpu(hardirq_stack, cpu)) return; - irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), + irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), THREADINFO_GFP, THREAD_SIZE_ORDER)); - memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); - irqctx->tinfo.cpu = cpu; - irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); - - per_cpu(hardirq_ctx, cpu) = irqctx; + per_cpu(hardirq_stack, cpu) = irqstk; - irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), + irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), THREADINFO_GFP, THREAD_SIZE_ORDER)); - memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); - irqctx->tinfo.cpu = cpu; - irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); - - per_cpu(softirq_ctx, cpu) = irqctx; + per_cpu(softirq_stack, cpu) = irqstk; printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", - cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); + cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu)); } void do_softirq_own_stack(void) { - struct thread_info *curctx; - union irq_ctx *irqctx; - u32 *isp; + struct thread_info *curstk; + struct irq_stack *irqstk; + u32 *isp, *prev_esp; - curctx = current_thread_info(); - irqctx = __this_cpu_read(softirq_ctx); - irqctx->tinfo.task = curctx->task; - irqctx->tinfo.previous_esp = current_stack_pointer; + curstk = current_stack(); + irqstk = __this_cpu_read(softirq_stack); /* build the stack frame on the softirq stack */ - isp = (u32 *) ((char *)irqctx + sizeof(*irqctx)); + isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); + + /* Push the previous esp onto the stack */ + prev_esp = (u32 *)irqstk; + *prev_esp = current_stack_pointer; call_on_stack(__do_softirq, isp); } diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 79a3f968287..61b17dc2c27 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -897,9 +897,10 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) struct kprobe *cur = kprobe_running(); struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - switch (kcb->kprobe_status) { - case KPROBE_HIT_SS: - case KPROBE_REENTER: + if (unlikely(regs->ip == (unsigned long)cur->ainsn.insn)) { + /* This must happen on single-stepping */ + WARN_ON(kcb->kprobe_status != KPROBE_HIT_SS && + kcb->kprobe_status != KPROBE_REENTER); /* * We are here because the instruction being single * stepped caused a page fault. We reset the current @@ -914,9 +915,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) else reset_current_kprobe(); preempt_enable_no_resched(); - break; - case KPROBE_HIT_ACTIVE: - case KPROBE_HIT_SSDONE: + } else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE || + kcb->kprobe_status == KPROBE_HIT_SSDONE) { /* * We increment the nmissed count for accounting, * we can also use npre/npostfault count for accounting @@ -945,10 +945,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) * fixup routine could not handle it, * Let do_page_fault() fix it. */ - break; - default: - break; } + return 0; } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 713f1b3bad5..0331cb389d6 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -417,7 +417,6 @@ void kvm_disable_steal_time(void) #ifdef CONFIG_SMP static void __init kvm_smp_prepare_boot_cpu(void) { - WARN_ON(kvm_register_clock("primary cpu clock")); kvm_guest_cpu_init(); native_smp_prepare_boot_cpu(); kvm_spinlock_init(); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index e6041094ff2..d9156ceecdf 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -242,7 +242,7 @@ void __init kvmclock_init(void) hv_clock = __va(mem); memset(hv_clock, 0, size); - if (kvm_register_clock("boot clock")) { + if (kvm_register_clock("primary cpu clock")) { hv_clock = NULL; memblock_free(mem, size); return; diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index ebc98739892..dcbbaa165bd 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -20,6 +20,8 @@ #include <asm/mmu_context.h> #include <asm/syscalls.h> +int sysctl_ldt16 = 0; + #ifdef CONFIG_SMP static void flush_ldt(void *current_mm) { @@ -229,6 +231,17 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) } } + /* + * On x86-64 we do not support 16-bit segments due to + * IRET leaking the high bits of the kernel stack address. + */ +#ifdef CONFIG_X86_64 + if (!ldt_info.seg_32bit && !sysctl_ldt16) { + error = -EINVAL; + goto out_unlock; + } +#endif + fill_ldt(&ldt, &ldt_info); if (oldmode) ldt.avl = 0; diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 05266b5aae2..c9603ac80de 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -259,14 +259,15 @@ static int __init msr_init(void) goto out_chrdev; } msr_class->devnode = msr_devnode; - get_online_cpus(); + + cpu_notifier_register_begin(); for_each_online_cpu(i) { err = msr_device_create(i); if (err != 0) goto out_class; } - register_hotcpu_notifier(&msr_class_cpu_notifier); - put_online_cpus(); + __register_hotcpu_notifier(&msr_class_cpu_notifier); + cpu_notifier_register_done(); err = 0; goto out; @@ -275,7 +276,7 @@ out_class: i = 0; for_each_online_cpu(i) msr_device_destroy(i); - put_online_cpus(); + cpu_notifier_register_done(); class_destroy(msr_class); out_chrdev: __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); @@ -286,13 +287,14 @@ out: static void __exit msr_exit(void) { int cpu = 0; - get_online_cpus(); + + cpu_notifier_register_begin(); for_each_online_cpu(cpu) msr_device_destroy(cpu); class_destroy(msr_class); __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); - unregister_hotcpu_notifier(&msr_class_cpu_notifier); - put_online_cpus(); + __unregister_hotcpu_notifier(&msr_class_cpu_notifier); + cpu_notifier_register_done(); } module_init(msr_init); diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 299d49302e7..0497f719977 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1207,23 +1207,31 @@ error: return ret; } -static inline int __init determine_tce_table_size(u64 ram) +static inline int __init determine_tce_table_size(void) { int ret; if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED) return specified_table_size; - /* - * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to - * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each - * larger table size has twice as many entries, so shift the - * max ram address by 13 to divide by 8K and then look at the - * order of the result to choose between 0-7. - */ - ret = get_order(ram >> 13); - if (ret > TCE_TABLE_SIZE_8M) + if (is_kdump_kernel() && saved_max_pfn) { + /* + * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to + * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each + * larger table size has twice as many entries, so shift the + * max ram address by 13 to divide by 8K and then look at the + * order of the result to choose between 0-7. + */ + ret = get_order((saved_max_pfn * PAGE_SIZE) >> 13); + if (ret > TCE_TABLE_SIZE_8M) + ret = TCE_TABLE_SIZE_8M; + } else { + /* + * Use 8M by default (suggested by Muli) if it's not + * kdump kernel and saved_max_pfn isn't set. + */ ret = TCE_TABLE_SIZE_8M; + } return ret; } @@ -1418,8 +1426,7 @@ int __init detect_calgary(void) return -ENOMEM; } - specified_table_size = determine_tce_table_size((is_kdump_kernel() ? - saved_max_pfn : max_pfn) * PAGE_SIZE); + specified_table_size = determine_tce_table_size(); for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { struct calgary_bus_info *info = &bus_info[bus]; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0de43e98ce0..7bc86bbe748 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -314,6 +314,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ arch_end_context_switch(next_p); + this_cpu_write(kernel_stack, + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE - KERNEL_STACK_OFFSET); + /* * Restore %gs if needed (which is common) */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9c0280f93d0..898d077617a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,7 +52,7 @@ asmlinkage extern void ret_from_fork(void); -asmlinkage DEFINE_PER_CPU(unsigned long, old_rsp); +__visible DEFINE_PER_CPU(unsigned long, old_rsp); /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs *regs, int all) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 7461f50d5bb..678c0ada3b3 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -184,14 +184,14 @@ unsigned long kernel_stack_pointer(struct pt_regs *regs) { unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1); unsigned long sp = (unsigned long)®s->sp; - struct thread_info *tinfo; + u32 *prev_esp; if (context == (sp & ~(THREAD_SIZE - 1))) return sp; - tinfo = (struct thread_info *)context; - if (tinfo->previous_esp) - return tinfo->previous_esp; + prev_esp = (u32 *)(context); + if (prev_esp) + return (unsigned long)prev_esp; return (unsigned long)regs; } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index c752cb43e52..52b1157c53e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -114,8 +114,8 @@ EXPORT_SYMBOL(machine_real_restart); */ static int __init set_pci_reboot(const struct dmi_system_id *d) { - if (reboot_type != BOOT_CF9) { - reboot_type = BOOT_CF9; + if (reboot_type != BOOT_CF9_FORCE) { + reboot_type = BOOT_CF9_FORCE; pr_info("%s series board detected. Selecting %s-method for reboots.\n", d->ident, "PCI"); } @@ -191,6 +191,16 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { }, }, + /* Certec */ + { /* Handle problems with rebooting on Certec BPC600 */ + .callback = set_pci_reboot, + .ident = "Certec BPC600", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Certec"), + DMI_MATCH(DMI_PRODUCT_NAME, "BPC600"), + }, + }, + /* Dell */ { /* Handle problems with rebooting on Dell DXP061 */ .callback = set_bios_reboot, @@ -458,17 +468,23 @@ void __attribute__((weak)) mach_reboot_fixups(void) } /* - * Windows compatible x86 hardware expects the following on reboot: + * To the best of our knowledge Windows compatible x86 hardware expects + * the following on reboot: * * 1) If the FADT has the ACPI reboot register flag set, try it * 2) If still alive, write to the keyboard controller * 3) If still alive, write to the ACPI reboot register again * 4) If still alive, write to the keyboard controller again + * 5) If still alive, call the EFI runtime service to reboot + * 6) If no EFI runtime service, call the BIOS to do a reboot + * + * We default to following the same pattern. We also have + * two other reboot methods: 'triple fault' and 'PCI', which + * can be triggered via the reboot= kernel boot option or + * via quirks. * - * If the machine is still alive at this stage, it gives up. We default to - * following the same pattern, except that if we're still alive after (4) we'll - * try to force a triple fault and then cycle between hitting the keyboard - * controller and doing that + * This means that this function can never return, it can misbehave + * by not rebooting properly and hanging. */ static void native_machine_emergency_restart(void) { @@ -489,6 +505,11 @@ static void native_machine_emergency_restart(void) for (;;) { /* Could also try the reset bit in the Hammer NB */ switch (reboot_type) { + case BOOT_ACPI: + acpi_reboot(); + reboot_type = BOOT_KBD; + break; + case BOOT_KBD: mach_reboot_fixups(); /* For board specific fixups */ @@ -502,45 +523,33 @@ static void native_machine_emergency_restart(void) attempt = 1; reboot_type = BOOT_ACPI; } else { - reboot_type = BOOT_TRIPLE; + reboot_type = BOOT_EFI; } break; - case BOOT_TRIPLE: - load_idt(&no_idt); - __asm__ __volatile__("int3"); - - reboot_type = BOOT_KBD; - break; - - case BOOT_BIOS: - machine_real_restart(MRR_BIOS); - - reboot_type = BOOT_KBD; - break; - - case BOOT_ACPI: - acpi_reboot(); - reboot_type = BOOT_KBD; - break; - case BOOT_EFI: if (efi_enabled(EFI_RUNTIME_SERVICES)) efi.reset_system(reboot_mode == REBOOT_WARM ? EFI_RESET_WARM : EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); - reboot_type = BOOT_KBD; + reboot_type = BOOT_BIOS; + break; + + case BOOT_BIOS: + machine_real_restart(MRR_BIOS); + + /* We're probably dead after this, but... */ + reboot_type = BOOT_CF9_SAFE; break; - case BOOT_CF9: + case BOOT_CF9_FORCE: port_cf9_safe = true; /* Fall through */ - case BOOT_CF9_COND: + case BOOT_CF9_SAFE: if (port_cf9_safe) { - u8 reboot_code = reboot_mode == REBOOT_WARM ? - 0x06 : 0x0E; + u8 reboot_code = reboot_mode == REBOOT_WARM ? 0x06 : 0x0E; u8 cf9 = inb(0xcf9) & ~reboot_code; outb(cf9|2, 0xcf9); /* Request hard reset */ udelay(50); @@ -548,6 +557,14 @@ static void native_machine_emergency_restart(void) outb(cf9|reboot_code, 0xcf9); udelay(50); } + reboot_type = BOOT_TRIPLE; + break; + + case BOOT_TRIPLE: + load_idt(&no_idt); + __asm__ __volatile__("int3"); + + /* We're probably dead after this, but... */ reboot_type = BOOT_KBD; break; } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index fa511acff7e..09c76d26555 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -869,7 +869,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); - visws_early_detect(); /* * copy kernel address range established so far and switch diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 7c3a5a61f2e..be8e1bde07a 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -168,7 +168,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) * this function calls the 'stop' function on all other CPUs in the system. */ -asmlinkage void smp_reboot_interrupt(void) +asmlinkage __visible void smp_reboot_interrupt(void) { ack_APIC_irq(); irq_enter(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 60179ec39d4..34826934d4a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -766,10 +766,10 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) #else clear_tsk_thread_flag(idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); +#endif per_cpu(kernel_stack, cpu) = (unsigned long)task_stack_page(idle) - KERNEL_STACK_OFFSET + THREAD_SIZE; -#endif early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); initial_code = (unsigned long)start_secondary; stack_start = idle->thread.sp; @@ -1387,7 +1387,7 @@ static inline void mwait_play_dead(void) if (!this_cpu_has(X86_FEATURE_MWAIT)) return; - if (!this_cpu_has(X86_FEATURE_CLFLSH)) + if (!this_cpu_has(X86_FEATURE_CLFLUSH)) return; if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF) return; diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 6ec91c00d84..bf7ef5ce29d 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -62,7 +62,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) static struct irqaction irq0 = { .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, + .flags = IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, .name = "timer" }; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 57409f6b8c6..f73b5d435bd 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -357,7 +357,7 @@ exit: * for scheduling or signal handling. The actual stack switch is done in * entry.S */ -asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) +asmlinkage __visible __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = eregs; /* Did already sync */ @@ -601,11 +601,11 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) #endif } -asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) +asmlinkage __visible void __attribute__((weak)) smp_thermal_interrupt(void) { } -asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) +asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void) { } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index cfbe99f8883..57e5ce126d5 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -914,8 +914,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, tsc_khz_ref = tsc_khz; } if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) { *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); @@ -985,9 +984,7 @@ static struct clocksource clocksource_tsc = { .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, -#ifdef CONFIG_X86_64 .archdata = { .vclock_mode = VCLOCK_TSC }, -#endif }; void mark_tsc_unstable(char *reason) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 2ed845928b5..ace22916ade 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -53,7 +53,7 @@ #define OPCODE1(insn) ((insn)->opcode.bytes[0]) #define OPCODE2(insn) ((insn)->opcode.bytes[1]) #define OPCODE3(insn) ((insn)->opcode.bytes[2]) -#define MODRM_REG(insn) X86_MODRM_REG(insn->modrm.value) +#define MODRM_REG(insn) X86_MODRM_REG((insn)->modrm.value) #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ @@ -229,63 +229,6 @@ static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn) return -ENOTSUPP; } -/* - * Figure out which fixups arch_uprobe_post_xol() will need to perform, and - * annotate arch_uprobe->fixups accordingly. To start with, - * arch_uprobe->fixups is either zero or it reflects rip-related fixups. - */ -static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn) -{ - bool fix_ip = true, fix_call = false; /* defaults */ - int reg; - - insn_get_opcode(insn); /* should be a nop */ - - switch (OPCODE1(insn)) { - case 0x9d: - /* popf */ - auprobe->fixups |= UPROBE_FIX_SETF; - break; - case 0xc3: /* ret/lret */ - case 0xcb: - case 0xc2: - case 0xca: - /* ip is correct */ - fix_ip = false; - break; - case 0xe8: /* call relative - Fix return addr */ - fix_call = true; - break; - case 0x9a: /* call absolute - Fix return addr, not ip */ - fix_call = true; - fix_ip = false; - break; - case 0xff: - insn_get_modrm(insn); - reg = MODRM_REG(insn); - if (reg == 2 || reg == 3) { - /* call or lcall, indirect */ - /* Fix return addr; ip is correct. */ - fix_call = true; - fix_ip = false; - } else if (reg == 4 || reg == 5) { - /* jmp or ljmp, indirect */ - /* ip is correct. */ - fix_ip = false; - } - break; - case 0xea: /* jmp absolute -- ip is correct */ - fix_ip = false; - break; - default: - break; - } - if (fix_ip) - auprobe->fixups |= UPROBE_FIX_IP; - if (fix_call) - auprobe->fixups |= UPROBE_FIX_CALL; -} - #ifdef CONFIG_X86_64 /* * If arch_uprobe->insn doesn't use rip-relative addressing, return @@ -310,15 +253,11 @@ static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn) * - The displacement is always 4 bytes. */ static void -handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn) { u8 *cursor; u8 reg; - if (mm->context.ia32_compat) - return; - - auprobe->rip_rela_target_address = 0x0; if (!insn_rip_relative(insn)) return; @@ -372,7 +311,48 @@ handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct ins cursor++; memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes); } - return; +} + +/* + * If we're emulating a rip-relative instruction, save the contents + * of the scratch register and store the target address in that register. + */ +static void +pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, + struct arch_uprobe_task *autask) +{ + if (auprobe->fixups & UPROBE_FIX_RIP_AX) { + autask->saved_scratch_register = regs->ax; + regs->ax = current->utask->vaddr; + regs->ax += auprobe->rip_rela_target_address; + } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) { + autask->saved_scratch_register = regs->cx; + regs->cx = current->utask->vaddr; + regs->cx += auprobe->rip_rela_target_address; + } +} + +static void +handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) +{ + if (auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) { + struct arch_uprobe_task *autask; + + autask = ¤t->utask->autask; + if (auprobe->fixups & UPROBE_FIX_RIP_AX) + regs->ax = autask->saved_scratch_register; + else + regs->cx = autask->saved_scratch_register; + + /* + * The original instruction includes a displacement, and so + * is 4 bytes longer than what we've just single-stepped. + * Caller may need to apply other fixups to handle stuff + * like "jmpq *...(%rip)" and "callq *...(%rip)". + */ + if (correction) + *correction += 4; + } } static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn) @@ -401,9 +381,19 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, return validate_insn_64bits(auprobe, insn); } #else /* 32-bit: */ -static void handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +/* + * No RIP-relative addressing on 32-bit + */ +static void handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn) +{ +} +static void pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, + struct arch_uprobe_task *autask) +{ +} +static void handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, + long *correction) { - /* No RIP-relative addressing on 32-bit */ } static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) @@ -412,141 +402,311 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, } #endif /* CONFIG_X86_64 */ -/** - * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. - * @mm: the probed address space. - * @arch_uprobe: the probepoint information. - * @addr: virtual address at which to install the probepoint - * Return 0 on success or a -ve number on error. +struct uprobe_xol_ops { + bool (*emulate)(struct arch_uprobe *, struct pt_regs *); + int (*pre_xol)(struct arch_uprobe *, struct pt_regs *); + int (*post_xol)(struct arch_uprobe *, struct pt_regs *); +}; + +static inline int sizeof_long(void) +{ + return is_ia32_task() ? 4 : 8; +} + +static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + pre_xol_rip_insn(auprobe, regs, ¤t->utask->autask); + return 0; +} + +/* + * Adjust the return address pushed by a call insn executed out of line. */ -int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) +static int adjust_ret_addr(unsigned long sp, long correction) { - int ret; - struct insn insn; + int rasize = sizeof_long(); + long ra; - auprobe->fixups = 0; - ret = validate_insn_bits(auprobe, mm, &insn); - if (ret != 0) - return ret; + if (copy_from_user(&ra, (void __user *)sp, rasize)) + return -EFAULT; - handle_riprel_insn(auprobe, mm, &insn); - prepare_fixups(auprobe, &insn); + ra += correction; + if (copy_to_user((void __user *)sp, &ra, rasize)) + return -EFAULT; return 0; } -#ifdef CONFIG_X86_64 -/* - * If we're emulating a rip-relative instruction, save the contents - * of the scratch register and store the target address in that register. - */ -static void -pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, - struct arch_uprobe_task *autask) +static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { - if (auprobe->fixups & UPROBE_FIX_RIP_AX) { - autask->saved_scratch_register = regs->ax; - regs->ax = current->utask->vaddr; - regs->ax += auprobe->rip_rela_target_address; - } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) { - autask->saved_scratch_register = regs->cx; - regs->cx = current->utask->vaddr; - regs->cx += auprobe->rip_rela_target_address; + struct uprobe_task *utask = current->utask; + long correction = (long)(utask->vaddr - utask->xol_vaddr); + + handle_riprel_post_xol(auprobe, regs, &correction); + if (auprobe->fixups & UPROBE_FIX_IP) + regs->ip += correction; + + if (auprobe->fixups & UPROBE_FIX_CALL) { + if (adjust_ret_addr(regs->sp, correction)) { + regs->sp += sizeof_long(); + return -ERESTART; + } } + + return 0; } -#else -static void -pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, - struct arch_uprobe_task *autask) + +static struct uprobe_xol_ops default_xol_ops = { + .pre_xol = default_pre_xol_op, + .post_xol = default_post_xol_op, +}; + +static bool branch_is_call(struct arch_uprobe *auprobe) { - /* No RIP-relative addressing on 32-bit */ + return auprobe->branch.opc1 == 0xe8; } -#endif -/* - * arch_uprobe_pre_xol - prepare to execute out of line. - * @auprobe: the probepoint information. - * @regs: reflects the saved user state of current task. - */ -int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) -{ - struct arch_uprobe_task *autask; +#define CASE_COND \ + COND(70, 71, XF(OF)) \ + COND(72, 73, XF(CF)) \ + COND(74, 75, XF(ZF)) \ + COND(78, 79, XF(SF)) \ + COND(7a, 7b, XF(PF)) \ + COND(76, 77, XF(CF) || XF(ZF)) \ + COND(7c, 7d, XF(SF) != XF(OF)) \ + COND(7e, 7f, XF(ZF) || XF(SF) != XF(OF)) - autask = ¤t->utask->autask; - autask->saved_trap_nr = current->thread.trap_nr; - current->thread.trap_nr = UPROBE_TRAP_NR; - regs->ip = current->utask->xol_vaddr; - pre_xol_rip_insn(auprobe, regs, autask); +#define COND(op_y, op_n, expr) \ + case 0x ## op_y: DO((expr) != 0) \ + case 0x ## op_n: DO((expr) == 0) - autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF); - regs->flags |= X86_EFLAGS_TF; - if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) - set_task_blockstep(current, false); +#define XF(xf) (!!(flags & X86_EFLAGS_ ## xf)) - return 0; +static bool is_cond_jmp_opcode(u8 opcode) +{ + switch (opcode) { + #define DO(expr) \ + return true; + CASE_COND + #undef DO + + default: + return false; + } } -/* - * This function is called by arch_uprobe_post_xol() to adjust the return - * address pushed by a call instruction executed out of line. - */ -static int adjust_ret_addr(unsigned long sp, long correction) +static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs) { - int rasize, ncopied; - long ra = 0; + unsigned long flags = regs->flags; - if (is_ia32_task()) - rasize = 4; - else - rasize = 8; + switch (auprobe->branch.opc1) { + #define DO(expr) \ + return expr; + CASE_COND + #undef DO - ncopied = copy_from_user(&ra, (void __user *)sp, rasize); - if (unlikely(ncopied)) - return -EFAULT; + default: /* not a conditional jmp */ + return true; + } +} - ra += correction; - ncopied = copy_to_user((void __user *)sp, &ra, rasize); - if (unlikely(ncopied)) - return -EFAULT; +#undef XF +#undef COND +#undef CASE_COND - return 0; +static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + unsigned long new_ip = regs->ip += auprobe->branch.ilen; + unsigned long offs = (long)auprobe->branch.offs; + + if (branch_is_call(auprobe)) { + unsigned long new_sp = regs->sp - sizeof_long(); + /* + * If it fails we execute this (mangled, see the comment in + * branch_clear_offset) insn out-of-line. In the likely case + * this should trigger the trap, and the probed application + * should die or restart the same insn after it handles the + * signal, arch_uprobe_post_xol() won't be even called. + * + * But there is corner case, see the comment in ->post_xol(). + */ + if (copy_to_user((void __user *)new_sp, &new_ip, sizeof_long())) + return false; + regs->sp = new_sp; + } else if (!check_jmp_cond(auprobe, regs)) { + offs = 0; + } + + regs->ip = new_ip + offs; + return true; } -#ifdef CONFIG_X86_64 -static bool is_riprel_insn(struct arch_uprobe *auprobe) +static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { - return ((auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) != 0); + BUG_ON(!branch_is_call(auprobe)); + /* + * We can only get here if branch_emulate_op() failed to push the ret + * address _and_ another thread expanded our stack before the (mangled) + * "call" insn was executed out-of-line. Just restore ->sp and restart. + * We could also restore ->ip and try to call branch_emulate_op() again. + */ + regs->sp += sizeof_long(); + return -ERESTART; } -static void -handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) +static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn) { - if (is_riprel_insn(auprobe)) { - struct arch_uprobe_task *autask; + /* + * Turn this insn into "call 1f; 1:", this is what we will execute + * out-of-line if ->emulate() fails. We only need this to generate + * a trap, so that the probed task receives the correct signal with + * the properly filled siginfo. + * + * But see the comment in ->post_xol(), in the unlikely case it can + * succeed. So we need to ensure that the new ->ip can not fall into + * the non-canonical area and trigger #GP. + * + * We could turn it into (say) "pushf", but then we would need to + * divorce ->insn[] and ->ixol[]. We need to preserve the 1st byte + * of ->insn[] for set_orig_insn(). + */ + memset(auprobe->insn + insn_offset_immediate(insn), + 0, insn->immediate.nbytes); +} - autask = ¤t->utask->autask; - if (auprobe->fixups & UPROBE_FIX_RIP_AX) - regs->ax = autask->saved_scratch_register; - else - regs->cx = autask->saved_scratch_register; +static struct uprobe_xol_ops branch_xol_ops = { + .emulate = branch_emulate_op, + .post_xol = branch_post_xol_op, +}; + +/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */ +static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) +{ + u8 opc1 = OPCODE1(insn); + + /* has the side-effect of processing the entire instruction */ + insn_get_length(insn); + if (WARN_ON_ONCE(!insn_complete(insn))) + return -ENOEXEC; + + switch (opc1) { + case 0xeb: /* jmp 8 */ + case 0xe9: /* jmp 32 */ + case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ + break; + + case 0xe8: /* call relative */ + branch_clear_offset(auprobe, insn); + break; + case 0x0f: + if (insn->opcode.nbytes != 2) + return -ENOSYS; /* - * The original instruction includes a displacement, and so - * is 4 bytes longer than what we've just single-stepped. - * Fall through to handle stuff like "jmpq *...(%rip)" and - * "callq *...(%rip)". + * If it is a "near" conditional jmp, OPCODE2() - 0x10 matches + * OPCODE1() of the "short" jmp which checks the same condition. */ - if (correction) - *correction += 4; + opc1 = OPCODE2(insn) - 0x10; + default: + if (!is_cond_jmp_opcode(opc1)) + return -ENOSYS; } + + auprobe->branch.opc1 = opc1; + auprobe->branch.ilen = insn->length; + auprobe->branch.offs = insn->immediate.value; + + auprobe->ops = &branch_xol_ops; + return 0; } -#else -static void -handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) + +/** + * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. + * @mm: the probed address space. + * @arch_uprobe: the probepoint information. + * @addr: virtual address at which to install the probepoint + * Return 0 on success or a -ve number on error. + */ +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) +{ + struct insn insn; + bool fix_ip = true, fix_call = false; + int ret; + + ret = validate_insn_bits(auprobe, mm, &insn); + if (ret) + return ret; + + ret = branch_setup_xol_ops(auprobe, &insn); + if (ret != -ENOSYS) + return ret; + + /* + * Figure out which fixups arch_uprobe_post_xol() will need to perform, + * and annotate arch_uprobe->fixups accordingly. To start with, ->fixups + * is either zero or it reflects rip-related fixups. + */ + switch (OPCODE1(&insn)) { + case 0x9d: /* popf */ + auprobe->fixups |= UPROBE_FIX_SETF; + break; + case 0xc3: /* ret or lret -- ip is correct */ + case 0xcb: + case 0xc2: + case 0xca: + fix_ip = false; + break; + case 0x9a: /* call absolute - Fix return addr, not ip */ + fix_call = true; + fix_ip = false; + break; + case 0xea: /* jmp absolute -- ip is correct */ + fix_ip = false; + break; + case 0xff: + insn_get_modrm(&insn); + switch (MODRM_REG(&insn)) { + case 2: case 3: /* call or lcall, indirect */ + fix_call = true; + case 4: case 5: /* jmp or ljmp, indirect */ + fix_ip = false; + } + /* fall through */ + default: + handle_riprel_insn(auprobe, &insn); + } + + if (fix_ip) + auprobe->fixups |= UPROBE_FIX_IP; + if (fix_call) + auprobe->fixups |= UPROBE_FIX_CALL; + + auprobe->ops = &default_xol_ops; + return 0; +} + +/* + * arch_uprobe_pre_xol - prepare to execute out of line. + * @auprobe: the probepoint information. + * @regs: reflects the saved user state of current task. + */ +int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { - /* No RIP-relative addressing on 32-bit */ + struct uprobe_task *utask = current->utask; + + regs->ip = utask->xol_vaddr; + utask->autask.saved_trap_nr = current->thread.trap_nr; + current->thread.trap_nr = UPROBE_TRAP_NR; + + utask->autask.saved_tf = !!(regs->flags & X86_EFLAGS_TF); + regs->flags |= X86_EFLAGS_TF; + if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) + set_task_blockstep(current, false); + + if (auprobe->ops->pre_xol) + return auprobe->ops->pre_xol(auprobe, regs); + return 0; } -#endif /* * If xol insn itself traps and generates a signal(Say, @@ -592,22 +752,25 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t) */ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { - struct uprobe_task *utask; - long correction; - int result = 0; + struct uprobe_task *utask = current->utask; WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); - utask = current->utask; - current->thread.trap_nr = utask->autask.saved_trap_nr; - correction = (long)(utask->vaddr - utask->xol_vaddr); - handle_riprel_post_xol(auprobe, regs, &correction); - if (auprobe->fixups & UPROBE_FIX_IP) - regs->ip += correction; - - if (auprobe->fixups & UPROBE_FIX_CALL) - result = adjust_ret_addr(regs->sp, correction); + if (auprobe->ops->post_xol) { + int err = auprobe->ops->post_xol(auprobe, regs); + if (err) { + arch_uprobe_abort_xol(auprobe, regs); + /* + * Restart the probed insn. ->post_xol() must ensure + * this is really possible if it returns -ERESTART. + */ + if (err == -ERESTART) + return 0; + return err; + } + } + current->thread.trap_nr = utask->autask.saved_trap_nr; /* * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP * so we can get an extra SIGTRAP if we do not clear TF. We need @@ -618,7 +781,7 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) else if (!(auprobe->fixups & UPROBE_FIX_SETF)) regs->flags &= ~X86_EFLAGS_TF; - return result; + return 0; } /* callback routine for handling exceptions. */ @@ -652,8 +815,9 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, /* * This function gets called when XOL instruction either gets trapped or - * the thread has a fatal signal, so reset the instruction pointer to its - * probed address. + * the thread has a fatal signal, or if arch_uprobe_post_xol() failed. + * Reset the instruction pointer to its probed address for the potential + * restart or for post mortem analysis. */ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { @@ -668,25 +832,10 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) regs->flags &= ~X86_EFLAGS_TF; } -/* - * Skip these instructions as per the currently known x86 ISA. - * rep=0x66*; nop=0x90 - */ static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { - int i; - - for (i = 0; i < MAX_UINSN_BYTES; i++) { - if (auprobe->insn[i] == 0x66) - continue; - - if (auprobe->insn[i] == 0x90) { - regs->ip += i + 1; - return true; - } - - break; - } + if (auprobe->ops->emulate) + return auprobe->ops->emulate(auprobe, regs); return false; } @@ -701,23 +850,21 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs) { - int rasize, ncopied; + int rasize = sizeof_long(), nleft; unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */ - rasize = is_ia32_task() ? 4 : 8; - ncopied = copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize); - if (unlikely(ncopied)) + if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize)) return -1; /* check whether address has been already hijacked */ if (orig_ret_vaddr == trampoline_vaddr) return orig_ret_vaddr; - ncopied = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize); - if (likely(!ncopied)) + nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize); + if (likely(!nleft)) return orig_ret_vaddr; - if (ncopied != rasize) { + if (nleft != rasize) { pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, " "%%ip=%#lx\n", current->pid, regs->sp, regs->ip); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index da6b35a9826..49edf2dd361 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -147,7 +147,6 @@ SECTIONS _edata = .; } :data -#ifdef CONFIG_X86_64 . = ALIGN(PAGE_SIZE); __vvar_page = .; @@ -165,12 +164,15 @@ SECTIONS #undef __VVAR_KERNEL_LDS #undef EMIT_VVAR + /* + * Pad the rest of the page with zeros. Otherwise the loader + * can leave garbage here. + */ + . = __vvar_beginning_hack + PAGE_SIZE; } :data . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); -#endif /* CONFIG_X86_64 */ - /* Init code and data - will be freed after init */ . = ALIGN(PAGE_SIZE); .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index f6584a90aba..b99b9ad8540 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -26,6 +26,9 @@ #define TOPOLOGY_REGISTER_OFFSET 0x10 +/* Flag below is initialized once during vSMP PCI initialization. */ +static int irq_routing_comply = 1; + #if defined CONFIG_PCI && defined CONFIG_PARAVIRT /* * Interrupt control on vSMPowered systems: @@ -33,7 +36,7 @@ * and vice versa. */ -asmlinkage unsigned long vsmp_save_fl(void) +asmlinkage __visible unsigned long vsmp_save_fl(void) { unsigned long flags = native_save_fl(); @@ -53,7 +56,7 @@ __visible void vsmp_restore_fl(unsigned long flags) } PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl); -asmlinkage void vsmp_irq_disable(void) +asmlinkage __visible void vsmp_irq_disable(void) { unsigned long flags = native_save_fl(); @@ -61,7 +64,7 @@ asmlinkage void vsmp_irq_disable(void) } PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable); -asmlinkage void vsmp_irq_enable(void) +asmlinkage __visible void vsmp_irq_enable(void) { unsigned long flags = native_save_fl(); @@ -101,6 +104,10 @@ static void __init set_vsmp_pv_ops(void) #ifdef CONFIG_SMP if (cap & ctl & BIT(8)) { ctl &= ~BIT(8); + + /* Interrupt routing set to ignore */ + irq_routing_comply = 0; + #ifdef CONFIG_PROC_FS /* Don't let users change irq affinity via procfs */ no_irq_affinity = 1; @@ -218,7 +225,9 @@ static void vsmp_apic_post_init(void) { /* need to update phys_pkg_id */ apic->phys_pkg_id = apicid_phys_pkg_id; - apic->vector_allocation_domain = fill_vector_allocation_domain; + + if (!irq_routing_comply) + apic->vector_allocation_domain = fill_vector_allocation_domain; } void __init vsmp_init(void) diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 1f96f9347ed..8b3b3eb3cea 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -47,14 +47,12 @@ #include <asm/segment.h> #include <asm/desc.h> #include <asm/topology.h> -#include <asm/vgtod.h> #include <asm/traps.h> #define CREATE_TRACE_POINTS #include "vsyscall_trace.h" DEFINE_VVAR(int, vgetcpu_mode); -DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; @@ -77,48 +75,6 @@ static int __init vsyscall_setup(char *str) } early_param("vsyscall", vsyscall_setup); -void update_vsyscall_tz(void) -{ - vsyscall_gtod_data.sys_tz = sys_tz; -} - -void update_vsyscall(struct timekeeper *tk) -{ - struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; - - write_seqcount_begin(&vdata->seq); - - /* copy vsyscall data */ - vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode; - vdata->clock.cycle_last = tk->clock->cycle_last; - vdata->clock.mask = tk->clock->mask; - vdata->clock.mult = tk->mult; - vdata->clock.shift = tk->shift; - - vdata->wall_time_sec = tk->xtime_sec; - vdata->wall_time_snsec = tk->xtime_nsec; - - vdata->monotonic_time_sec = tk->xtime_sec - + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_snsec = tk->xtime_nsec - + (tk->wall_to_monotonic.tv_nsec - << tk->shift); - while (vdata->monotonic_time_snsec >= - (((u64)NSEC_PER_SEC) << tk->shift)) { - vdata->monotonic_time_snsec -= - ((u64)NSEC_PER_SEC) << tk->shift; - vdata->monotonic_time_sec++; - } - - vdata->wall_time_coarse.tv_sec = tk->xtime_sec; - vdata->wall_time_coarse.tv_nsec = (long)(tk->xtime_nsec >> tk->shift); - - vdata->monotonic_time_coarse = timespec_add(vdata->wall_time_coarse, - tk->wall_to_monotonic); - - write_seqcount_end(&vdata->seq); -} - static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, const char *message) { @@ -374,7 +330,6 @@ void __init map_vsyscall(void) { extern char __vsyscall_page; unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); - extern char __vvar_page; unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page); __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall, @@ -393,9 +348,13 @@ static int __init vsyscall_init(void) { BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)); + cpu_notifier_register_begin(); + on_each_cpu(cpu_vsyscall_init, NULL, 1); /* notifier priority > KVM */ - hotcpu_notifier(cpu_vsyscall_notifier, 30); + __hotcpu_notifier(cpu_vsyscall_notifier, 30); + + cpu_notifier_register_done(); return 0; } diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c new file mode 100644 index 00000000000..9531fbb123b --- /dev/null +++ b/arch/x86/kernel/vsyscall_gtod.c @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * Modified for x86 32 bit architecture by + * Stefani Seibold <stefani@seibold.net> + * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany + * + * Thanks to hpa@transmeta.com for some useful hint. + * Special thanks to Ingo Molnar for his early experience with + * a different vsyscall implementation for Linux/IA32 and for the name. + * + */ + +#include <linux/timekeeper_internal.h> +#include <asm/vgtod.h> +#include <asm/vvar.h> + +DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); + +void update_vsyscall_tz(void) +{ + vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest; + vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime; +} + +void update_vsyscall(struct timekeeper *tk) +{ + struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; + + gtod_write_begin(vdata); + + /* copy vsyscall data */ + vdata->vclock_mode = tk->clock->archdata.vclock_mode; + vdata->cycle_last = tk->clock->cycle_last; + vdata->mask = tk->clock->mask; + vdata->mult = tk->mult; + vdata->shift = tk->shift; + + vdata->wall_time_sec = tk->xtime_sec; + vdata->wall_time_snsec = tk->xtime_nsec; + + vdata->monotonic_time_sec = tk->xtime_sec + + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_snsec = tk->xtime_nsec + + ((u64)tk->wall_to_monotonic.tv_nsec + << tk->shift); + while (vdata->monotonic_time_snsec >= + (((u64)NSEC_PER_SEC) << tk->shift)) { + vdata->monotonic_time_snsec -= + ((u64)NSEC_PER_SEC) << tk->shift; + vdata->monotonic_time_sec++; + } + + vdata->wall_time_coarse_sec = tk->xtime_sec; + vdata->wall_time_coarse_nsec = (long)(tk->xtime_nsec >> tk->shift); + + vdata->monotonic_time_coarse_sec = + vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_coarse_nsec = + vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec; + + while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) { + vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC; + vdata->monotonic_time_coarse_sec++; + } + + gtod_write_end(vdata); +} diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c6976257eff..f47a104a749 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -28,7 +28,7 @@ static u32 xstate_required_size(u64 xstate_bv) int feature_bit = 0; u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; - xstate_bv &= ~XSTATE_FPSSE; + xstate_bv &= XSTATE_EXTEND_MASK; while (xstate_bv) { if (xstate_bv & 0x1) { u32 eax, ebx, ecx, edx; @@ -43,6 +43,16 @@ static u32 xstate_required_size(u64 xstate_bv) return ret; } +u64 kvm_supported_xcr0(void) +{ + u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0; + + if (!kvm_x86_ops->mpx_supported()) + xcr0 &= ~(XSTATE_BNDREGS | XSTATE_BNDCSR); + + return xcr0; +} + void kvm_update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -73,9 +83,9 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu) } else { vcpu->arch.guest_supported_xcr0 = (best->eax | ((u64)best->edx << 32)) & - host_xcr0 & KVM_SUPPORTED_XCR0; - vcpu->arch.guest_xstate_size = - xstate_required_size(vcpu->arch.guest_supported_xcr0); + kvm_supported_xcr0(); + vcpu->arch.guest_xstate_size = best->ebx = + xstate_required_size(vcpu->arch.xcr0); } kvm_pmu_cpuid_update(vcpu); @@ -210,13 +220,6 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags = 0; } -static bool supported_xcr0_bit(unsigned bit) -{ - u64 mask = ((u64)1 << bit); - - return mask & KVM_SUPPORTED_XCR0 & host_xcr0; -} - #define F(x) bit(X86_FEATURE_##x) static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, @@ -256,6 +259,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, #endif unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; + unsigned f_mpx = kvm_x86_ops->mpx_supported() ? F(MPX) : 0; /* cpuid 1.edx */ const u32 kvm_supported_word0_x86_features = @@ -263,7 +267,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(TSC) | F(MSR) | F(PAE) | F(MCE) | F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | - F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | + F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) | 0 /* Reserved, DS, ACPI */ | F(MMX) | F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | 0 /* HTT, TM, Reserved, PBE */; @@ -303,7 +307,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | f_invpcid | F(RTM); + F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | + F(ADX) | F(SMAP); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -436,16 +441,18 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } case 0xd: { int idx, i; + u64 supported = kvm_supported_xcr0(); - entry->eax &= host_xcr0 & KVM_SUPPORTED_XCR0; - entry->edx &= (host_xcr0 & KVM_SUPPORTED_XCR0) >> 32; + entry->eax &= supported; + entry->edx &= supported >> 32; entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; for (idx = 1, i = 1; idx < 64; ++idx) { + u64 mask = ((u64)1 << idx); if (*nent >= maxnent) goto out; do_cpuid_1_ent(&entry[i], function, idx); - if (entry[i].eax == 0 || !supported_xcr0_bit(idx)) + if (entry[i].eax == 0 || !(supported & mask)) continue; entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index a2a1bb7ed8c..eeecbed26ac 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -48,6 +48,14 @@ static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_SMEP)); } +static inline bool guest_cpuid_has_smap(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_SMAP)); +} + static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 07ffca0a89e..205b17eed93 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3668,6 +3668,10 @@ static const struct gprefix pfx_vmovntpx = { I(0, em_mov), N, N, N, }; +static const struct gprefix pfx_0f_28_0f_29 = { + I(Aligned, em_mov), I(Aligned, em_mov), N, N, +}; + static const struct escape escape_d9 = { { N, N, N, N, N, N, N, I(DstMem, em_fnstcw), }, { @@ -3870,7 +3874,9 @@ static const struct opcode twobyte_table[256] = { IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write), IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write), N, N, N, N, - N, N, N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx), + GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29), + GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29), + N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx), N, N, N, N, /* 0x30 - 0x3F */ II(ImplicitOps | Priv, em_wrmsr, wrmsr), diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9b531351a58..813d31038b9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3329,7 +3329,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) arch.direct_map = vcpu->arch.mmu.direct_map; arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); - return kvm_setup_async_pf(vcpu, gva, gfn, &arch); + return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch); } static bool can_do_async_pf(struct kvm_vcpu *vcpu) @@ -3601,20 +3601,27 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, } } -static void update_permission_bitmask(struct kvm_vcpu *vcpu, +void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, bool ept) { unsigned bit, byte, pfec; u8 map; - bool fault, x, w, u, wf, uf, ff, smep; + bool fault, x, w, u, wf, uf, ff, smapf, cr4_smap, cr4_smep, smap = 0; - smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); + cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); + cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { pfec = byte << 1; map = 0; wf = pfec & PFERR_WRITE_MASK; uf = pfec & PFERR_USER_MASK; ff = pfec & PFERR_FETCH_MASK; + /* + * PFERR_RSVD_MASK bit is set in PFEC if the access is not + * subject to SMAP restrictions, and cleared otherwise. The + * bit is only meaningful if the SMAP bit is set in CR4. + */ + smapf = !(pfec & PFERR_RSVD_MASK); for (bit = 0; bit < 8; ++bit) { x = bit & ACC_EXEC_MASK; w = bit & ACC_WRITE_MASK; @@ -3626,12 +3633,33 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, /* Allow supervisor writes if !cr0.wp */ w |= !is_write_protection(vcpu) && !uf; /* Disallow supervisor fetches of user code if cr4.smep */ - x &= !(smep && u && !uf); + x &= !(cr4_smep && u && !uf); + + /* + * SMAP:kernel-mode data accesses from user-mode + * mappings should fault. A fault is considered + * as a SMAP violation if all of the following + * conditions are ture: + * - X86_CR4_SMAP is set in CR4 + * - An user page is accessed + * - Page fault in kernel mode + * - if CPL = 3 or X86_EFLAGS_AC is clear + * + * Here, we cover the first three conditions. + * The fourth is computed dynamically in + * permission_fault() and is in smapf. + * + * Also, SMAP does not affect instruction + * fetches, add the !ff check here to make it + * clearer. + */ + smap = cr4_smap && u && !uf && !ff; } else /* Not really needed: no U/S accesses on ept */ u = 1; - fault = (ff && !x) || (uf && !u) || (wf && !w); + fault = (ff && !x) || (uf && !u) || (wf && !w) || + (smapf && smap); map |= fault << bit; } mmu->permissions[byte] = map; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 29261527435..3842e70bdb7 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -44,11 +44,17 @@ #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 -#define PFERR_PRESENT_MASK (1U << 0) -#define PFERR_WRITE_MASK (1U << 1) -#define PFERR_USER_MASK (1U << 2) -#define PFERR_RSVD_MASK (1U << 3) -#define PFERR_FETCH_MASK (1U << 4) +#define PFERR_PRESENT_BIT 0 +#define PFERR_WRITE_BIT 1 +#define PFERR_USER_BIT 2 +#define PFERR_RSVD_BIT 3 +#define PFERR_FETCH_BIT 4 + +#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) +#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) +#define PFERR_USER_MASK (1U << PFERR_USER_BIT) +#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) +#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); @@ -73,6 +79,8 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly); +void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + bool ept); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { @@ -110,10 +118,30 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) * Will a fault with a given page-fault error code (pfec) cause a permission * fault with the given access (in ACC_* format)? */ -static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access, - unsigned pfec) +static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + unsigned pte_access, unsigned pfec) { - return (mmu->permissions[pfec >> 1] >> pte_access) & 1; + int cpl = kvm_x86_ops->get_cpl(vcpu); + unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + + /* + * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1. + * + * If CPL = 3, SMAP applies to all supervisor-mode data accesses + * (these are implicit supervisor accesses) regardless of the value + * of EFLAGS.AC. + * + * This computes (cpl < 3) && (rflags & X86_EFLAGS_AC), leaving + * the result in X86_EFLAGS_AC. We then insert it in place of + * the PFERR_RSVD_MASK bit; this bit will always be zero in pfec, + * but it will be one in index if SMAP checks are being overridden. + * It is important to keep this branchless. + */ + unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC); + int index = (pfec >> 1) + + (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); + + return (mmu->permissions[index] >> pte_access) & 1; } void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index cba218a2f08..123efd3ec29 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -353,7 +353,7 @@ retry_walk: walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); - if (unlikely(permission_fault(mmu, pte_access, access))) { + if (unlikely(permission_fault(vcpu, mmu, pte_access, access))) { errcode |= PFERR_PRESENT_MASK; goto error; } @@ -913,7 +913,8 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't * used by guest then tlbs are not flushed, so guest is allowed to access the * freed pages. - * And we increase kvm->tlbs_dirty to delay tlbs flush in this case. + * We set tlbs_dirty to let the notifier know this change and delay the flush + * until such a case actually happens. */ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -942,7 +943,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return -EINVAL; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { - vcpu->kvm->tlbs_dirty++; + vcpu->kvm->tlbs_dirty = true; continue; } @@ -957,7 +958,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i]); - vcpu->kvm->tlbs_dirty++; + vcpu->kvm->tlbs_dirty = true; continue; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2de1bc09a8d..7f4f9c2bada 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -34,6 +34,7 @@ #include <asm/perf_event.h> #include <asm/tlbflush.h> #include <asm/desc.h> +#include <asm/debugreg.h> #include <asm/kvm_para.h> #include <asm/virtext.h> @@ -303,20 +304,35 @@ static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit) return vmcb->control.intercept_cr & (1U << bit); } -static inline void set_dr_intercept(struct vcpu_svm *svm, int bit) +static inline void set_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_dr |= (1U << bit); + vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ) + | (1 << INTERCEPT_DR1_READ) + | (1 << INTERCEPT_DR2_READ) + | (1 << INTERCEPT_DR3_READ) + | (1 << INTERCEPT_DR4_READ) + | (1 << INTERCEPT_DR5_READ) + | (1 << INTERCEPT_DR6_READ) + | (1 << INTERCEPT_DR7_READ) + | (1 << INTERCEPT_DR0_WRITE) + | (1 << INTERCEPT_DR1_WRITE) + | (1 << INTERCEPT_DR2_WRITE) + | (1 << INTERCEPT_DR3_WRITE) + | (1 << INTERCEPT_DR4_WRITE) + | (1 << INTERCEPT_DR5_WRITE) + | (1 << INTERCEPT_DR6_WRITE) + | (1 << INTERCEPT_DR7_WRITE); recalc_intercepts(svm); } -static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit) +static inline void clr_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_dr &= ~(1U << bit); + vmcb->control.intercept_dr = 0; recalc_intercepts(svm); } @@ -1080,23 +1096,7 @@ static void init_vmcb(struct vcpu_svm *svm) set_cr_intercept(svm, INTERCEPT_CR4_WRITE); set_cr_intercept(svm, INTERCEPT_CR8_WRITE); - set_dr_intercept(svm, INTERCEPT_DR0_READ); - set_dr_intercept(svm, INTERCEPT_DR1_READ); - set_dr_intercept(svm, INTERCEPT_DR2_READ); - set_dr_intercept(svm, INTERCEPT_DR3_READ); - set_dr_intercept(svm, INTERCEPT_DR4_READ); - set_dr_intercept(svm, INTERCEPT_DR5_READ); - set_dr_intercept(svm, INTERCEPT_DR6_READ); - set_dr_intercept(svm, INTERCEPT_DR7_READ); - - set_dr_intercept(svm, INTERCEPT_DR0_WRITE); - set_dr_intercept(svm, INTERCEPT_DR1_WRITE); - set_dr_intercept(svm, INTERCEPT_DR2_WRITE); - set_dr_intercept(svm, INTERCEPT_DR3_WRITE); - set_dr_intercept(svm, INTERCEPT_DR4_WRITE); - set_dr_intercept(svm, INTERCEPT_DR5_WRITE); - set_dr_intercept(svm, INTERCEPT_DR6_WRITE); - set_dr_intercept(svm, INTERCEPT_DR7_WRITE); + set_dr_intercepts(svm); set_exception_intercept(svm, PF_VECTOR); set_exception_intercept(svm, UD_VECTOR); @@ -1684,6 +1684,21 @@ static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) mark_dirty(svm->vmcb, VMCB_DR); } +static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + get_debugreg(vcpu->arch.db[0], 0); + get_debugreg(vcpu->arch.db[1], 1); + get_debugreg(vcpu->arch.db[2], 2); + get_debugreg(vcpu->arch.db[3], 3); + vcpu->arch.dr6 = svm_get_dr6(vcpu); + vcpu->arch.dr7 = svm->vmcb->save.dr7; + + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; + set_dr_intercepts(svm); +} + static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2842,6 +2857,7 @@ static int iret_interception(struct vcpu_svm *svm) clr_intercept(svm, INTERCEPT_IRET); svm->vcpu.arch.hflags |= HF_IRET_MASK; svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); return 1; } @@ -2974,6 +2990,17 @@ static int dr_interception(struct vcpu_svm *svm) unsigned long val; int err; + if (svm->vcpu.guest_debug == 0) { + /* + * No more DR vmexits; force a reload of the debug registers + * and reenter on this instruction. The next vmexit will + * retrieve the full state of the debug registers. + */ + clr_dr_intercepts(svm); + svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + return 1; + } + if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) return emulate_on_interception(svm); @@ -3649,7 +3676,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) return ret; } -static int enable_irq_window(struct kvm_vcpu *vcpu) +static void enable_irq_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3663,16 +3690,15 @@ static int enable_irq_window(struct kvm_vcpu *vcpu) svm_set_vintr(svm); svm_inject_irq(svm, 0x0); } - return 0; } -static int enable_nmi_window(struct kvm_vcpu *vcpu) +static void enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK) - return 0; /* IRET will cause a vm exit */ + return; /* IRET will cause a vm exit */ /* * Something prevents NMI from been injected. Single step over possible @@ -3681,7 +3707,6 @@ static int enable_nmi_window(struct kvm_vcpu *vcpu) svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); update_db_bp_intercept(vcpu); - return 0; } static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) @@ -4064,6 +4089,11 @@ static bool svm_invpcid_supported(void) return false; } +static bool svm_mpx_supported(void) +{ + return false; +} + static bool svm_has_wbinvd_exit(void) { return true; @@ -4302,6 +4332,7 @@ static struct kvm_x86_ops svm_x86_ops = { .get_dr6 = svm_get_dr6, .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, + .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, @@ -4345,6 +4376,7 @@ static struct kvm_x86_ops svm_x86_ops = { .rdtscp_supported = svm_rdtscp_supported, .invpcid_supported = svm_invpcid_supported, + .mpx_supported = svm_mpx_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 39275283475..138ceffc637 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -31,6 +31,7 @@ #include <linux/ftrace_event.h> #include <linux/slab.h> #include <linux/tboot.h> +#include <linux/hrtimer.h> #include "kvm_cache_regs.h" #include "x86.h" @@ -42,6 +43,7 @@ #include <asm/i387.h> #include <asm/xcr.h> #include <asm/perf_event.h> +#include <asm/debugreg.h> #include <asm/kexec.h> #include "trace.h" @@ -110,6 +112,8 @@ module_param(nested, bool, S_IRUGO); #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) +#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -202,6 +206,7 @@ struct __packed vmcs12 { u64 guest_pdptr1; u64 guest_pdptr2; u64 guest_pdptr3; + u64 guest_bndcfgs; u64 host_ia32_pat; u64 host_ia32_efer; u64 host_ia32_perf_global_ctrl; @@ -374,6 +379,9 @@ struct nested_vmx { */ struct page *apic_access_page; u64 msr_ia32_feature_control; + + struct hrtimer preemption_timer; + bool preemption_timer_expired; }; #define POSTED_INTR_ON 0 @@ -441,6 +449,7 @@ struct vcpu_vmx { #endif int gs_ldt_reload_needed; int fs_reload_needed; + u64 msr_host_bndcfgs; } host_state; struct { int vm86_active; @@ -494,7 +503,7 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) [number##_HIGH] = VMCS12_OFFSET(name)+4 -static const unsigned long shadow_read_only_fields[] = { +static unsigned long shadow_read_only_fields[] = { /* * We do NOT shadow fields that are modified when L0 * traps and emulates any vmx instruction (e.g. VMPTRLD, @@ -517,10 +526,10 @@ static const unsigned long shadow_read_only_fields[] = { GUEST_LINEAR_ADDRESS, GUEST_PHYSICAL_ADDRESS }; -static const int max_shadow_read_only_fields = +static int max_shadow_read_only_fields = ARRAY_SIZE(shadow_read_only_fields); -static const unsigned long shadow_read_write_fields[] = { +static unsigned long shadow_read_write_fields[] = { GUEST_RIP, GUEST_RSP, GUEST_CR0, @@ -533,6 +542,7 @@ static const unsigned long shadow_read_write_fields[] = { GUEST_CS_LIMIT, GUEST_CS_BASE, GUEST_ES_BASE, + GUEST_BNDCFGS, CR0_GUEST_HOST_MASK, CR0_READ_SHADOW, CR4_READ_SHADOW, @@ -548,7 +558,7 @@ static const unsigned long shadow_read_write_fields[] = { HOST_FS_SELECTOR, HOST_GS_SELECTOR }; -static const int max_shadow_read_write_fields = +static int max_shadow_read_write_fields = ARRAY_SIZE(shadow_read_write_fields); static const unsigned short vmcs_field_to_offset_table[] = { @@ -588,6 +598,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD64(GUEST_PDPTR1, guest_pdptr1), FIELD64(GUEST_PDPTR2, guest_pdptr2), FIELD64(GUEST_PDPTR3, guest_pdptr3), + FIELD64(GUEST_BNDCFGS, guest_bndcfgs), FIELD64(HOST_IA32_PAT, host_ia32_pat), FIELD64(HOST_IA32_EFER, host_ia32_efer), FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), @@ -718,6 +729,7 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); +static bool vmx_mpx_supported(void); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -728,6 +740,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var); static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); +static bool vmx_mpx_supported(void); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -1047,6 +1060,12 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; } +static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & + PIN_BASED_VMX_PREEMPTION_TIMER; +} + static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); @@ -1710,6 +1729,8 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) if (is_long_mode(&vmx->vcpu)) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif + if (boot_cpu_has(X86_FEATURE_MPX)) + rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, vmx->guest_msrs[i].data, @@ -1747,6 +1768,8 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif + if (vmx->host_state.msr_host_bndcfgs) + wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); /* * If the FPU is not active (through the host task or * the guest vcpu), then restore the cr0.TS bit. @@ -2248,9 +2271,9 @@ static __init void nested_vmx_setup_ctls_msrs(void) */ nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | - PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS | + PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; + nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; - nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; /* * Exit controls @@ -2265,15 +2288,12 @@ static __init void nested_vmx_setup_ctls_msrs(void) #ifdef CONFIG_X86_64 VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif - VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | + VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; + nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | + VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; - if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) || - !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) { - nested_vmx_exit_ctls_high &= ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; - nested_vmx_pinbased_ctls_high &= ~PIN_BASED_VMX_PREEMPTION_TIMER; - } - nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | - VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER); + if (vmx_mpx_supported()) + nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2287,6 +2307,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) VM_ENTRY_LOAD_IA32_PAT; nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); + if (vmx_mpx_supported()) + nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, @@ -2342,9 +2364,9 @@ static __init void nested_vmx_setup_ctls_msrs(void) /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); - nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK | - VMX_MISC_SAVE_EFER_LMA; - nested_vmx_misc_low |= VMX_MISC_ACTIVITY_HLT; + nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; + nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | + VMX_MISC_ACTIVITY_HLT; nested_vmx_misc_high = 0; } @@ -2479,6 +2501,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) case MSR_IA32_SYSENTER_ESP: data = vmcs_readl(GUEST_SYSENTER_ESP); break; + case MSR_IA32_BNDCFGS: + if (!vmx_mpx_supported()) + return 1; + data = vmcs_read64(GUEST_BNDCFGS); + break; case MSR_IA32_FEATURE_CONTROL: if (!nested_vmx_allowed(vcpu)) return 1; @@ -2547,6 +2574,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_SYSENTER_ESP: vmcs_writel(GUEST_SYSENTER_ESP, data); break; + case MSR_IA32_BNDCFGS: + if (!vmx_mpx_supported()) + return 1; + vmcs_write64(GUEST_BNDCFGS, data); + break; case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); break; @@ -2832,12 +2864,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmx_capability.ept, vmx_capability.vpid); } - min = 0; + min = VM_EXIT_SAVE_DEBUG_CONTROLS; #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | - VM_EXIT_ACK_INTR_ON_EXIT; + VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; @@ -2853,8 +2885,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; - min = 0; - opt = VM_ENTRY_LOAD_IA32_PAT; + min = VM_ENTRY_LOAD_DEBUG_CONTROLS; + opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, &_vmentry_control) < 0) return -EIO; @@ -2977,6 +3009,41 @@ static void free_kvm_area(void) } } +static void init_vmcs_shadow_fields(void) +{ + int i, j; + + /* No checks for read only fields yet */ + + for (i = j = 0; i < max_shadow_read_write_fields; i++) { + switch (shadow_read_write_fields[i]) { + case GUEST_BNDCFGS: + if (!vmx_mpx_supported()) + continue; + break; + default: + break; + } + + if (j < i) + shadow_read_write_fields[j] = + shadow_read_write_fields[i]; + j++; + } + max_shadow_read_write_fields = j; + + /* shadowed fields guest access without vmexit */ + for (i = 0; i < max_shadow_read_write_fields; i++) { + clear_bit(shadow_read_write_fields[i], + vmx_vmwrite_bitmap); + clear_bit(shadow_read_write_fields[i], + vmx_vmread_bitmap); + } + for (i = 0; i < max_shadow_read_only_fields; i++) + clear_bit(shadow_read_only_fields[i], + vmx_vmread_bitmap); +} + static __init int alloc_kvm_area(void) { int cpu; @@ -3007,6 +3074,8 @@ static __init int hardware_setup(void) enable_vpid = 0; if (!cpu_has_vmx_shadow_vmcs()) enable_shadow_vmcs = 0; + if (enable_shadow_vmcs) + init_vmcs_shadow_fields(); if (!cpu_has_vmx_ept() || !cpu_has_vmx_ept_4levels()) { @@ -3452,13 +3521,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) hw_cr4 &= ~X86_CR4_PAE; hw_cr4 |= X86_CR4_PSE; /* - * SMEP is disabled if CPU is in non-paging mode in - * hardware. However KVM always uses paging mode to + * SMEP/SMAP is disabled if CPU is in non-paging mode + * in hardware. However KVM always uses paging mode to * emulate guest non-paging mode with TDP. - * To emulate this behavior, SMEP needs to be manually - * disabled when guest switches to non-paging mode. + * To emulate this behavior, SMEP/SMAP needs to be + * manually disabled when guest switches to non-paging + * mode. */ - hw_cr4 &= ~X86_CR4_SMEP; + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); } else if (!(cr4 & X86_CR4_PAE)) { hw_cr4 &= ~X86_CR4_PAE; } @@ -4223,6 +4293,10 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; + + if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) + exec_control &= ~CPU_BASED_MOV_DR_EXITING; + if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { exec_control &= ~CPU_BASED_TPR_SHADOW; #ifdef CONFIG_X86_64 @@ -4496,39 +4570,28 @@ static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) PIN_BASED_NMI_EXITING; } -static int enable_irq_window(struct kvm_vcpu *vcpu) +static void enable_irq_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; - if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) - /* - * We get here if vmx_interrupt_allowed() said we can't - * inject to L1 now because L2 must run. The caller will have - * to make L2 exit right after entry, so we can inject to L1 - * more promptly. - */ - return -EBUSY; - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - return 0; } -static int enable_nmi_window(struct kvm_vcpu *vcpu) +static void enable_nmi_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; - if (!cpu_has_virtual_nmis()) - return enable_irq_window(vcpu); - - if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) - return enable_irq_window(vcpu); + if (!cpu_has_virtual_nmis() || + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { + enable_irq_window(vcpu); + return; + } cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - return 0; } static void vmx_inject_irq(struct kvm_vcpu *vcpu) @@ -4620,22 +4683,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) { - if (to_vmx(vcpu)->nested.nested_run_pending) - return 0; - if (nested_exit_on_nmi(vcpu)) { - nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, - NMI_VECTOR | INTR_TYPE_NMI_INTR | - INTR_INFO_VALID_MASK, 0); - /* - * The NMI-triggered VM exit counts as injection: - * clear this one and block further NMIs. - */ - vcpu->arch.nmi_pending = 0; - vmx_set_nmi_mask(vcpu, true); - return 0; - } - } + if (to_vmx(vcpu)->nested.nested_run_pending) + return 0; if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) return 0; @@ -4647,19 +4696,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) { - if (to_vmx(vcpu)->nested.nested_run_pending) - return 0; - if (nested_exit_on_intr(vcpu)) { - nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, - 0, 0); - /* - * fall through to normal code, but now in L1, not L2 - */ - } - } - - return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + return (!to_vmx(vcpu)->nested.nested_run_pending && + vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); } @@ -5102,6 +5140,22 @@ static int handle_dr(struct kvm_vcpu *vcpu) } } + if (vcpu->guest_debug == 0) { + u32 cpu_based_vm_exec_control; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + + /* + * No more DR vmexits; force a reload of the debug registers + * and reenter on this instruction. The next vmexit will + * retrieve the full state of the debug registers. + */ + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + return 1; + } + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); dr = exit_qualification & DEBUG_REG_ACCESS_NUM; reg = DEBUG_REG_ACCESS_REG(exit_qualification); @@ -5128,6 +5182,24 @@ static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) { } +static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +{ + u32 cpu_based_vm_exec_control; + + get_debugreg(vcpu->arch.db[0], 0); + get_debugreg(vcpu->arch.db[1], 1); + get_debugreg(vcpu->arch.db[2], 2); + get_debugreg(vcpu->arch.db[3], 3); + get_debugreg(vcpu->arch.dr6, 6); + vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); + + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} + static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); @@ -5727,6 +5799,18 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu, */ } +static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) +{ + struct vcpu_vmx *vmx = + container_of(timer, struct vcpu_vmx, nested.preemption_timer); + + vmx->nested.preemption_timer_expired = true; + kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); + kvm_vcpu_kick(&vmx->vcpu); + + return HRTIMER_NORESTART; +} + /* * Emulate the VMXON instruction. * Currently, we just remember that VMX is active, and do not save or even @@ -5791,6 +5875,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu) INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); vmx->nested.vmcs02_num = 0; + hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + vmx->nested.vmxon = true; skip_emulated_instruction(vcpu); @@ -6767,9 +6855,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * table is L0's fault. */ return 0; - case EXIT_REASON_PREEMPTION_TIMER: - return vmcs12->pin_based_vm_exec_control & - PIN_BASED_VMX_PREEMPTION_TIMER; case EXIT_REASON_WBINVD: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); case EXIT_REASON_XSETBV: @@ -6785,27 +6870,6 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = vmcs_read32(VM_EXIT_INTR_INFO); } -static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu) -{ - u64 delta_tsc_l1; - u32 preempt_val_l1, preempt_val_l2, preempt_scale; - - if (!(get_vmcs12(vcpu)->pin_based_vm_exec_control & - PIN_BASED_VMX_PREEMPTION_TIMER)) - return; - preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) & - MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE; - preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE); - delta_tsc_l1 = vmx_read_l1_tsc(vcpu, native_read_tsc()) - - vcpu->arch.last_guest_tsc; - preempt_val_l1 = delta_tsc_l1 >> preempt_scale; - if (preempt_val_l2 <= preempt_val_l1) - preempt_val_l2 = 0; - else - preempt_val_l2 -= preempt_val_l1; - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2); -} - /* * The guest has exited. See if we can fix it or if we need userspace * assistance. @@ -7052,6 +7116,12 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) local_irq_enable(); } +static bool vmx_mpx_supported(void) +{ + return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && + (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -7218,8 +7288,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) atomic_switch_perf_msrs(vmx); debugctlmsr = get_debugctlmsr(); - if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) - nested_adjust_preemption_timer(vcpu); vmx->__launched = vmx->loaded_vmcs->launched; asm( /* Store host registers */ @@ -7616,6 +7684,28 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, kvm_inject_page_fault(vcpu, fault); } +static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) +{ + u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vcpu->arch.virtual_tsc_khz == 0) + return; + + /* Make sure short timeouts reliably trigger an immediate vmexit. + * hrtimer_start does not guarantee this. */ + if (preemption_timeout <= 1) { + vmx_preemption_timer_fn(&vmx->nested.preemption_timer); + return; + } + + preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; + preemption_timeout *= 1000000; + do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); + hrtimer_start(&vmx->nested.preemption_timer, + ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -7629,7 +7719,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exec_control; - u32 exit_control; vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); @@ -7687,13 +7776,15 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(VMCS_LINK_POINTER, -1ull); - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, - (vmcs_config.pin_based_exec_ctrl | - vmcs12->pin_based_vm_exec_control)); + exec_control = vmcs12->pin_based_vm_exec_control; + exec_control |= vmcs_config.pin_based_exec_ctrl; + exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER | + PIN_BASED_POSTED_INTR); + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); - if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, - vmcs12->vmx_preemption_timer_value); + vmx->nested.preemption_timer_expired = false; + if (nested_cpu_has_preemption_timer(vmcs12)) + vmx_start_preemption_timer(vcpu); /* * Whether page-faults are trapped is determined by a combination of @@ -7721,11 +7812,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) enable_ept ? vmcs12->page_fault_error_code_match : 0); if (cpu_has_secondary_exec_ctrls()) { - u32 exec_control = vmx_secondary_exec_control(vmx); + exec_control = vmx_secondary_exec_control(vmx); if (!vmx->rdtscp_enabled) exec_control &= ~SECONDARY_EXEC_RDTSCP; /* Take the following fields only from vmcs12 */ - exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | + SECONDARY_EXEC_APIC_REGISTER_VIRT); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; @@ -7808,10 +7901,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER * bits are further modified by vmx_set_efer() below. */ - exit_control = vmcs_config.vmexit_ctrl; - if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) - exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; - vm_exit_controls_init(vmx, exit_control); + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are * emulated by vmx_set_efer(), below. @@ -7830,6 +7920,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) set_cr4_guest_host_mask(vmx); + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); @@ -8155,6 +8248,58 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, } } +static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && + vmx->nested.preemption_timer_expired) { + if (vmx->nested.nested_run_pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); + return 0; + } + + if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { + if (vmx->nested.nested_run_pending || + vcpu->arch.interrupt.pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, + NMI_VECTOR | INTR_TYPE_NMI_INTR | + INTR_INFO_VALID_MASK, 0); + /* + * The NMI-triggered VM exit counts as injection: + * clear this one and block further NMIs. + */ + vcpu->arch.nmi_pending = 0; + vmx_set_nmi_mask(vcpu, true); + return 0; + } + + if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && + nested_exit_on_intr(vcpu)) { + if (vmx->nested.nested_run_pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + } + + return 0; +} + +static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) +{ + ktime_t remaining = + hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); + u64 value; + + if (ktime_to_ns(remaining) <= 0) + return 0; + + value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; + do_div(value, 1000000); + return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; +} + /* * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), @@ -8225,10 +8370,13 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, else vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; - if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) && - (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) - vmcs12->vmx_preemption_timer_value = - vmcs_read32(VMX_PREEMPTION_TIMER_VALUE); + if (nested_cpu_has_preemption_timer(vmcs12)) { + if (vmcs12->vm_exit_controls & + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) + vmcs12->vmx_preemption_timer_value = + vmx_get_preemption_timer_value(vcpu); + hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); + } /* * In some cases (usually, nested EPT), L2 is allowed to change its @@ -8260,6 +8408,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); + if (vmx_mpx_supported()) + vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); /* update exit information fields: */ @@ -8369,6 +8519,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); + /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ + if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) + vmcs_write64(GUEST_BNDCFGS, 0); + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); vcpu->arch.pat = vmcs12->host_ia32_pat; @@ -8495,6 +8649,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, nested_vmx_succeed(vcpu); if (enable_shadow_vmcs) vmx->nested.sync_shadow_vmcs = true; + + /* in case we halted in L2 */ + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } /* @@ -8573,6 +8730,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .get_dr6 = vmx_get_dr6, .set_dr6 = vmx_set_dr6, .set_dr7 = vmx_set_dr7, + .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, @@ -8634,6 +8792,9 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_intercept = vmx_check_intercept, .handle_external_intr = vmx_handle_external_intr, + .mpx_supported = vmx_mpx_supported, + + .check_nested_events = vmx_check_nested_events, }; static int __init vmx_init(void) @@ -8682,14 +8843,6 @@ static int __init vmx_init(void) memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); - /* shadowed read/write fields */ - for (i = 0; i < max_shadow_read_write_fields; i++) { - clear_bit(shadow_read_write_fields[i], vmx_vmwrite_bitmap); - clear_bit(shadow_read_write_fields[i], vmx_vmread_bitmap); - } - /* shadowed read only fields */ - for (i = 0; i < max_shadow_read_only_fields; i++) - clear_bit(shadow_read_only_fields[i], vmx_vmread_bitmap); /* * Allow direct access to the PC debug port (it is often used for I/O @@ -8721,6 +8874,8 @@ static int __init vmx_init(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); + vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); + memcpy(vmx_msr_bitmap_legacy_x2apic, vmx_msr_bitmap_legacy, PAGE_SIZE); memcpy(vmx_msr_bitmap_longmode_x2apic, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2b8578432d5..20316c67b82 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -106,6 +106,8 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); static u32 tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); +static bool backwards_tsc_observed = false; + #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -280,7 +282,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } EXPORT_SYMBOL_GPL(kvm_set_apic_base); -asmlinkage void kvm_spurious_fault(void) +asmlinkage __visible void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. */ BUG(); @@ -595,13 +597,13 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { - u64 xcr0; + u64 xcr0 = xcr; + u64 old_xcr0 = vcpu->arch.xcr0; u64 valid_bits; /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ if (index != XCR_XFEATURE_ENABLED_MASK) return 1; - xcr0 = xcr; if (!(xcr0 & XSTATE_FP)) return 1; if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) @@ -616,8 +618,14 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) if (xcr0 & ~valid_bits) return 1; + if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR))) + return 1; + kvm_put_guest_xcr0(vcpu); vcpu->arch.xcr0 = xcr0; + + if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK) + kvm_update_cpuid(vcpu); return 0; } @@ -646,6 +654,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP)) return 1; + if (!guest_cpuid_has_smap(vcpu) && (cr4 & X86_CR4_SMAP)) + return 1; + if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE)) return 1; @@ -674,6 +685,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); + if ((cr4 ^ old_cr4) & X86_CR4_SMAP) + update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false); + if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) kvm_update_cpuid(vcpu); @@ -753,7 +767,9 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu) else dr7 = vcpu->arch.dr7; kvm_x86_ops->set_dr7(vcpu, dr7); - vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK); + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; + if (dr7 & DR7_BP_EN_MASK) + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; } static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) @@ -879,7 +895,7 @@ static u32 msrs_to_save[] = { MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, - MSR_IA32_FEATURE_CONTROL + MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS }; static unsigned num_msrs_to_save; @@ -1109,7 +1125,6 @@ static inline u64 get_kernel_ns(void) { struct timespec ts; - WARN_ON(preemptible()); ktime_get_ts(&ts); monotonic_to_bootbased(&ts); return timespec_to_ns(&ts); @@ -1473,7 +1488,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) &ka->master_kernel_ns, &ka->master_cycle_now); - ka->use_master_clock = host_tsc_clocksource & vcpus_matched; + ka->use_master_clock = host_tsc_clocksource && vcpus_matched + && !backwards_tsc_observed; if (ka->use_master_clock) atomic_set(&kvm_guest_has_master_clock, 1); @@ -1581,7 +1597,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* With all the info we got, fill in the values */ vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; - vcpu->last_kernel_ns = kernel_ns; vcpu->last_guest_tsc = tsc_timestamp; /* @@ -1623,14 +1638,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) * the others. * * So in those cases, request a kvmclock update for all vcpus. - * The worst case for a remote vcpu to update its kvmclock - * is then bounded by maximum nohz sleep latency. + * We need to rate-limit these requests though, as they can + * considerably slow guests that have a large number of vcpus. + * The time for a remote vcpu to update its kvmclock is bound + * by the delay we use to rate-limit the updates. */ -static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) +#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100) + +static void kvmclock_update_fn(struct work_struct *work) { int i; - struct kvm *kvm = v->kvm; + struct delayed_work *dwork = to_delayed_work(work); + struct kvm_arch *ka = container_of(dwork, struct kvm_arch, + kvmclock_update_work); + struct kvm *kvm = container_of(ka, struct kvm, arch); struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { @@ -1639,6 +1661,29 @@ static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) } } +static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) +{ + struct kvm *kvm = v->kvm; + + set_bit(KVM_REQ_CLOCK_UPDATE, &v->requests); + schedule_delayed_work(&kvm->arch.kvmclock_update_work, + KVMCLOCK_UPDATE_DELAY); +} + +#define KVMCLOCK_SYNC_PERIOD (300 * HZ) + +static void kvmclock_sync_fn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct kvm_arch *ka = container_of(dwork, struct kvm_arch, + kvmclock_sync_work); + struct kvm *kvm = container_of(ka, struct kvm, arch); + + schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, + KVMCLOCK_SYNC_PERIOD); +} + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { @@ -2323,9 +2368,12 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_VP_INDEX: { int r; struct kvm_vcpu *v; - kvm_for_each_vcpu(r, v, vcpu->kvm) - if (v == vcpu) + kvm_for_each_vcpu(r, v, vcpu->kvm) { + if (v == vcpu) { data = r; + break; + } + } break; } case HV_X64_MSR_EOI: @@ -2617,6 +2665,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_KVMCLOCK_CTRL: case KVM_CAP_READONLY_MEM: case KVM_CAP_HYPERV_TIME: + case KVM_CAP_IOAPIC_POLARITY_IGNORED: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: @@ -3043,9 +3092,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility * with old userspace. */ - if (xstate_bv & ~KVM_SUPPORTED_XCR0) - return -EINVAL; - if (xstate_bv & ~host_xcr0) + if (xstate_bv & ~kvm_supported_xcr0()) return -EINVAL; memcpy(&vcpu->arch.guest_fpu.state->xsave, guest_xsave->region, vcpu->arch.guest_xstate_size); @@ -3898,6 +3945,23 @@ static void kvm_init_msr_list(void) for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) continue; + + /* + * Even MSRs that are valid in the host may not be exposed + * to the guests in some cases. We could work around this + * in VMX with the generic MSR save/load machinery, but it + * is not really worthwhile since it will really only + * happen with nested virtualization. + */ + switch (msrs_to_save[i]) { + case MSR_IA32_BNDCFGS: + if (!kvm_x86_ops->mpx_supported()) + continue; + break; + default: + break; + } + if (j < i) msrs_to_save[j] = msrs_to_save[i]; j++; @@ -4108,7 +4172,8 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, | (write ? PFERR_WRITE_MASK : 0); if (vcpu_match_mmio_gva(vcpu, gva) - && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) { + && !permission_fault(vcpu, vcpu->arch.walk_mmu, + vcpu->arch.access, access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); @@ -4394,6 +4459,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (!exchanged) return X86EMUL_CMPXCHG_FAILED; + mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); kvm_mmu_pte_write(vcpu, gpa, new, bytes); return X86EMUL_CONTINUE; @@ -5365,7 +5431,8 @@ static void kvm_timer_init(void) int cpu; max_tsc_khz = tsc_khz; - register_hotcpu_notifier(&kvmclock_cpu_notifier_block); + + cpu_notifier_register_begin(); if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { #ifdef CONFIG_CPU_FREQ struct cpufreq_policy policy; @@ -5382,6 +5449,10 @@ static void kvm_timer_init(void) pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); for_each_online_cpu(cpu) smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); + + __register_hotcpu_notifier(&kvmclock_cpu_notifier_block); + cpu_notifier_register_done(); + } static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); @@ -5537,9 +5608,10 @@ int kvm_arch_init(void *opaque) goto out_free_percpu; kvm_set_mmio_spte_mask(); - kvm_init_msr_list(); kvm_x86_ops = ops; + kvm_init_msr_list(); + kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0); @@ -5782,8 +5854,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); } -static void inject_pending_event(struct kvm_vcpu *vcpu) +static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) { + int r; + /* try to reinject previous events if any */ if (vcpu->arch.exception.pending) { trace_kvm_inj_exception(vcpu->arch.exception.nr, @@ -5793,17 +5867,23 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code, vcpu->arch.exception.reinject); - return; + return 0; } if (vcpu->arch.nmi_injected) { kvm_x86_ops->set_nmi(vcpu); - return; + return 0; } if (vcpu->arch.interrupt.pending) { kvm_x86_ops->set_irq(vcpu); - return; + return 0; + } + + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { + r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); + if (r != 0) + return r; } /* try to inject new event if pending */ @@ -5820,6 +5900,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) kvm_x86_ops->set_irq(vcpu); } } + return 0; } static void process_nmi(struct kvm_vcpu *vcpu) @@ -5924,15 +6005,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; } - inject_pending_event(vcpu); - + if (inject_pending_event(vcpu, req_int_win) != 0) + req_immediate_exit = true; /* enable NMI/IRQ window open exits if needed */ - if (vcpu->arch.nmi_pending) - req_immediate_exit = - kvm_x86_ops->enable_nmi_window(vcpu) != 0; + else if (vcpu->arch.nmi_pending) + kvm_x86_ops->enable_nmi_window(vcpu); else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) - req_immediate_exit = - kvm_x86_ops->enable_irq_window(vcpu) != 0; + kvm_x86_ops->enable_irq_window(vcpu); if (kvm_lapic_enabled(vcpu)) { /* @@ -5992,12 +6071,28 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); + set_debugreg(vcpu->arch.dr6, 6); } trace_kvm_entry(vcpu->vcpu_id); kvm_x86_ops->run(vcpu); /* + * Do this here before restoring debug registers on the host. And + * since we do this before handling the vmexit, a DR access vmexit + * can (a) read the correct value of the debug registers, (b) set + * KVM_DEBUGREG_WONT_EXIT again. + */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { + int i; + + WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); + kvm_x86_ops->sync_dirty_debug_regs(vcpu); + for (i = 0; i < KVM_NR_DB_REGS; i++) + vcpu->arch.eff_db[i] = vcpu->arch.db[i]; + } + + /* * If the guest has used debug registers, at least dr7 * will be disabled while returning to the host. * If we don't have active breakpoints in the host, we don't @@ -6711,6 +6806,7 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { int r; struct msr_data msr; + struct kvm *kvm = vcpu->kvm; r = vcpu_load(vcpu); if (r) @@ -6721,6 +6817,9 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) kvm_write_tsc(vcpu, &msr); vcpu_put(vcpu); + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, + KVMCLOCK_SYNC_PERIOD); + return r; } @@ -6849,6 +6948,7 @@ int kvm_arch_hardware_enable(void *garbage) */ if (backwards_tsc) { u64 delta_cyc = max_tsc - local_tsc; + backwards_tsc_observed = true; list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { vcpu->arch.tsc_offset_adjustment += delta_cyc; @@ -7013,6 +7113,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) pvclock_update_vm_gtod_copy(kvm); + INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); + INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); + return 0; } @@ -7050,6 +7153,8 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_sync_events(struct kvm *kvm) { + cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); + cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); kvm_free_all_assigned_devices(kvm); kvm_free_pit(kvm); } @@ -7248,6 +7353,9 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) + kvm_x86_ops->check_nested_events(vcpu, false); + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted) || !list_empty_careful(&vcpu->async_pf.done) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 8da5823bcde..8c97bac9a89 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -122,9 +122,12 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception); -#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) +#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ + | XSTATE_BNDREGS | XSTATE_BNDCSR) extern u64 host_xcr0; +extern u64 kvm_supported_xcr0(void); + extern unsigned int min_timer_period_us; extern struct static_key kvm_no_apic_vcpu; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index ad1fb5f5392..aae94132bc2 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -233,13 +233,13 @@ static void lguest_end_context_switch(struct task_struct *next) * flags word contains all kind of stuff, but in practice Linux only cares * about the interrupt flag. Our "save_flags()" just returns that. */ -asmlinkage unsigned long lguest_save_fl(void) +asmlinkage __visible unsigned long lguest_save_fl(void) { return lguest_data.irq_enabled; } /* Interrupts go off... */ -asmlinkage void lguest_irq_disable(void) +asmlinkage __visible void lguest_irq_disable(void) { lguest_data.irq_enabled = 0; } diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index eabcb6e6a90..4d4f96a2763 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -16,7 +16,7 @@ clean-files := inat-tables.c obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o -lib-y := delay.o misc.o +lib-y := delay.o misc.o cmdline.o lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c new file mode 100644 index 00000000000..422db000d72 --- /dev/null +++ b/arch/x86/lib/cmdline.c @@ -0,0 +1,84 @@ +/* + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * Misc librarized functions for cmdline poking. + */ +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <asm/setup.h> + +static inline int myisspace(u8 c) +{ + return c <= ' '; /* Close enough approximation */ +} + +/** + * Find a boolean option (like quiet,noapic,nosmp....) + * + * @cmdline: the cmdline string + * @option: option string to look for + * + * Returns the position of that @option (starts counting with 1) + * or 0 on not found. + */ +int cmdline_find_option_bool(const char *cmdline, const char *option) +{ + char c; + int len, pos = 0, wstart = 0; + const char *opptr = NULL; + enum { + st_wordstart = 0, /* Start of word/after whitespace */ + st_wordcmp, /* Comparing this word */ + st_wordskip, /* Miscompare, skip */ + } state = st_wordstart; + + if (!cmdline) + return -1; /* No command line */ + + len = min_t(int, strlen(cmdline), COMMAND_LINE_SIZE); + if (!len) + return 0; + + while (len--) { + c = *(char *)cmdline++; + pos++; + + switch (state) { + case st_wordstart: + if (!c) + return 0; + else if (myisspace(c)) + break; + + state = st_wordcmp; + opptr = option; + wstart = pos; + /* fall through */ + + case st_wordcmp: + if (!*opptr) + if (!c || myisspace(c)) + return wstart; + else + state = st_wordskip; + else if (!c) + return 0; + else if (c != *opptr++) + state = st_wordskip; + else if (!len) /* last word and is matching */ + return wstart; + break; + + case st_wordskip: + if (!c) + return 0; + else if (myisspace(c)) + state = st_wordstart; + break; + } + } + + return 0; /* Buffer overrun */ +} diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index db9db446b71..43623739c7c 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -76,7 +76,7 @@ static inline int __flip_bit(u32 msr, u8 bit, bool set) if (m1.q == m.q) return 0; - err = msr_write(msr, &m); + err = msr_write(msr, &m1); if (err) return err; diff --git a/arch/x86/math-emu/errors.c b/arch/x86/math-emu/errors.c index a5449089cd9..9e6545f269e 100644 --- a/arch/x86/math-emu/errors.c +++ b/arch/x86/math-emu/errors.c @@ -302,7 +302,7 @@ static struct { 0x242 in div_Xsig.S */ -asmlinkage void FPU_exception(int n) +asmlinkage __visible void FPU_exception(int n) { int i, int_type; @@ -492,7 +492,7 @@ int real_2op_NaN(FPU_REG const *b, u_char tagb, /* Invalid arith operation on Valid registers */ /* Returns < 0 if the exception is unmasked */ -asmlinkage int arith_invalid(int deststnr) +asmlinkage __visible int arith_invalid(int deststnr) { EXCEPTION(EX_Invalid); @@ -507,7 +507,7 @@ asmlinkage int arith_invalid(int deststnr) } /* Divide a finite number by zero */ -asmlinkage int FPU_divide_by_zero(int deststnr, u_char sign) +asmlinkage __visible int FPU_divide_by_zero(int deststnr, u_char sign) { FPU_REG *dest = &st(deststnr); int tag = TAG_Valid; @@ -539,7 +539,7 @@ int set_precision_flag(int flags) } /* This may be called often, so keep it lean */ -asmlinkage void set_precision_flag_up(void) +asmlinkage __visible void set_precision_flag_up(void) { if (control_word & CW_Precision) partial_status |= (SW_Precision | SW_C1); /* The masked response */ @@ -548,7 +548,7 @@ asmlinkage void set_precision_flag_up(void) } /* This may be called often, so keep it lean */ -asmlinkage void set_precision_flag_down(void) +asmlinkage __visible void set_precision_flag_down(void) { if (control_word & CW_Precision) { /* The masked response */ partial_status &= ~SW_C1; @@ -557,7 +557,7 @@ asmlinkage void set_precision_flag_down(void) EXCEPTION(EX_Precision); } -asmlinkage int denormal_operand(void) +asmlinkage __visible int denormal_operand(void) { if (control_word & CW_Denormal) { /* The masked response */ partial_status |= SW_Denorm_Op; @@ -568,7 +568,7 @@ asmlinkage int denormal_operand(void) } } -asmlinkage int arith_overflow(FPU_REG *dest) +asmlinkage __visible int arith_overflow(FPU_REG *dest) { int tag = TAG_Valid; @@ -596,7 +596,7 @@ asmlinkage int arith_overflow(FPU_REG *dest) } -asmlinkage int arith_underflow(FPU_REG *dest) +asmlinkage __visible int arith_underflow(FPU_REG *dest) { int tag = TAG_Valid; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 799580cabc7..bc7527e109c 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -50,6 +50,21 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, return err; } +static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages, + void *arg) +{ + unsigned long i; + + for (i = 0; i < nr_pages; ++i) + if (pfn_valid(start_pfn + i) && + !PageReserved(pfn_to_page(start_pfn + i))) + return 1; + + WARN_ONCE(1, "ioremap on RAM pfn 0x%lx\n", start_pfn); + + return 0; +} + /* * Remap an arbitrary physical address space into the kernel virtual * address space. Needed when the kernel wants to access high addresses @@ -93,14 +108,11 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, /* * Don't allow anybody to remap normal RAM that we're using.. */ + pfn = phys_addr >> PAGE_SHIFT; last_pfn = last_addr >> PAGE_SHIFT; - for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) { - int is_ram = page_is_ram(pfn); - - if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) - return NULL; - WARN_ON_ONCE(is_ram); - } + if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, + __ioremap_check_ram) == 1) + return NULL; /* * Mappings have to be page-aligned @@ -328,17 +340,6 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr) return; } -static int __initdata early_ioremap_debug; - -static int __init early_ioremap_debug_setup(char *str) -{ - early_ioremap_debug = 1; - - return 0; -} -early_param("early_ioremap_debug", early_ioremap_debug_setup); - -static __initdata int after_paging_init; static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) @@ -362,18 +363,11 @@ bool __init is_early_ioremap_ptep(pte_t *ptep) return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)]; } -static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; - void __init early_ioremap_init(void) { pmd_t *pmd; - int i; - if (early_ioremap_debug) - printk(KERN_INFO "early_ioremap_init()\n"); - - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) - slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); + early_ioremap_setup(); pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); memset(bm_pte, 0, sizeof(bm_pte)); @@ -402,13 +396,8 @@ void __init early_ioremap_init(void) } } -void __init early_ioremap_reset(void) -{ - after_paging_init = 1; -} - -static void __init __early_set_fixmap(enum fixed_addresses idx, - phys_addr_t phys, pgprot_t flags) +void __init __early_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags) { unsigned long addr = __fix_to_virt(idx); pte_t *pte; @@ -425,198 +414,3 @@ static void __init __early_set_fixmap(enum fixed_addresses idx, pte_clear(&init_mm, addr, pte); __flush_tlb_one(addr); } - -static inline void __init early_set_fixmap(enum fixed_addresses idx, - phys_addr_t phys, pgprot_t prot) -{ - if (after_paging_init) - __set_fixmap(idx, phys, prot); - else - __early_set_fixmap(idx, phys, prot); -} - -static inline void __init early_clear_fixmap(enum fixed_addresses idx) -{ - if (after_paging_init) - clear_fixmap(idx); - else - __early_set_fixmap(idx, 0, __pgprot(0)); -} - -static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; -static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; - -void __init fixup_early_ioremap(void) -{ - int i; - - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { - if (prev_map[i]) { - WARN_ON(1); - break; - } - } - - early_ioremap_init(); -} - -static int __init check_early_ioremap_leak(void) -{ - int count = 0; - int i; - - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) - if (prev_map[i]) - count++; - - if (!count) - return 0; - WARN(1, KERN_WARNING - "Debug warning: early ioremap leak of %d areas detected.\n", - count); - printk(KERN_WARNING - "please boot with early_ioremap_debug and report the dmesg.\n"); - - return 1; -} -late_initcall(check_early_ioremap_leak); - -static void __init __iomem * -__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) -{ - unsigned long offset; - resource_size_t last_addr; - unsigned int nrpages; - enum fixed_addresses idx; - int i, slot; - - WARN_ON(system_state != SYSTEM_BOOTING); - - slot = -1; - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { - if (!prev_map[i]) { - slot = i; - break; - } - } - - if (slot < 0) { - printk(KERN_INFO "%s(%08llx, %08lx) not found slot\n", - __func__, (u64)phys_addr, size); - WARN_ON(1); - return NULL; - } - - if (early_ioremap_debug) { - printk(KERN_INFO "%s(%08llx, %08lx) [%d] => ", - __func__, (u64)phys_addr, size, slot); - dump_stack(); - } - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) { - WARN_ON(1); - return NULL; - } - - prev_size[slot] = size; - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr + 1) - phys_addr; - - /* - * Mappings have to fit in the FIX_BTMAP area. - */ - nrpages = size >> PAGE_SHIFT; - if (nrpages > NR_FIX_BTMAPS) { - WARN_ON(1); - return NULL; - } - - /* - * Ok, go for it.. - */ - idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; - while (nrpages > 0) { - early_set_fixmap(idx, phys_addr, prot); - phys_addr += PAGE_SIZE; - --idx; - --nrpages; - } - if (early_ioremap_debug) - printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]); - - prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); - return prev_map[slot]; -} - -/* Remap an IO device */ -void __init __iomem * -early_ioremap(resource_size_t phys_addr, unsigned long size) -{ - return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO); -} - -/* Remap memory */ -void __init __iomem * -early_memremap(resource_size_t phys_addr, unsigned long size) -{ - return __early_ioremap(phys_addr, size, PAGE_KERNEL); -} - -void __init early_iounmap(void __iomem *addr, unsigned long size) -{ - unsigned long virt_addr; - unsigned long offset; - unsigned int nrpages; - enum fixed_addresses idx; - int i, slot; - - slot = -1; - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { - if (prev_map[i] == addr) { - slot = i; - break; - } - } - - if (slot < 0) { - printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n", - addr, size); - WARN_ON(1); - return; - } - - if (prev_size[slot] != size) { - printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", - addr, size, slot, prev_size[slot]); - WARN_ON(1); - return; - } - - if (early_ioremap_debug) { - printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, - size, slot); - dump_stack(); - } - - virt_addr = (unsigned long)addr; - if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) { - WARN_ON(1); - return; - } - offset = virt_addr & ~PAGE_MASK; - nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; - - idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; - while (nrpages > 0) { - early_clear_fixmap(idx); - --idx; - --nrpages; - } - prev_map[slot] = NULL; -} diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index d87dd6d042d..dd89a13f105 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -78,10 +78,16 @@ early_initcall(kmemcheck_init); */ static int __init param_kmemcheck(char *str) { + int val; + int ret; + if (!str) return -EINVAL; - sscanf(str, "%d", &kmemcheck_enabled); + ret = kstrtoint(str, 0, &val); + if (ret) + return ret; + kmemcheck_enabled = val; return 0; } diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 27aa0455fab..1d045f9c390 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -687,10 +687,6 @@ static int __init dummy_numa_init(void) void __init x86_numa_init(void) { if (!numa_off) { -#ifdef CONFIG_X86_NUMAQ - if (!numa_init(numaq_numa_init)) - return; -#endif #ifdef CONFIG_ACPI_NUMA if (!numa_init(x86_acpi_numa_init)) return; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index c96314abd14..0004ac72dbd 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -399,13 +399,20 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - int young; - - young = ptep_test_and_clear_young(vma, address, ptep); - if (young) - flush_tlb_page(vma, address); - - return young; + /* + * On x86 CPUs, clearing the accessed bit without a TLB flush + * doesn't cause data corruption. [ It could cause incorrect + * page aging and the (mistaken) reclaim of hot pages, but the + * chance of that should be relatively low. ] + * + * So as a performance optimization don't flush the TLB when + * clearing the accessed bit, it will eventually be flushed by + * a context switch or a VM operation anyway. [ In the rare + * event of it not getting flushed for a long time the delay + * shouldn't really matter because there's no real memory + * pressure for swapout to react to. ] + */ + return ptep_test_and_clear_young(vma, address, ptep); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index a69bcb8c762..4dd8cf65257 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -127,7 +127,7 @@ static int __init parse_reservetop(char *arg) address = memparse(arg, &arg); reserve_top_address(address); - fixup_early_ioremap(); + early_ioremap_init(); return 0; } early_param("reservetop", parse_reservetop); diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 1953e9c9391..66338a60aa6 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -52,12 +52,18 @@ void __init acpi_numa_slit_init(struct acpi_table_slit *slit) int i, j; for (i = 0; i < slit->locality_count; i++) { - if (pxm_to_node(i) == NUMA_NO_NODE) + const int from_node = pxm_to_node(i); + + if (from_node == NUMA_NO_NODE) continue; + for (j = 0; j < slit->locality_count; j++) { - if (pxm_to_node(j) == NUMA_NO_NODE) + const int to_node = pxm_to_node(j); + + if (to_node == NUMA_NO_NODE) continue; - numa_set_distance(pxm_to_node(i), pxm_to_node(j), + + numa_set_distance(from_node, to_node, slit->entry[slit->locality_count * i + j]); } } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 4ed75dd81d0..6d5663a599a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -171,7 +171,7 @@ static struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen, memset(header, 0xcc, sz); /* fill whole space with int3 instructions */ header->pages = sz / PAGE_SIZE; - hole = sz - (proglen + sizeof(*header)); + hole = min(sz - (proglen + sizeof(*header)), PAGE_SIZE - sizeof(*header)); /* insert a random number of int3 instructions before BPF code */ *image_ptr = &header->image[prandom_u32() % hole]; @@ -553,13 +553,13 @@ void bpf_jit_compile(struct sk_filter *fp) } break; case BPF_S_ANC_RXHASH: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, rxhash) != 4); - if (is_imm8(offsetof(struct sk_buff, rxhash))) { + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); + if (is_imm8(offsetof(struct sk_buff, hash))) { /* mov off8(%rdi),%eax */ - EMIT3(0x8b, 0x47, offsetof(struct sk_buff, rxhash)); + EMIT3(0x8b, 0x47, offsetof(struct sk_buff, hash)); } else { EMIT2(0x8b, 0x87); - EMIT(offsetof(struct sk_buff, rxhash), 4); + EMIT(offsetof(struct sk_buff, hash), 4); } break; case BPF_S_ANC_QUEUE: @@ -772,6 +772,7 @@ cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i]; bpf_flush_icache(header, image + proglen); set_memory_ro((unsigned long)header, header->pages); fp->bpf_func = (void *)image; + fp->jited = 1; } out: kfree(addrs); @@ -791,7 +792,7 @@ static void bpf_jit_free_deferred(struct work_struct *work) void bpf_jit_free(struct sk_filter *fp) { - if (fp->bpf_func != sk_run_filter) { + if (fp->jited) { INIT_WORK(&fp->work, bpf_jit_free_deferred); schedule_work(&fp->work); } else { diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 6890d8498e0..379e8bd0dee 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -494,14 +494,19 @@ static int nmi_setup(void) if (err) goto fail; + cpu_notifier_register_begin(); + + /* Use get/put_online_cpus() to protect 'nmi_enabled' */ get_online_cpus(); - register_cpu_notifier(&oprofile_cpu_nb); nmi_enabled = 1; /* make nmi_enabled visible to the nmi handler: */ smp_mb(); on_each_cpu(nmi_cpu_setup, NULL, 1); + __register_cpu_notifier(&oprofile_cpu_nb); put_online_cpus(); + cpu_notifier_register_done(); + return 0; fail: free_msrs(); @@ -512,12 +517,18 @@ static void nmi_shutdown(void) { struct op_msrs *msrs; + cpu_notifier_register_begin(); + + /* Use get/put_online_cpus() to protect 'nmi_enabled' & 'ctr_running' */ get_online_cpus(); - unregister_cpu_notifier(&oprofile_cpu_nb); on_each_cpu(nmi_cpu_shutdown, NULL, 1); nmi_enabled = 0; ctr_running = 0; + __unregister_cpu_notifier(&oprofile_cpu_nb); put_online_cpus(); + + cpu_notifier_register_done(); + /* make variables visible to the nmi handler: */ smp_mb(); unregister_nmi_handler(NMI_LOCAL, "oprofile"); diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index e063eed0f91..5c6fc3577a4 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -13,9 +13,6 @@ obj-y += legacy.o irq.o obj-$(CONFIG_STA2X11) += sta2x11-fixup.o -obj-$(CONFIG_X86_VISWS) += visws.o - -obj-$(CONFIG_X86_NUMAQ) += numaq_32.o obj-$(CONFIG_X86_NUMACHIP) += numachip.o obj-$(CONFIG_X86_INTEL_MID) += intel_mid_pci.o diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 4f25ec07755..5075371ab59 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -218,9 +218,8 @@ static void teardown_mcfg_map(struct pci_root_info *info) } #endif -static acpi_status -resource_to_addr(struct acpi_resource *resource, - struct acpi_resource_address64 *addr) +static acpi_status resource_to_addr(struct acpi_resource *resource, + struct acpi_resource_address64 *addr) { acpi_status status; struct acpi_resource_memory24 *memory24; @@ -265,8 +264,7 @@ resource_to_addr(struct acpi_resource *resource, return AE_ERROR; } -static acpi_status -count_resource(struct acpi_resource *acpi_res, void *data) +static acpi_status count_resource(struct acpi_resource *acpi_res, void *data) { struct pci_root_info *info = data; struct acpi_resource_address64 addr; @@ -278,8 +276,7 @@ count_resource(struct acpi_resource *acpi_res, void *data) return AE_OK; } -static acpi_status -setup_resource(struct acpi_resource *acpi_res, void *data) +static acpi_status setup_resource(struct acpi_resource *acpi_res, void *data) { struct pci_root_info *info = data; struct resource *res; @@ -435,9 +432,9 @@ static void release_pci_root_info(struct pci_host_bridge *bridge) __release_pci_root_info(info); } -static void -probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device, - int busnum, int domain) +static void probe_pci_root_info(struct pci_root_info *info, + struct acpi_device *device, + int busnum, int domain) { size_t size; @@ -473,16 +470,13 @@ probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device, struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) { struct acpi_device *device = root->device; - struct pci_root_info *info = NULL; + struct pci_root_info *info; int domain = root->segment; int busnum = root->secondary.start; LIST_HEAD(resources); - struct pci_bus *bus = NULL; + struct pci_bus *bus; struct pci_sysdata *sd; int node; -#ifdef CONFIG_ACPI_NUMA - int pxm; -#endif if (pci_ignore_seg) domain = 0; @@ -494,19 +488,16 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) return NULL; } - node = -1; -#ifdef CONFIG_ACPI_NUMA - pxm = acpi_get_pxm(device->handle); - if (pxm >= 0) - node = pxm_to_node(pxm); - if (node != -1) - set_mp_bus_to_node(busnum, node); - else -#endif - node = get_mp_bus_to_node(busnum); + node = acpi_get_node(device->handle); + if (node == NUMA_NO_NODE) { + node = x86_pci_root_bus_node(busnum); + if (node != 0 && node != NUMA_NO_NODE) + dev_info(&device->dev, FW_BUG "no _PXM; falling back to node %d from hardware (may be inconsistent with ACPI node numbers)\n", + node); + } - if (node != -1 && !node_online(node)) - node = -1; + if (node != NUMA_NO_NODE && !node_online(node)) + node = NUMA_NO_NODE; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) { @@ -519,15 +510,12 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) sd->domain = domain; sd->node = node; sd->companion = device; - /* - * Maybe the desired pci bus has been already scanned. In such case - * it is unnecessary to scan the pci bus with the given domain,busnum. - */ + bus = pci_find_bus(domain, busnum); if (bus) { /* - * If the desired bus exits, the content of bus->sysdata will - * be replaced by sd. + * If the desired bus has been scanned already, replace + * its bus->sysdata. */ memcpy(bus->sysdata, sd, sizeof(*sd)); kfree(info); @@ -572,15 +560,8 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) pcie_bus_configure_settings(child); } - if (bus && node != -1) { -#ifdef CONFIG_ACPI_NUMA - if (pxm >= 0) - dev_printk(KERN_DEBUG, &bus->dev, - "on NUMA node %d (pxm %d)\n", node, pxm); -#else + if (bus && node != NUMA_NO_NODE) dev_printk(KERN_DEBUG, &bus->dev, "on NUMA node %d\n", node); -#endif - } return bus; } diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index a48be98e9de..c20d2cc7ef6 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -11,27 +11,33 @@ #include "bus_numa.h" -/* - * This discovers the pcibus <-> node mapping on AMD K8. - * also get peer root bus resource for io,mmio - */ +#define AMD_NB_F0_NODE_ID 0x60 +#define AMD_NB_F0_UNIT_ID 0x64 +#define AMD_NB_F1_CONFIG_MAP_REG 0xe0 + +#define RANGE_NUM 16 +#define AMD_NB_F1_CONFIG_MAP_RANGES 4 -struct pci_hostbridge_probe { +struct amd_hostbridge { u32 bus; u32 slot; - u32 vendor; u32 device; }; -static struct pci_hostbridge_probe pci_probes[] __initdata = { - { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1100 }, - { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, - { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, - { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 }, +/* + * IMPORTANT NOTE: + * hb_probes[] and early_root_info_init() is in maintenance mode. + * It only supports K8, Fam10h, Fam11h, and Fam15h_00h-0fh . + * Future processor will rely on information in ACPI. + */ +static struct amd_hostbridge hb_probes[] __initdata = { + { 0, 0x18, 0x1100 }, /* K8 */ + { 0, 0x18, 0x1200 }, /* Family10h */ + { 0xff, 0, 0x1200 }, /* Family10h */ + { 0, 0x18, 0x1300 }, /* Family11h */ + { 0, 0x18, 0x1600 }, /* Family15h */ }; -#define RANGE_NUM 16 - static struct pci_root_info __init *find_pci_root_info(int node, int link) { struct pci_root_info *info; @@ -44,22 +50,13 @@ static struct pci_root_info __init *find_pci_root_info(int node, int link) return NULL; } -static void __init set_mp_bus_range_to_node(int min_bus, int max_bus, int node) -{ -#ifdef CONFIG_NUMA - int j; - - for (j = min_bus; j <= max_bus; j++) - set_mp_bus_to_node(j, node); -#endif -} /** - * early_fill_mp_bus_to_node() + * early_root_info_init() * called before pcibios_scan_root and pci_scan_bus - * fills the mp_bus_to_cpumask array based according to the LDT Bus Number - * Registers found in the K8 northbridge + * fills the mp_bus_to_cpumask array based according + * to the LDT Bus Number Registers found in the northbridge. */ -static int __init early_fill_mp_bus_info(void) +static int __init early_root_info_init(void) { int i; unsigned bus; @@ -84,19 +81,21 @@ static int __init early_fill_mp_bus_info(void) return -1; found = false; - for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { + for (i = 0; i < ARRAY_SIZE(hb_probes); i++) { u32 id; u16 device; u16 vendor; - bus = pci_probes[i].bus; - slot = pci_probes[i].slot; + bus = hb_probes[i].bus; + slot = hb_probes[i].slot; id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID); - vendor = id & 0xffff; device = (id>>16) & 0xffff; - if (pci_probes[i].vendor == vendor && - pci_probes[i].device == device) { + + if (vendor != PCI_VENDOR_ID_AMD) + continue; + + if (hb_probes[i].device == device) { found = true; break; } @@ -105,10 +104,16 @@ static int __init early_fill_mp_bus_info(void) if (!found) return 0; - for (i = 0; i < 4; i++) { + /* + * We should learn topology and routing information from _PXM and + * _CRS methods in the ACPI namespace. We extract node numbers + * here to work around BIOSes that don't supply _PXM. + */ + for (i = 0; i < AMD_NB_F1_CONFIG_MAP_RANGES; i++) { int min_bus; int max_bus; - reg = read_pci_config(bus, slot, 1, 0xe0 + (i << 2)); + reg = read_pci_config(bus, slot, 1, + AMD_NB_F1_CONFIG_MAP_REG + (i << 2)); /* Check if that register is enabled for bus range */ if ((reg & 7) != 3) @@ -117,16 +122,26 @@ static int __init early_fill_mp_bus_info(void) min_bus = (reg >> 16) & 0xff; max_bus = (reg >> 24) & 0xff; node = (reg >> 4) & 0x07; - set_mp_bus_range_to_node(min_bus, max_bus, node); link = (reg >> 8) & 0x03; info = alloc_pci_root_info(min_bus, max_bus, node, link); } + /* + * The following code extracts routing information for use on old + * systems where Linux doesn't automatically use host bridge _CRS + * methods (or when the user specifies "pci=nocrs"). + * + * We only do this through Fam11h, because _CRS should be enough on + * newer systems. + */ + if (boot_cpu_data.x86 > 0x11) + return 0; + /* get the default node and link for left over res */ - reg = read_pci_config(bus, slot, 0, 0x60); + reg = read_pci_config(bus, slot, 0, AMD_NB_F0_NODE_ID); def_node = (reg >> 8) & 0x07; - reg = read_pci_config(bus, slot, 0, 0x64); + reg = read_pci_config(bus, slot, 0, AMD_NB_F0_UNIT_ID); def_link = (reg >> 8) & 0x03; memset(range, 0, sizeof(range)); @@ -373,17 +388,20 @@ static int __init pci_io_ecs_init(void) int cpu; /* assume all cpus from fam10h have IO ECS */ - if (boot_cpu_data.x86 < 0x10) + if (boot_cpu_data.x86 < 0x10) return 0; /* Try the PCI method first. */ if (early_pci_allowed()) pci_enable_pci_io_ecs(); - register_cpu_notifier(&amd_cpu_notifier); + cpu_notifier_register_begin(); for_each_online_cpu(cpu) amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE, (void *)(long)cpu); + __register_cpu_notifier(&amd_cpu_notifier); + cpu_notifier_register_done(); + pci_probe |= PCI_HAS_IO_ECS; return 0; @@ -394,7 +412,7 @@ static int __init amd_postcore_init(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return 0; - early_fill_mp_bus_info(); + early_root_info_init(); pci_io_ecs_init(); return 0; diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c index 614392ced7d..bb461cfd01a 100644 --- a/arch/x86/pci/broadcom_bus.c +++ b/arch/x86/pci/broadcom_bus.c @@ -60,8 +60,8 @@ static void __init cnb20le_res(u8 bus, u8 slot, u8 func) word1 = read_pci_config_16(bus, slot, func, 0xc4); word2 = read_pci_config_16(bus, slot, func, 0xc6); if (word1 != word2) { - res.start = (word1 << 16) | 0x0000; - res.end = (word2 << 16) | 0xffff; + res.start = ((resource_size_t) word1 << 16) | 0x0000; + res.end = ((resource_size_t) word2 << 16) | 0xffff; res.flags = IORESOURCE_MEM | IORESOURCE_PREFETCH; update_res(info, res.start, res.end, res.flags, 0); } diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c index c2735feb250..f3a2cfc1412 100644 --- a/arch/x86/pci/bus_numa.c +++ b/arch/x86/pci/bus_numa.c @@ -10,9 +10,6 @@ static struct pci_root_info *x86_find_pci_root_info(int bus) { struct pci_root_info *info; - if (list_empty(&pci_root_infos)) - return NULL; - list_for_each_entry(info, &pci_root_infos, list) if (info->busn.start == bus) return info; @@ -20,6 +17,16 @@ static struct pci_root_info *x86_find_pci_root_info(int bus) return NULL; } +int x86_pci_root_bus_node(int bus) +{ + struct pci_root_info *info = x86_find_pci_root_info(bus); + + if (!info) + return NUMA_NO_NODE; + + return info->node; +} + void x86_pci_root_bus_resources(int bus, struct list_head *resources) { struct pci_root_info *info = x86_find_pci_root_info(bus); diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 981c2dbd72c..059a76c2973 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -456,19 +456,25 @@ void __init dmi_check_pciprobe(void) dmi_check_system(pciprobe_dmi_table); } -struct pci_bus *pcibios_scan_root(int busnum) +void pcibios_scan_root(int busnum) { - struct pci_bus *bus = NULL; + struct pci_bus *bus; + struct pci_sysdata *sd; + LIST_HEAD(resources); - while ((bus = pci_find_next_bus(bus)) != NULL) { - if (bus->number == busnum) { - /* Already scanned */ - return bus; - } + sd = kzalloc(sizeof(*sd), GFP_KERNEL); + if (!sd) { + printk(KERN_ERR "PCI: OOM, skipping PCI bus %02x\n", busnum); + return; + } + sd->node = x86_pci_root_bus_node(busnum); + x86_pci_root_bus_resources(busnum, &resources); + printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum); + bus = pci_scan_root_bus(NULL, busnum, &pci_root_ops, sd, &resources); + if (!bus) { + pci_free_resource_list(&resources); + kfree(sd); } - - return pci_scan_bus_on_node(busnum, &pci_root_ops, - get_mp_bus_to_node(busnum)); } void __init pcibios_set_cache_line_size(void) @@ -561,7 +567,6 @@ char * __init pcibios_setup(char *str) pci_probe |= PCI_PROBE_NOEARLY; return NULL; } -#ifndef CONFIG_X86_VISWS else if (!strcmp(str, "usepirqmask")) { pci_probe |= PCI_USE_PIRQ_MASK; return NULL; @@ -571,9 +576,7 @@ char * __init pcibios_setup(char *str) } else if (!strncmp(str, "lastbus=", 8)) { pcibios_last_bus = simple_strtol(str+8, NULL, 0); return NULL; - } -#endif - else if (!strcmp(str, "rom")) { + } else if (!strcmp(str, "rom")) { pci_probe |= PCI_ASSIGN_ROMS; return NULL; } else if (!strcmp(str, "norom")) { @@ -677,105 +680,3 @@ int pci_ext_cfg_avail(void) else return 0; } - -struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node) -{ - LIST_HEAD(resources); - struct pci_bus *bus = NULL; - struct pci_sysdata *sd; - - /* - * Allocate per-root-bus (not per bus) arch-specific data. - * TODO: leak; this memory is never freed. - * It's arguable whether it's worth the trouble to care. - */ - sd = kzalloc(sizeof(*sd), GFP_KERNEL); - if (!sd) { - printk(KERN_ERR "PCI: OOM, skipping PCI bus %02x\n", busno); - return NULL; - } - sd->node = node; - x86_pci_root_bus_resources(busno, &resources); - printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busno); - bus = pci_scan_root_bus(NULL, busno, ops, sd, &resources); - if (!bus) { - pci_free_resource_list(&resources); - kfree(sd); - } - - return bus; -} - -struct pci_bus *pci_scan_bus_with_sysdata(int busno) -{ - return pci_scan_bus_on_node(busno, &pci_root_ops, -1); -} - -/* - * NUMA info for PCI busses - * - * Early arch code is responsible for filling in reasonable values here. - * A node id of "-1" means "use current node". In other words, if a bus - * has a -1 node id, it's not tightly coupled to any particular chunk - * of memory (as is the case on some Nehalem systems). - */ -#ifdef CONFIG_NUMA - -#define BUS_NR 256 - -#ifdef CONFIG_X86_64 - -static int mp_bus_to_node[BUS_NR] = { - [0 ... BUS_NR - 1] = -1 -}; - -void set_mp_bus_to_node(int busnum, int node) -{ - if (busnum >= 0 && busnum < BUS_NR) - mp_bus_to_node[busnum] = node; -} - -int get_mp_bus_to_node(int busnum) -{ - int node = -1; - - if (busnum < 0 || busnum > (BUS_NR - 1)) - return node; - - node = mp_bus_to_node[busnum]; - - /* - * let numa_node_id to decide it later in dma_alloc_pages - * if there is no ram on that node - */ - if (node != -1 && !node_online(node)) - node = -1; - - return node; -} - -#else /* CONFIG_X86_32 */ - -static int mp_bus_to_node[BUS_NR] = { - [0 ... BUS_NR - 1] = -1 -}; - -void set_mp_bus_to_node(int busnum, int node) -{ - if (busnum >= 0 && busnum < BUS_NR) - mp_bus_to_node[busnum] = (unsigned char) node; -} - -int get_mp_bus_to_node(int busnum) -{ - int node; - - if (busnum < 0 || busnum > (BUS_NR - 1)) - return 0; - node = mp_bus_to_node[busnum]; - return node; -} - -#endif /* CONFIG_X86_32 */ - -#endif /* CONFIG_NUMA */ diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index bca9e85daaa..b5e60268d93 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -6,6 +6,7 @@ #include <linux/dmi.h> #include <linux/pci.h> #include <linux/vgaarb.h> +#include <asm/hpet.h> #include <asm/pci_x86.h> static void pci_fixup_i450nx(struct pci_dev *d) @@ -25,9 +26,9 @@ static void pci_fixup_i450nx(struct pci_dev *d) dev_dbg(&d->dev, "i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb); if (busno) - pci_scan_bus_with_sysdata(busno); /* Bus A */ + pcibios_scan_root(busno); /* Bus A */ if (suba < subb) - pci_scan_bus_with_sysdata(suba+1); /* Bus B */ + pcibios_scan_root(suba+1); /* Bus B */ } pcibios_last_bus = -1; } @@ -42,7 +43,7 @@ static void pci_fixup_i450gx(struct pci_dev *d) u8 busno; pci_read_config_byte(d, 0x4a, &busno); dev_info(&d->dev, "i440KX/GX host bridge; secondary bus %02x\n", busno); - pci_scan_bus_with_sysdata(busno); + pcibios_scan_root(busno); pcibios_last_bus = -1; } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82454GX, pci_fixup_i450gx); @@ -313,9 +314,10 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC1, pcie_r * IORESOURCE_ROM_SHADOW is used to associate the boot video * card with this copy. On laptops this copy has to be used since * the main ROM may be compressed or combined with another image. - * See pci_map_rom() for use of this flag. IORESOURCE_ROM_SHADOW - * is marked here since the boot video device will be the only enabled - * video device at this point. + * See pci_map_rom() for use of this flag. Before marking the device + * with IORESOURCE_ROM_SHADOW check if a vga_default_device is already set + * by either arch cde or vga-arbitration, if so only apply the fixup to this + * already determined primary video card. */ static void pci_fixup_video(struct pci_dev *pdev) @@ -336,9 +338,7 @@ static void pci_fixup_video(struct pci_dev *pdev) * type BRIDGE, or CARDBUS. Host to PCI controllers use * PCI header type NORMAL. */ - if (bridge - && ((bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE) - || (bridge->hdr_type == PCI_HEADER_TYPE_CARDBUS))) { + if (bridge && (pci_is_bridge(bridge))) { pci_read_config_word(bridge, PCI_BRIDGE_CONTROL, &config); if (!(config & PCI_BRIDGE_CTL_VGA)) @@ -346,12 +346,13 @@ static void pci_fixup_video(struct pci_dev *pdev) } bus = bus->parent; } - pci_read_config_word(pdev, PCI_COMMAND, &config); - if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) { - pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW; - dev_printk(KERN_DEBUG, &pdev->dev, "Boot video device\n"); - if (!vga_default_device()) + if (!vga_default_device() || pdev == vga_default_device()) { + pci_read_config_word(pdev, PCI_COMMAND, &config); + if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) { + pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW; + dev_printk(KERN_DEBUG, &pdev->dev, "Boot video device\n"); vga_set_default_device(pdev); + } } } DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID, @@ -524,6 +525,19 @@ static void sb600_disable_hpet_bar(struct pci_dev *dev) } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar); +#ifdef CONFIG_HPET_TIMER +static void sb600_hpet_quirk(struct pci_dev *dev) +{ + struct resource *r = &dev->resource[1]; + + if (r->flags & IORESOURCE_MEM && r->start == hpet_address) { + r->flags |= IORESOURCE_PCI_FIXED; + dev_info(&dev->dev, "reg 0x14 contains HPET; making it immovable\n"); + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, 0x4385, sb600_hpet_quirk); +#endif + /* * Twinhead H12Y needs us to block out a region otherwise we map devices * there and any access kills the box. diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index db6b1ab4325..a19ed92e74e 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -271,11 +271,16 @@ static void pcibios_allocate_dev_resources(struct pci_dev *dev, int pass) "BAR %d: reserving %pr (d=%d, p=%d)\n", idx, r, disabled, pass); if (pci_claim_resource(dev, idx) < 0) { - /* We'll assign a new address later */ - pcibios_save_fw_addr(dev, - idx, r->start); - r->end -= r->start; - r->start = 0; + if (r->flags & IORESOURCE_PCI_FIXED) { + dev_info(&dev->dev, "BAR %d %pR is immovable\n", + idx, r); + } else { + /* We'll assign a new address later */ + pcibios_save_fw_addr(dev, + idx, r->start); + r->end -= r->start; + r->start = 0; + } } } } @@ -356,6 +361,12 @@ static int __init pcibios_assign_resources(void) return 0; } +/** + * called in fs_initcall (one below subsys_initcall), + * give a chance for motherboard reserve resources + */ +fs_initcall(pcibios_assign_resources); + void pcibios_resource_survey_bus(struct pci_bus *bus) { dev_printk(KERN_DEBUG, &bus->dev, "Allocating resources\n"); @@ -392,12 +403,6 @@ void __init pcibios_resource_survey(void) ioapic_insert_resources(); } -/** - * called in fs_initcall (one below subsys_initcall), - * give a chance for motherboard reserve resources - */ -fs_initcall(pcibios_assign_resources); - static const struct vm_operations_struct pci_mmap_ops = { .access = generic_access_phys, }; diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 372e9b8989b..84112f55dd7 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -136,13 +136,9 @@ static void __init pirq_peer_trick(void) busmap[e->bus] = 1; } for (i = 1; i < 256; i++) { - int node; if (!busmap[i] || pci_find_bus(0, i)) continue; - node = get_mp_bus_to_node(i); - if (pci_scan_bus_on_node(i, &pci_root_ops, node)) - printk(KERN_INFO "PCI: Discovered primary peer " - "bus %02x [IRQ]\n", i); + pcibios_scan_root(i); } pcibios_last_bus = -1; } diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index 4db96fb1c23..5b662c0faf8 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -37,19 +37,17 @@ int __init pci_legacy_init(void) void pcibios_scan_specific_bus(int busn) { int devfn; - long node; u32 l; if (pci_find_bus(0, busn)) return; - node = get_mp_bus_to_node(busn); for (devfn = 0; devfn < 256; devfn += 8) { if (!raw_pci_read(0, busn, devfn, PCI_VENDOR_ID, 2, &l) && l != 0x0000 && l != 0xffff) { DBG("Found device at %02x:%02x [%04x]\n", busn, devfn, l); printk(KERN_INFO "PCI: Discovered peer bus %02x\n", busn); - pci_scan_bus_on_node(busn, &pci_root_ops, node); + pcibios_scan_root(busn); return; } } diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c deleted file mode 100644 index 72c229f9ebc..00000000000 --- a/arch/x86/pci/numaq_32.c +++ /dev/null @@ -1,165 +0,0 @@ -/* - * numaq_32.c - Low-level PCI access for NUMA-Q machines - */ - -#include <linux/pci.h> -#include <linux/init.h> -#include <linux/nodemask.h> -#include <asm/apic.h> -#include <asm/mpspec.h> -#include <asm/pci_x86.h> -#include <asm/numaq.h> - -#define BUS2QUAD(global) (mp_bus_id_to_node[global]) - -#define BUS2LOCAL(global) (mp_bus_id_to_local[global]) - -#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local]) - -#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \ - (0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3)) - -static void write_cf8(unsigned bus, unsigned devfn, unsigned reg) -{ - unsigned val = PCI_CONF1_MQ_ADDRESS(bus, devfn, reg); - if (xquad_portio) - writel(val, XQUAD_PORT_ADDR(0xcf8, BUS2QUAD(bus))); - else - outl(val, 0xCF8); -} - -static int pci_conf1_mq_read(unsigned int seg, unsigned int bus, - unsigned int devfn, int reg, int len, u32 *value) -{ - unsigned long flags; - void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus)); - - WARN_ON(seg); - if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) - return -EINVAL; - - raw_spin_lock_irqsave(&pci_config_lock, flags); - - write_cf8(bus, devfn, reg); - - switch (len) { - case 1: - if (xquad_portio) - *value = readb(adr + (reg & 3)); - else - *value = inb(0xCFC + (reg & 3)); - break; - case 2: - if (xquad_portio) - *value = readw(adr + (reg & 2)); - else - *value = inw(0xCFC + (reg & 2)); - break; - case 4: - if (xquad_portio) - *value = readl(adr); - else - *value = inl(0xCFC); - break; - } - - raw_spin_unlock_irqrestore(&pci_config_lock, flags); - - return 0; -} - -static int pci_conf1_mq_write(unsigned int seg, unsigned int bus, - unsigned int devfn, int reg, int len, u32 value) -{ - unsigned long flags; - void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus)); - - WARN_ON(seg); - if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) - return -EINVAL; - - raw_spin_lock_irqsave(&pci_config_lock, flags); - - write_cf8(bus, devfn, reg); - - switch (len) { - case 1: - if (xquad_portio) - writeb(value, adr + (reg & 3)); - else - outb((u8)value, 0xCFC + (reg & 3)); - break; - case 2: - if (xquad_portio) - writew(value, adr + (reg & 2)); - else - outw((u16)value, 0xCFC + (reg & 2)); - break; - case 4: - if (xquad_portio) - writel(value, adr + reg); - else - outl((u32)value, 0xCFC); - break; - } - - raw_spin_unlock_irqrestore(&pci_config_lock, flags); - - return 0; -} - -#undef PCI_CONF1_MQ_ADDRESS - -static const struct pci_raw_ops pci_direct_conf1_mq = { - .read = pci_conf1_mq_read, - .write = pci_conf1_mq_write -}; - - -static void pci_fixup_i450nx(struct pci_dev *d) -{ - /* - * i450NX -- Find and scan all secondary buses on all PXB's. - */ - int pxb, reg; - u8 busno, suba, subb; - int quad = BUS2QUAD(d->bus->number); - - dev_info(&d->dev, "searching for i450NX host bridges\n"); - reg = 0xd0; - for(pxb=0; pxb<2; pxb++) { - pci_read_config_byte(d, reg++, &busno); - pci_read_config_byte(d, reg++, &suba); - pci_read_config_byte(d, reg++, &subb); - dev_dbg(&d->dev, "i450NX PXB %d: %02x/%02x/%02x\n", - pxb, busno, suba, subb); - if (busno) { - /* Bus A */ - pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, busno)); - } - if (suba < subb) { - /* Bus B */ - pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, suba+1)); - } - } - pcibios_last_bus = -1; -} -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx); - -int __init pci_numaq_init(void) -{ - int quad; - - raw_pci_ops = &pci_direct_conf1_mq; - - pcibios_scan_root(0); - if (num_online_nodes() > 1) - for_each_online_node(quad) { - if (quad == 0) - continue; - printk("Scanning PCI bus %d for quad %d\n", - QUADLOCAL2BUS(quad,0), quad); - pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, 0)); - } - return 0; -} diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c deleted file mode 100644 index 3e6d2a6db86..00000000000 --- a/arch/x86/pci/visws.c +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Low-Level PCI Support for SGI Visual Workstation - * - * (c) 1999--2000 Martin Mares <mj@ucw.cz> - */ - -#include <linux/kernel.h> -#include <linux/pci.h> -#include <linux/init.h> - -#include <asm/setup.h> -#include <asm/pci_x86.h> -#include <asm/visws/cobalt.h> -#include <asm/visws/lithium.h> - -static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; } -static void pci_visws_disable_irq(struct pci_dev *dev) { } - -/* int (*pcibios_enable_irq)(struct pci_dev *dev) = &pci_visws_enable_irq; */ -/* void (*pcibios_disable_irq)(struct pci_dev *dev) = &pci_visws_disable_irq; */ - -/* void __init pcibios_penalize_isa_irq(int irq, int active) {} */ - - -unsigned int pci_bus0, pci_bus1; - -static int __init visws_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) -{ - int irq, bus = dev->bus->number; - - pin--; - - /* Nothing useful at PIIX4 pin 1 */ - if (bus == pci_bus0 && slot == 4 && pin == 0) - return -1; - - /* PIIX4 USB is on Bus 0, Slot 4, Line 3 */ - if (bus == pci_bus0 && slot == 4 && pin == 3) { - irq = CO_IRQ(CO_APIC_PIIX4_USB); - goto out; - } - - /* First pin spread down 1 APIC entry per slot */ - if (pin == 0) { - irq = CO_IRQ((bus == pci_bus0 ? CO_APIC_PCIB_BASE0 : - CO_APIC_PCIA_BASE0) + slot); - goto out; - } - - /* lines 1,2,3 from any slot is shared in this twirly pattern */ - if (bus == pci_bus1) { - /* lines 1-3 from devices 0 1 rotate over 2 apic entries */ - irq = CO_IRQ(CO_APIC_PCIA_BASE123 + ((slot + (pin - 1)) % 2)); - } else { /* bus == pci_bus0 */ - /* lines 1-3 from devices 0-3 rotate over 3 apic entries */ - if (slot == 0) - slot = 3; /* same pattern */ - irq = CO_IRQ(CO_APIC_PCIA_BASE123 + ((3 - slot) + (pin - 1) % 3)); - } -out: - printk(KERN_DEBUG "PCI: Bus %d Slot %d Line %d -> IRQ %d\n", bus, slot, pin, irq); - return irq; -} - -int __init pci_visws_init(void) -{ - pcibios_enable_irq = &pci_visws_enable_irq; - pcibios_disable_irq = &pci_visws_disable_irq; - - /* The VISWS supports configuration access type 1 only */ - pci_probe = (pci_probe | PCI_PROBE_CONF1) & - ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2); - - pci_bus0 = li_pcib_read16(LI_PCI_BUSNUM) & 0xff; - pci_bus1 = li_pcia_read16(LI_PCI_BUSNUM) & 0xff; - - printk(KERN_INFO "PCI: Lithium bridge A bus: %u, " - "bridge B (PIIX4) bus: %u\n", pci_bus1, pci_bus0); - - raw_pci_ops = &pci_direct_conf1; - pci_scan_bus_with_sysdata(pci_bus0); - pci_scan_bus_with_sysdata(pci_bus1); - pci_fixup_irqs(pci_common_swizzle, visws_map_irq); - pcibios_resource_survey(); - /* Request bus scan */ - return 1; -} diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 103e702ec5a..905956f1646 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -178,6 +178,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) i = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], + (type == PCI_CAP_ID_MSI) ? nvec : 1, (type == PCI_CAP_ID_MSIX) ? "pcifront-msi-x" : "pcifront-msi", @@ -245,6 +246,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) "xen: msi already bound to pirq=%d\n", pirq); } irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, + (type == PCI_CAP_ID_MSI) ? nvec : 1, (type == PCI_CAP_ID_MSIX) ? "msi-x" : "msi", DOMID_SELF); @@ -269,9 +271,6 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) int ret = 0; struct msi_desc *msidesc; - if (type == PCI_CAP_ID_MSI && nvec > 1) - return 1; - list_for_each_entry(msidesc, &dev->msi_list, list) { struct physdev_map_pirq map_irq; domid_t domid; @@ -291,7 +290,10 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) (pci_domain_nr(dev->bus) << 16); map_irq.devfn = dev->devfn; - if (type == PCI_CAP_ID_MSIX) { + if (type == PCI_CAP_ID_MSI && nvec > 1) { + map_irq.type = MAP_PIRQ_TYPE_MULTI_MSI; + map_irq.entry_nr = nvec; + } else if (type == PCI_CAP_ID_MSIX) { int pos; u32 table_offset, bir; @@ -308,6 +310,16 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (pci_seg_supported) ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); + if (type == PCI_CAP_ID_MSI && nvec > 1 && ret) { + /* + * If MAP_PIRQ_TYPE_MULTI_MSI is not available + * there's nothing else we can do in this case. + * Just set ret > 0 so driver can retry with + * single MSI. + */ + ret = 1; + goto out; + } if (ret == -EINVAL && !pci_domain_nr(dev->bus)) { map_irq.type = MAP_PIRQ_TYPE_MSI; map_irq.index = -1; @@ -324,11 +336,10 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) goto out; } - ret = xen_bind_pirq_msi_to_irq(dev, msidesc, - map_irq.pirq, - (type == PCI_CAP_ID_MSIX) ? - "msi-x" : "msi", - domid); + ret = xen_bind_pirq_msi_to_irq(dev, msidesc, map_irq.pirq, + (type == PCI_CAP_ID_MSI) ? nvec : 1, + (type == PCI_CAP_ID_MSIX) ? "msi-x" : "msi", + domid); if (ret < 0) goto out; } diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index 20342d4c82c..85afde1fa3e 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -9,5 +9,4 @@ obj-y += olpc/ obj-y += scx200/ obj-y += sfi/ obj-y += ts5500/ -obj-y += visws/ obj-y += uv/ diff --git a/arch/x86/platform/efi/early_printk.c b/arch/x86/platform/efi/early_printk.c index 81b506d5bef..52414211729 100644 --- a/arch/x86/platform/efi/early_printk.c +++ b/arch/x86/platform/efi/early_printk.c @@ -14,48 +14,92 @@ static const struct font_desc *font; static u32 efi_x, efi_y; +static void *efi_fb; +static bool early_efi_keep; -static __init void early_efi_clear_scanline(unsigned int y) +/* + * efi earlyprintk need use early_ioremap to map the framebuffer. + * But early_ioremap is not usable for earlyprintk=efi,keep, ioremap should + * be used instead. ioremap will be available after paging_init() which is + * earlier than initcall callbacks. Thus adding this early initcall function + * early_efi_map_fb to map the whole efi framebuffer. + */ +static __init int early_efi_map_fb(void) { - unsigned long base, *dst; - u16 len; + unsigned long base, size; + + if (!early_efi_keep) + return 0; base = boot_params.screen_info.lfb_base; - len = boot_params.screen_info.lfb_linelength; + size = boot_params.screen_info.lfb_size; + efi_fb = ioremap(base, size); + + return efi_fb ? 0 : -ENOMEM; +} +early_initcall(early_efi_map_fb); + +/* + * early_efi_map maps efi framebuffer region [start, start + len -1] + * In case earlyprintk=efi,keep we have the whole framebuffer mapped already + * so just return the offset efi_fb + start. + */ +static __init_refok void *early_efi_map(unsigned long start, unsigned long len) +{ + unsigned long base; + + base = boot_params.screen_info.lfb_base; + + if (efi_fb) + return (efi_fb + start); + else + return early_ioremap(base + start, len); +} - dst = early_ioremap(base + y*len, len); +static __init_refok void early_efi_unmap(void *addr, unsigned long len) +{ + if (!efi_fb) + early_iounmap(addr, len); +} + +static void early_efi_clear_scanline(unsigned int y) +{ + unsigned long *dst; + u16 len; + + len = boot_params.screen_info.lfb_linelength; + dst = early_efi_map(y*len, len); if (!dst) return; memset(dst, 0, len); - early_iounmap(dst, len); + early_efi_unmap(dst, len); } -static __init void early_efi_scroll_up(void) +static void early_efi_scroll_up(void) { - unsigned long base, *dst, *src; + unsigned long *dst, *src; u16 len; u32 i, height; - base = boot_params.screen_info.lfb_base; len = boot_params.screen_info.lfb_linelength; height = boot_params.screen_info.lfb_height; for (i = 0; i < height - font->height; i++) { - dst = early_ioremap(base + i*len, len); + dst = early_efi_map(i*len, len); if (!dst) return; - src = early_ioremap(base + (i + font->height) * len, len); + src = early_efi_map((i + font->height) * len, len); if (!src) { - early_iounmap(dst, len); + early_efi_unmap(dst, len); return; } memmove(dst, src, len); - early_iounmap(src, len); - early_iounmap(dst, len); + early_efi_unmap(src, len); + early_efi_unmap(dst, len); } } @@ -79,16 +123,14 @@ static void early_efi_write_char(u32 *dst, unsigned char c, unsigned int h) } } -static __init void +static void early_efi_write(struct console *con, const char *str, unsigned int num) { struct screen_info *si; - unsigned long base; unsigned int len; const char *s; void *dst; - base = boot_params.screen_info.lfb_base; si = &boot_params.screen_info; len = si->lfb_linelength; @@ -109,7 +151,7 @@ early_efi_write(struct console *con, const char *str, unsigned int num) for (h = 0; h < font->height; h++) { unsigned int n, x; - dst = early_ioremap(base + (efi_y + h) * len, len); + dst = early_efi_map((efi_y + h) * len, len); if (!dst) return; @@ -123,7 +165,7 @@ early_efi_write(struct console *con, const char *str, unsigned int num) s++; } - early_iounmap(dst, len); + early_efi_unmap(dst, len); } num -= count; @@ -179,6 +221,9 @@ static __init int early_efi_setup(struct console *con, char *options) for (i = 0; i < (yres - efi_y) / font->height; i++) early_efi_scroll_up(); + /* early_console_register will unset CON_BOOT in case ,keep */ + if (!(con->flags & CON_BOOT)) + early_efi_keep = true; return 0; } diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c index ff0174dda81..a9acde72d4e 100644 --- a/arch/x86/platform/olpc/olpc-xo1-pm.c +++ b/arch/x86/platform/olpc/olpc-xo1-pm.c @@ -75,7 +75,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state) return 0; } -asmlinkage int xo1_do_sleep(u8 sleep_state) +asmlinkage __visible int xo1_do_sleep(u8 sleep_state) { void *pgd_addr = __va(read_cr3()); diff --git a/arch/x86/platform/visws/Makefile b/arch/x86/platform/visws/Makefile deleted file mode 100644 index 91bc17ab2fd..00000000000 --- a/arch/x86/platform/visws/Makefile +++ /dev/null @@ -1 +0,0 @@ -obj-$(CONFIG_X86_VISWS) += visws_quirks.o diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c deleted file mode 100644 index 94d8a39332e..00000000000 --- a/arch/x86/platform/visws/visws_quirks.c +++ /dev/null @@ -1,608 +0,0 @@ -/* - * SGI Visual Workstation support and quirks, unmaintained. - * - * Split out from setup.c by davej@suse.de - * - * Copyright (C) 1999 Bent Hagemark, Ingo Molnar - * - * SGI Visual Workstation interrupt controller - * - * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC - * which serves as the main interrupt controller in the system. Non-legacy - * hardware in the system uses this controller directly. Legacy devices - * are connected to the PIIX4 which in turn has its 8259(s) connected to - * a of the Cobalt APIC entry. - * - * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com - * - * 25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru> - */ -#include <linux/interrupt.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/visws/cobalt.h> -#include <asm/visws/piix4.h> -#include <asm/io_apic.h> -#include <asm/fixmap.h> -#include <asm/reboot.h> -#include <asm/setup.h> -#include <asm/apic.h> -#include <asm/e820.h> -#include <asm/time.h> -#include <asm/io.h> - -#include <linux/kernel_stat.h> - -#include <asm/i8259.h> -#include <asm/irq_vectors.h> -#include <asm/visws/lithium.h> - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/pci.h> -#include <linux/pci_ids.h> - -extern int no_broadcast; - -char visws_board_type = -1; -char visws_board_rev = -1; - -static void __init visws_time_init(void) -{ - printk(KERN_INFO "Starting Cobalt Timer system clock\n"); - - /* Set the countdown value */ - co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ); - - /* Start the timer */ - co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN); - - /* Enable (unmask) the timer interrupt */ - co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); - - setup_default_timer_irq(); -} - -/* Replaces the default init_ISA_irqs in the generic setup */ -static void __init visws_pre_intr_init(void); - -/* Quirk for machine specific memory setup. */ - -#define MB (1024 * 1024) - -unsigned long sgivwfb_mem_phys; -unsigned long sgivwfb_mem_size; -EXPORT_SYMBOL(sgivwfb_mem_phys); -EXPORT_SYMBOL(sgivwfb_mem_size); - -long long mem_size __initdata = 0; - -static char * __init visws_memory_setup(void) -{ - long long gfx_mem_size = 8 * MB; - - mem_size = boot_params.alt_mem_k; - - if (!mem_size) { - printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n"); - mem_size = 128 * MB; - } - - /* - * this hardcodes the graphics memory to 8 MB - * it really should be sized dynamically (or at least - * set as a boot param) - */ - if (!sgivwfb_mem_size) { - printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n"); - sgivwfb_mem_size = 8 * MB; - } - - /* - * Trim to nearest MB - */ - sgivwfb_mem_size &= ~((1 << 20) - 1); - sgivwfb_mem_phys = mem_size - gfx_mem_size; - - e820_add_region(0, LOWMEMSIZE(), E820_RAM); - e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM); - e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED); - - return "PROM"; -} - -static void visws_machine_emergency_restart(void) -{ - /* - * Visual Workstations restart after this - * register is poked on the PIIX4 - */ - outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT); -} - -static void visws_machine_power_off(void) -{ - unsigned short pm_status; -/* extern unsigned int pci_bus0; */ - - while ((pm_status = inw(PMSTS_PORT)) & 0x100) - outw(pm_status, PMSTS_PORT); - - outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT); - - mdelay(10); - -#define PCI_CONF1_ADDRESS(bus, devfn, reg) \ - (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3)) - -/* outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8); */ - outl(PIIX_SPECIAL_STOP, 0xCFC); -} - -static void __init visws_get_smp_config(unsigned int early) -{ -} - -/* - * The Visual Workstation is Intel MP compliant in the hardware - * sense, but it doesn't have a BIOS(-configuration table). - * No problem for Linux. - */ - -static void __init MP_processor_info(struct mpc_cpu *m) -{ - int ver, logical_apicid; - physid_mask_t apic_cpus; - - if (!(m->cpuflag & CPU_ENABLED)) - return; - - logical_apicid = m->apicid; - printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n", - m->cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "", - m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8, - (m->cpufeature & CPU_MODEL_MASK) >> 4, m->apicver); - - if (m->cpuflag & CPU_BOOTPROCESSOR) - boot_cpu_physical_apicid = m->apicid; - - ver = m->apicver; - if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) { - printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", - m->apicid, MAX_LOCAL_APIC); - return; - } - - apic->apicid_to_cpu_present(m->apicid, &apic_cpus); - physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); - /* - * Validate version - */ - if (ver == 0x0) { - printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! " - "fixing up to 0x10. (tell your hw vendor)\n", - m->apicid); - ver = 0x10; - } - apic_version[m->apicid] = ver; -} - -static void __init visws_find_smp_config(void) -{ - struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); - unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); - - if (ncpus > CO_CPU_MAX) { - printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n", - ncpus, mp); - - ncpus = CO_CPU_MAX; - } - - if (ncpus > setup_max_cpus) - ncpus = setup_max_cpus; - -#ifdef CONFIG_X86_LOCAL_APIC - smp_found_config = 1; -#endif - while (ncpus--) - MP_processor_info(mp++); - - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; -} - -static void visws_trap_init(void); - -void __init visws_early_detect(void) -{ - int raw; - - visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG) - >> PIIX_GPI_BD_SHIFT; - - if (visws_board_type < 0) - return; - - /* - * Override the default platform setup functions - */ - x86_init.resources.memory_setup = visws_memory_setup; - x86_init.mpparse.get_smp_config = visws_get_smp_config; - x86_init.mpparse.find_smp_config = visws_find_smp_config; - x86_init.irqs.pre_vector_init = visws_pre_intr_init; - x86_init.irqs.trap_init = visws_trap_init; - x86_init.timers.timer_init = visws_time_init; - x86_init.pci.init = pci_visws_init; - x86_init.pci.init_irq = x86_init_noop; - - /* - * Install reboot quirks: - */ - pm_power_off = visws_machine_power_off; - machine_ops.emergency_restart = visws_machine_emergency_restart; - - /* - * Do not use broadcast IPIs: - */ - no_broadcast = 0; - -#ifdef CONFIG_X86_IO_APIC - /* - * Turn off IO-APIC detection and initialization: - */ - skip_ioapic_setup = 1; -#endif - - /* - * Get Board rev. - * First, we have to initialize the 307 part to allow us access - * to the GPIO registers. Let's map them at 0x0fc0 which is right - * after the PIIX4 PM section. - */ - outb_p(SIO_DEV_SEL, SIO_INDEX); - outb_p(SIO_GP_DEV, SIO_DATA); /* Talk to GPIO regs. */ - - outb_p(SIO_DEV_MSB, SIO_INDEX); - outb_p(SIO_GP_MSB, SIO_DATA); /* MSB of GPIO base address */ - - outb_p(SIO_DEV_LSB, SIO_INDEX); - outb_p(SIO_GP_LSB, SIO_DATA); /* LSB of GPIO base address */ - - outb_p(SIO_DEV_ENB, SIO_INDEX); - outb_p(1, SIO_DATA); /* Enable GPIO registers. */ - - /* - * Now, we have to map the power management section to write - * a bit which enables access to the GPIO registers. - * What lunatic came up with this shit? - */ - outb_p(SIO_DEV_SEL, SIO_INDEX); - outb_p(SIO_PM_DEV, SIO_DATA); /* Talk to GPIO regs. */ - - outb_p(SIO_DEV_MSB, SIO_INDEX); - outb_p(SIO_PM_MSB, SIO_DATA); /* MSB of PM base address */ - - outb_p(SIO_DEV_LSB, SIO_INDEX); - outb_p(SIO_PM_LSB, SIO_DATA); /* LSB of PM base address */ - - outb_p(SIO_DEV_ENB, SIO_INDEX); - outb_p(1, SIO_DATA); /* Enable PM registers. */ - - /* - * Now, write the PM register which enables the GPIO registers. - */ - outb_p(SIO_PM_FER2, SIO_PM_INDEX); - outb_p(SIO_PM_GP_EN, SIO_PM_DATA); - - /* - * Now, initialize the GPIO registers. - * We want them all to be inputs which is the - * power on default, so let's leave them alone. - * So, let's just read the board rev! - */ - raw = inb_p(SIO_GP_DATA1); - raw &= 0x7f; /* 7 bits of valid board revision ID. */ - - if (visws_board_type == VISWS_320) { - if (raw < 0x6) { - visws_board_rev = 4; - } else if (raw < 0xc) { - visws_board_rev = 5; - } else { - visws_board_rev = 6; - } - } else if (visws_board_type == VISWS_540) { - visws_board_rev = 2; - } else { - visws_board_rev = raw; - } - - printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n", - (visws_board_type == VISWS_320 ? "320" : - (visws_board_type == VISWS_540 ? "540" : - "unknown")), visws_board_rev); -} - -#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4) -#define BCD (LI_INTB | LI_INTC | LI_INTD) -#define ALLDEVS (A01234 | BCD) - -static __init void lithium_init(void) -{ - set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS); - set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS); - - if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) || - (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) { - printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A'); -/* panic("This machine is not SGI Visual Workstation 320/540"); */ - } - - if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) || - (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) { - printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B'); -/* panic("This machine is not SGI Visual Workstation 320/540"); */ - } - - li_pcia_write16(LI_PCI_INTEN, ALLDEVS); - li_pcib_write16(LI_PCI_INTEN, ALLDEVS); -} - -static __init void cobalt_init(void) -{ - /* - * On normal SMP PC this is used only with SMP, but we have to - * use it and set it up here to start the Cobalt clock - */ - set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE); - setup_local_APIC(); - printk(KERN_INFO "Local APIC Version %#x, ID %#x\n", - (unsigned int)apic_read(APIC_LVR), - (unsigned int)apic_read(APIC_ID)); - - set_fixmap(FIX_CO_CPU, CO_CPU_PHYS); - set_fixmap(FIX_CO_APIC, CO_APIC_PHYS); - printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n", - co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID)); - - /* Enable Cobalt APIC being careful to NOT change the ID! */ - co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE); - - printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n", - co_apic_read(CO_APIC_ID)); -} - -static void __init visws_trap_init(void) -{ - lithium_init(); - cobalt_init(); -} - -/* - * IRQ controller / APIC support: - */ - -static DEFINE_SPINLOCK(cobalt_lock); - -/* - * Set the given Cobalt APIC Redirection Table entry to point - * to the given IDT vector/index. - */ -static inline void co_apic_set(int entry, int irq) -{ - co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR)); - co_apic_write(CO_APIC_HI(entry), 0); -} - -/* - * Cobalt (IO)-APIC functions to handle PCI devices. - */ -static inline int co_apic_ide0_hack(void) -{ - extern char visws_board_type; - extern char visws_board_rev; - - if (visws_board_type == VISWS_320 && visws_board_rev == 5) - return 5; - return CO_APIC_IDE0; -} - -static int is_co_apic(unsigned int irq) -{ - if (IS_CO_APIC(irq)) - return CO_APIC(irq); - - switch (irq) { - case 0: return CO_APIC_CPU; - case CO_IRQ_IDE0: return co_apic_ide0_hack(); - case CO_IRQ_IDE1: return CO_APIC_IDE1; - default: return -1; - } -} - - -/* - * This is the SGI Cobalt (IO-)APIC: - */ -static void enable_cobalt_irq(struct irq_data *data) -{ - co_apic_set(is_co_apic(data->irq), data->irq); -} - -static void disable_cobalt_irq(struct irq_data *data) -{ - int entry = is_co_apic(data->irq); - - co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK); - co_apic_read(CO_APIC_LO(entry)); -} - -static void ack_cobalt_irq(struct irq_data *data) -{ - unsigned long flags; - - spin_lock_irqsave(&cobalt_lock, flags); - disable_cobalt_irq(data); - apic_write(APIC_EOI, APIC_EOI_ACK); - spin_unlock_irqrestore(&cobalt_lock, flags); -} - -static struct irq_chip cobalt_irq_type = { - .name = "Cobalt-APIC", - .irq_enable = enable_cobalt_irq, - .irq_disable = disable_cobalt_irq, - .irq_ack = ack_cobalt_irq, -}; - - -/* - * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt - * -- not the manner expected by the code in i8259.c. - * - * there is a 'master' physical interrupt source that gets sent to - * the CPU. But in the chipset there are various 'virtual' interrupts - * waiting to be handled. We represent this to Linux through a 'master' - * interrupt controller type, and through a special virtual interrupt- - * controller. Device drivers only see the virtual interrupt sources. - */ -static unsigned int startup_piix4_master_irq(struct irq_data *data) -{ - legacy_pic->init(0); - enable_cobalt_irq(data); - return 0; -} - -static struct irq_chip piix4_master_irq_type = { - .name = "PIIX4-master", - .irq_startup = startup_piix4_master_irq, - .irq_ack = ack_cobalt_irq, -}; - -static void pii4_mask(struct irq_data *data) { } - -static struct irq_chip piix4_virtual_irq_type = { - .name = "PIIX4-virtual", - .irq_mask = pii4_mask, -}; - -/* - * PIIX4-8259 master/virtual functions to handle interrupt requests - * from legacy devices: floppy, parallel, serial, rtc. - * - * None of these get Cobalt APIC entries, neither do they have IDT - * entries. These interrupts are purely virtual and distributed from - * the 'master' interrupt source: CO_IRQ_8259. - * - * When the 8259 interrupts its handler figures out which of these - * devices is interrupting and dispatches to its handler. - * - * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/ - * enable_irq gets the right irq. This 'master' irq is never directly - * manipulated by any driver. - */ -static irqreturn_t piix4_master_intr(int irq, void *dev_id) -{ - unsigned long flags; - int realirq; - - raw_spin_lock_irqsave(&i8259A_lock, flags); - - /* Find out what's interrupting in the PIIX4 master 8259 */ - outb(0x0c, 0x20); /* OCW3 Poll command */ - realirq = inb(0x20); - - /* - * Bit 7 == 0 means invalid/spurious - */ - if (unlikely(!(realirq & 0x80))) - goto out_unlock; - - realirq &= 7; - - if (unlikely(realirq == 2)) { - outb(0x0c, 0xa0); - realirq = inb(0xa0); - - if (unlikely(!(realirq & 0x80))) - goto out_unlock; - - realirq = (realirq & 7) + 8; - } - - /* mask and ack interrupt */ - cached_irq_mask |= 1 << realirq; - if (unlikely(realirq > 7)) { - inb(0xa1); - outb(cached_slave_mask, 0xa1); - outb(0x60 + (realirq & 7), 0xa0); - outb(0x60 + 2, 0x20); - } else { - inb(0x21); - outb(cached_master_mask, 0x21); - outb(0x60 + realirq, 0x20); - } - - raw_spin_unlock_irqrestore(&i8259A_lock, flags); - - /* - * handle this 'virtual interrupt' as a Cobalt one now. - */ - generic_handle_irq(realirq); - - return IRQ_HANDLED; - -out_unlock: - raw_spin_unlock_irqrestore(&i8259A_lock, flags); - return IRQ_NONE; -} - -static struct irqaction master_action = { - .handler = piix4_master_intr, - .name = "PIIX4-8259", - .flags = IRQF_NO_THREAD, -}; - -static struct irqaction cascade_action = { - .handler = no_action, - .name = "cascade", - .flags = IRQF_NO_THREAD, -}; - -static inline void set_piix4_virtual_irq_type(void) -{ - piix4_virtual_irq_type.irq_enable = i8259A_chip.irq_unmask; - piix4_virtual_irq_type.irq_disable = i8259A_chip.irq_mask; - piix4_virtual_irq_type.irq_unmask = i8259A_chip.irq_unmask; -} - -static void __init visws_pre_intr_init(void) -{ - int i; - - set_piix4_virtual_irq_type(); - - for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { - struct irq_chip *chip = NULL; - - if (i == 0) - chip = &cobalt_irq_type; - else if (i == CO_IRQ_IDE0) - chip = &cobalt_irq_type; - else if (i == CO_IRQ_IDE1) - chip = &cobalt_irq_type; - else if (i == CO_IRQ_8259) - chip = &piix4_master_irq_type; - else if (i < CO_IRQ_APIC0) - chip = &piix4_virtual_irq_type; - else if (IS_CO_APIC(i)) - chip = &cobalt_irq_type; - - if (chip) - irq_set_chip(i, chip); - } - - setup_irq(CO_IRQ_8259, &master_action); - setup_irq(2, &cascade_action); -} diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 304fca20d96..35e2bb6c0f3 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -23,7 +23,7 @@ extern __visible const void __nosave_begin, __nosave_end; /* Defined in hibernate_asm_64.S */ -extern asmlinkage int restore_image(void); +extern asmlinkage __visible int restore_image(void); /* * Address to jump to in the last phase of restore in order to get to the image diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 3497f14e4de..7c0d7be176a 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -52,8 +52,9 @@ $(obj)/realmode.elf: $(obj)/realmode.lds $(REALMODE_OBJS) FORCE OBJCOPYFLAGS_realmode.bin := -O binary targets += realmode.bin -$(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs +$(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs FORCE $(call if_changed,objcopy) + @: quiet_cmd_relocs = RELOCS $@ cmd_relocs = arch/x86/tools/relocs --realmode $< > $@ diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile index f325af26107..3323c274524 100644 --- a/arch/x86/syscalls/Makefile +++ b/arch/x86/syscalls/Makefile @@ -54,5 +54,7 @@ syshdr-$(CONFIG_X86_64) += syscalls_64.h targets += $(uapisyshdr-y) $(syshdr-y) +PHONY += all all: $(addprefix $(uapi)/,$(uapisyshdr-y)) all: $(addprefix $(out)/,$(syshdr-y)) + @: diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index 96bc506ac6d..d6b86792161 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -359,3 +359,4 @@ 350 i386 finit_module sys_finit_module 351 i386 sched_setattr sys_sched_setattr 352 i386 sched_getattr sys_sched_getattr +353 i386 renameat2 sys_renameat2 diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index a12bddc7cce..04376ac3d9e 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -322,6 +322,7 @@ 313 common finit_module sys_finit_module 314 common sched_setattr sys_sched_setattr 315 common sched_getattr sys_sched_getattr +316 common renameat2 sys_renameat2 # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index e8120346903..604a37efd4d 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -40,4 +40,6 @@ $(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/ina HOST_EXTRACFLAGS += -I$(srctree)/tools/include hostprogs-y += relocs relocs-objs := relocs_32.o relocs_64.o relocs_common.o +PHONY += relocs relocs: $(obj)/relocs + @: diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index cfbdbdb4e17..bbb1d2259ec 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -69,8 +69,8 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { "__per_cpu_load|" "init_per_cpu__.*|" "__end_rodata_hpage_align|" - "__vvar_page|" #endif + "__vvar_page|" "_end)$" }; diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 9206ac7961a..c580d1210ff 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -23,7 +23,8 @@ vobjs-$(VDSOX32-y) += $(vobjx32s-compat) vobj64s := $(filter-out $(vobjx32s-compat),$(vobjs-y)) # files to link into kernel -obj-$(VDSO64-y) += vma.o vdso.o +obj-y += vma.o +obj-$(VDSO64-y) += vdso.o obj-$(VDSOX32-y) += vdsox32.o obj-$(VDSO32-y) += vdso32.o vdso32-setup.o @@ -138,7 +139,7 @@ override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ targets += vdso32/vdso32.lds targets += $(vdso32-images) $(vdso32-images:=.dbg) -targets += vdso32/note.o $(vdso32.so-y:%=vdso32/%.o) +targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o) extra-y += $(vdso32-images) @@ -148,8 +149,19 @@ KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) $(vdso32-images:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) $(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32 +KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic +KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) +KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) +KBUILD_CFLAGS_32 += -fno-omit-frame-pointer +$(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) + $(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ $(obj)/vdso32/vdso32.lds \ + $(obj)/vdso32/vclock_gettime.o \ $(obj)/vdso32/note.o \ $(obj)/vdso32/%.o $(call if_changed,vdso) diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index eb5d7a56f8d..16d686171e9 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -4,6 +4,9 @@ * * Fast user context implementation of clock_gettime, gettimeofday, and time. * + * 32 Bit compat layer by Stefani Seibold <stefani@seibold.net> + * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany + * * The code should have no internal unresolved relocations. * Check with readelf after changing. */ @@ -11,56 +14,55 @@ /* Disable profiling for userspace code: */ #define DISABLE_BRANCH_PROFILING -#include <linux/kernel.h> -#include <linux/posix-timers.h> -#include <linux/time.h> -#include <linux/string.h> -#include <asm/vsyscall.h> -#include <asm/fixmap.h> +#include <uapi/linux/time.h> #include <asm/vgtod.h> -#include <asm/timex.h> #include <asm/hpet.h> +#include <asm/vvar.h> #include <asm/unistd.h> -#include <asm/io.h> -#include <asm/pvclock.h> +#include <asm/msr.h> +#include <linux/math64.h> +#include <linux/time.h> #define gtod (&VVAR(vsyscall_gtod_data)) -notrace static cycle_t vread_tsc(void) +extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts); +extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); +extern time_t __vdso_time(time_t *t); + +#ifdef CONFIG_HPET_TIMER +static inline u32 read_hpet_counter(const volatile void *addr) { - cycle_t ret; - u64 last; + return *(const volatile u32 *) (addr + HPET_COUNTER); +} +#endif - /* - * Empirically, a fence (of type that depends on the CPU) - * before rdtsc is enough to ensure that rdtsc is ordered - * with respect to loads. The various CPU manuals are unclear - * as to whether rdtsc can be reordered with later loads, - * but no one has ever seen it happen. - */ - rdtsc_barrier(); - ret = (cycle_t)vget_cycles(); +#ifndef BUILD_VDSO32 - last = VVAR(vsyscall_gtod_data).clock.cycle_last; +#include <linux/kernel.h> +#include <asm/vsyscall.h> +#include <asm/fixmap.h> +#include <asm/pvclock.h> - if (likely(ret >= last)) - return ret; +static notrace cycle_t vread_hpet(void) +{ + return read_hpet_counter((const void *)fix_to_virt(VSYSCALL_HPET)); +} - /* - * GCC likes to generate cmov here, but this branch is extremely - * predictable (it's just a funciton of time and the likely is - * very likely) and there's a data dependence, so force GCC - * to generate a branch instead. I don't barrier() because - * we don't actually need a barrier, and if this function - * ever gets inlined it will generate worse code. - */ - asm volatile (""); - return last; +notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) +{ + long ret; + asm("syscall" : "=a" (ret) : + "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory"); + return ret; } -static notrace cycle_t vread_hpet(void) +notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) { - return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + HPET_COUNTER); + long ret; + + asm("syscall" : "=a" (ret) : + "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); + return ret; } #ifdef CONFIG_PARAVIRT_CLOCK @@ -124,7 +126,7 @@ static notrace cycle_t vread_pvclock(int *mode) *mode = VCLOCK_NONE; /* refer to tsc.c read_tsc() comment for rationale */ - last = VVAR(vsyscall_gtod_data).clock.cycle_last; + last = gtod->cycle_last; if (likely(ret >= last)) return ret; @@ -133,11 +135,30 @@ static notrace cycle_t vread_pvclock(int *mode) } #endif +#else + +extern u8 hpet_page + __attribute__((visibility("hidden"))); + +#ifdef CONFIG_HPET_TIMER +static notrace cycle_t vread_hpet(void) +{ + return read_hpet_counter((const void *)(&hpet_page)); +} +#endif + notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; - asm("syscall" : "=a" (ret) : - "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory"); + + asm( + "mov %%ebx, %%edx \n" + "mov %2, %%ebx \n" + "call VDSO32_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret) + : "0" (__NR_clock_gettime), "g" (clock), "c" (ts) + : "memory", "edx"); return ret; } @@ -145,28 +166,79 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) { long ret; - asm("syscall" : "=a" (ret) : - "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); + asm( + "mov %%ebx, %%edx \n" + "mov %2, %%ebx \n" + "call VDSO32_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret) + : "0" (__NR_gettimeofday), "g" (tv), "c" (tz) + : "memory", "edx"); return ret; } +#ifdef CONFIG_PARAVIRT_CLOCK + +static notrace cycle_t vread_pvclock(int *mode) +{ + *mode = VCLOCK_NONE; + return 0; +} +#endif + +#endif + +notrace static cycle_t vread_tsc(void) +{ + cycle_t ret; + u64 last; + + /* + * Empirically, a fence (of type that depends on the CPU) + * before rdtsc is enough to ensure that rdtsc is ordered + * with respect to loads. The various CPU manuals are unclear + * as to whether rdtsc can be reordered with later loads, + * but no one has ever seen it happen. + */ + rdtsc_barrier(); + ret = (cycle_t)__native_read_tsc(); + + last = gtod->cycle_last; + + if (likely(ret >= last)) + return ret; + + /* + * GCC likes to generate cmov here, but this branch is extremely + * predictable (it's just a funciton of time and the likely is + * very likely) and there's a data dependence, so force GCC + * to generate a branch instead. I don't barrier() because + * we don't actually need a barrier, and if this function + * ever gets inlined it will generate worse code. + */ + asm volatile (""); + return last; +} notrace static inline u64 vgetsns(int *mode) { - long v; + u64 v; cycles_t cycles; - if (gtod->clock.vclock_mode == VCLOCK_TSC) + + if (gtod->vclock_mode == VCLOCK_TSC) cycles = vread_tsc(); - else if (gtod->clock.vclock_mode == VCLOCK_HPET) +#ifdef CONFIG_HPET_TIMER + else if (gtod->vclock_mode == VCLOCK_HPET) cycles = vread_hpet(); +#endif #ifdef CONFIG_PARAVIRT_CLOCK - else if (gtod->clock.vclock_mode == VCLOCK_PVCLOCK) + else if (gtod->vclock_mode == VCLOCK_PVCLOCK) cycles = vread_pvclock(mode); #endif else return 0; - v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask; - return v * gtod->clock.mult; + v = (cycles - gtod->cycle_last) & gtod->mask; + return v * gtod->mult; } /* Code size doesn't matter (vdso is 4k anyway) and this is faster. */ @@ -176,106 +248,102 @@ notrace static int __always_inline do_realtime(struct timespec *ts) u64 ns; int mode; - ts->tv_nsec = 0; do { - seq = raw_read_seqcount_begin(>od->seq); - mode = gtod->clock.vclock_mode; + seq = gtod_read_begin(gtod); + mode = gtod->vclock_mode; ts->tv_sec = gtod->wall_time_sec; ns = gtod->wall_time_snsec; ns += vgetsns(&mode); - ns >>= gtod->clock.shift; - } while (unlikely(read_seqcount_retry(>od->seq, seq))); + ns >>= gtod->shift; + } while (unlikely(gtod_read_retry(gtod, seq))); + + ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; - timespec_add_ns(ts, ns); return mode; } -notrace static int do_monotonic(struct timespec *ts) +notrace static int __always_inline do_monotonic(struct timespec *ts) { unsigned long seq; u64 ns; int mode; - ts->tv_nsec = 0; do { - seq = raw_read_seqcount_begin(>od->seq); - mode = gtod->clock.vclock_mode; + seq = gtod_read_begin(gtod); + mode = gtod->vclock_mode; ts->tv_sec = gtod->monotonic_time_sec; ns = gtod->monotonic_time_snsec; ns += vgetsns(&mode); - ns >>= gtod->clock.shift; - } while (unlikely(read_seqcount_retry(>od->seq, seq))); - timespec_add_ns(ts, ns); + ns >>= gtod->shift; + } while (unlikely(gtod_read_retry(gtod, seq))); + + ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; return mode; } -notrace static int do_realtime_coarse(struct timespec *ts) +notrace static void do_realtime_coarse(struct timespec *ts) { unsigned long seq; do { - seq = raw_read_seqcount_begin(>od->seq); - ts->tv_sec = gtod->wall_time_coarse.tv_sec; - ts->tv_nsec = gtod->wall_time_coarse.tv_nsec; - } while (unlikely(read_seqcount_retry(>od->seq, seq))); - return 0; + seq = gtod_read_begin(gtod); + ts->tv_sec = gtod->wall_time_coarse_sec; + ts->tv_nsec = gtod->wall_time_coarse_nsec; + } while (unlikely(gtod_read_retry(gtod, seq))); } -notrace static int do_monotonic_coarse(struct timespec *ts) +notrace static void do_monotonic_coarse(struct timespec *ts) { unsigned long seq; do { - seq = raw_read_seqcount_begin(>od->seq); - ts->tv_sec = gtod->monotonic_time_coarse.tv_sec; - ts->tv_nsec = gtod->monotonic_time_coarse.tv_nsec; - } while (unlikely(read_seqcount_retry(>od->seq, seq))); - - return 0; + seq = gtod_read_begin(gtod); + ts->tv_sec = gtod->monotonic_time_coarse_sec; + ts->tv_nsec = gtod->monotonic_time_coarse_nsec; + } while (unlikely(gtod_read_retry(gtod, seq))); } notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { - int ret = VCLOCK_NONE; - switch (clock) { case CLOCK_REALTIME: - ret = do_realtime(ts); + if (do_realtime(ts) == VCLOCK_NONE) + goto fallback; break; case CLOCK_MONOTONIC: - ret = do_monotonic(ts); + if (do_monotonic(ts) == VCLOCK_NONE) + goto fallback; break; case CLOCK_REALTIME_COARSE: - return do_realtime_coarse(ts); + do_realtime_coarse(ts); + break; case CLOCK_MONOTONIC_COARSE: - return do_monotonic_coarse(ts); + do_monotonic_coarse(ts); + break; + default: + goto fallback; } - if (ret == VCLOCK_NONE) - return vdso_fallback_gettime(clock, ts); return 0; +fallback: + return vdso_fallback_gettime(clock, ts); } int clock_gettime(clockid_t, struct timespec *) __attribute__((weak, alias("__vdso_clock_gettime"))); notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) { - long ret = VCLOCK_NONE; - if (likely(tv != NULL)) { - BUILD_BUG_ON(offsetof(struct timeval, tv_usec) != - offsetof(struct timespec, tv_nsec) || - sizeof(*tv) != sizeof(struct timespec)); - ret = do_realtime((struct timespec *)tv); + if (unlikely(do_realtime((struct timespec *)tv) == VCLOCK_NONE)) + return vdso_fallback_gtod(tv, tz); tv->tv_usec /= 1000; } if (unlikely(tz != NULL)) { - /* Avoid memcpy. Some old compilers fail to inline it */ - tz->tz_minuteswest = gtod->sys_tz.tz_minuteswest; - tz->tz_dsttime = gtod->sys_tz.tz_dsttime; + tz->tz_minuteswest = gtod->tz_minuteswest; + tz->tz_dsttime = gtod->tz_dsttime; } - if (ret == VCLOCK_NONE) - return vdso_fallback_gtod(tv, tz); return 0; } int gettimeofday(struct timeval *, struct timezone *) @@ -287,8 +355,8 @@ int gettimeofday(struct timeval *, struct timezone *) */ notrace time_t __vdso_time(time_t *t) { - /* This is atomic on x86_64 so we don't need any locks. */ - time_t result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec); + /* This is atomic on x86 so we don't need any locks. */ + time_t result = ACCESS_ONCE(gtod->wall_time_sec); if (t) *t = result; diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S index 634a2cf6204..9df017ab228 100644 --- a/arch/x86/vdso/vdso-layout.lds.S +++ b/arch/x86/vdso/vdso-layout.lds.S @@ -6,7 +6,21 @@ SECTIONS { - . = VDSO_PRELINK + SIZEOF_HEADERS; +#ifdef BUILD_VDSO32 +#include <asm/vdso32.h> + + hpet_page = . - VDSO_OFFSET(VDSO_HPET_PAGE); + + vvar = . - VDSO_OFFSET(VDSO_VVAR_PAGE); + + /* Place all vvars at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) vvar_ ## name = vvar + offset; +#define __VVAR_KERNEL_LDS +#include <asm/vvar.h> +#undef __VVAR_KERNEL_LDS +#undef EMIT_VVAR +#endif + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } @@ -43,7 +57,17 @@ SECTIONS */ . = ALIGN(0x100); - .text : { *(.text*) } :text =0x90909090 + .text : { *(.text*) } :text =0x90909090, + + /* + * The comma above works around a bug in gold: + * https://sourceware.org/bugzilla/show_bug.cgi?id=16804 + */ + + /DISCARD/ : { + *(.discard) + *(.discard.*) + } } /* diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S index 1e13eb8c965..be3f23b09af 100644 --- a/arch/x86/vdso/vdso.S +++ b/arch/x86/vdso/vdso.S @@ -1,21 +1,3 @@ -#include <asm/page_types.h> -#include <linux/linkage.h> +#include <asm/vdso.h> -__PAGE_ALIGNED_DATA - - .globl vdso_start, vdso_end - .align PAGE_SIZE -vdso_start: - .incbin "arch/x86/vdso/vdso.so" -vdso_end: - .align PAGE_SIZE /* extra data here leaks to userspace. */ - -.previous - - .globl vdso_pages - .bss - .align 8 - .type vdso_pages, @object -vdso_pages: - .zero (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE * 8 - .size vdso_pages, .-vdso_pages +DEFINE_VDSO_IMAGE(vdso, "arch/x86/vdso/vdso.so") diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index d6bfb876cfb..e1f220e3ca6 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -16,6 +16,7 @@ #include <linux/mm.h> #include <linux/err.h> #include <linux/module.h> +#include <linux/slab.h> #include <asm/cpufeature.h> #include <asm/msr.h> @@ -25,32 +26,23 @@ #include <asm/tlbflush.h> #include <asm/vdso.h> #include <asm/proto.h> - -enum { - VDSO_DISABLED = 0, - VDSO_ENABLED = 1, - VDSO_COMPAT = 2, -}; +#include <asm/fixmap.h> +#include <asm/hpet.h> +#include <asm/vvar.h> #ifdef CONFIG_COMPAT_VDSO -#define VDSO_DEFAULT VDSO_COMPAT +#define VDSO_DEFAULT 0 #else -#define VDSO_DEFAULT VDSO_ENABLED +#define VDSO_DEFAULT 1 #endif #ifdef CONFIG_X86_64 #define vdso_enabled sysctl_vsyscall32 #define arch_setup_additional_pages syscall32_setup_pages +extern int sysctl_ldt16; #endif /* - * This is the difference between the prelinked addresses in the vDSO images - * and the VDSO_HIGH_BASE address where CONFIG_COMPAT_VDSO places the vDSO - * in the user address space. - */ -#define VDSO_ADDR_ADJUST (VDSO_HIGH_BASE - (unsigned long)VDSO32_PRELINK) - -/* * Should the kernel map a VDSO page into processes and pass its * address down to glibc upon exec()? */ @@ -60,6 +52,9 @@ static int __init vdso_setup(char *s) { vdso_enabled = simple_strtoul(s, NULL, 0); + if (vdso_enabled > 1) + pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n"); + return 1; } @@ -76,124 +71,8 @@ __setup_param("vdso=", vdso32_setup, vdso_setup, 0); EXPORT_SYMBOL_GPL(vdso_enabled); #endif -static __init void reloc_symtab(Elf32_Ehdr *ehdr, - unsigned offset, unsigned size) -{ - Elf32_Sym *sym = (void *)ehdr + offset; - unsigned nsym = size / sizeof(*sym); - unsigned i; - - for(i = 0; i < nsym; i++, sym++) { - if (sym->st_shndx == SHN_UNDEF || - sym->st_shndx == SHN_ABS) - continue; /* skip */ - - if (sym->st_shndx > SHN_LORESERVE) { - printk(KERN_INFO "VDSO: unexpected st_shndx %x\n", - sym->st_shndx); - continue; - } - - switch(ELF_ST_TYPE(sym->st_info)) { - case STT_OBJECT: - case STT_FUNC: - case STT_SECTION: - case STT_FILE: - sym->st_value += VDSO_ADDR_ADJUST; - } - } -} - -static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset) -{ - Elf32_Dyn *dyn = (void *)ehdr + offset; - - for(; dyn->d_tag != DT_NULL; dyn++) - switch(dyn->d_tag) { - case DT_PLTGOT: - case DT_HASH: - case DT_STRTAB: - case DT_SYMTAB: - case DT_RELA: - case DT_INIT: - case DT_FINI: - case DT_REL: - case DT_DEBUG: - case DT_JMPREL: - case DT_VERSYM: - case DT_VERDEF: - case DT_VERNEED: - case DT_ADDRRNGLO ... DT_ADDRRNGHI: - /* definitely pointers needing relocation */ - dyn->d_un.d_ptr += VDSO_ADDR_ADJUST; - break; - - case DT_ENCODING ... OLD_DT_LOOS-1: - case DT_LOOS ... DT_HIOS-1: - /* Tags above DT_ENCODING are pointers if - they're even */ - if (dyn->d_tag >= DT_ENCODING && - (dyn->d_tag & 1) == 0) - dyn->d_un.d_ptr += VDSO_ADDR_ADJUST; - break; - - case DT_VERDEFNUM: - case DT_VERNEEDNUM: - case DT_FLAGS_1: - case DT_RELACOUNT: - case DT_RELCOUNT: - case DT_VALRNGLO ... DT_VALRNGHI: - /* definitely not pointers */ - break; - - case OLD_DT_LOOS ... DT_LOOS-1: - case DT_HIOS ... DT_VALRNGLO-1: - default: - if (dyn->d_tag > DT_ENCODING) - printk(KERN_INFO "VDSO: unexpected DT_tag %x\n", - dyn->d_tag); - break; - } -} - -static __init void relocate_vdso(Elf32_Ehdr *ehdr) -{ - Elf32_Phdr *phdr; - Elf32_Shdr *shdr; - int i; - - BUG_ON(memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0 || - !elf_check_arch_ia32(ehdr) || - ehdr->e_type != ET_DYN); - - ehdr->e_entry += VDSO_ADDR_ADJUST; - - /* rebase phdrs */ - phdr = (void *)ehdr + ehdr->e_phoff; - for (i = 0; i < ehdr->e_phnum; i++) { - phdr[i].p_vaddr += VDSO_ADDR_ADJUST; - - /* relocate dynamic stuff */ - if (phdr[i].p_type == PT_DYNAMIC) - reloc_dyn(ehdr, phdr[i].p_offset); - } - - /* rebase sections */ - shdr = (void *)ehdr + ehdr->e_shoff; - for(i = 0; i < ehdr->e_shnum; i++) { - if (!(shdr[i].sh_flags & SHF_ALLOC)) - continue; - - shdr[i].sh_addr += VDSO_ADDR_ADJUST; - - if (shdr[i].sh_type == SHT_SYMTAB || - shdr[i].sh_type == SHT_DYNSYM) - reloc_symtab(ehdr, shdr[i].sh_offset, - shdr[i].sh_size); - } -} - -static struct page *vdso32_pages[1]; +static struct page **vdso32_pages; +static unsigned vdso32_size; #ifdef CONFIG_X86_64 @@ -212,12 +91,6 @@ void syscall32_cpu_init(void) wrmsrl(MSR_CSTAR, ia32_cstar_target); } -#define compat_uses_vma 1 - -static inline void map_compat_vdso(int map) -{ -} - #else /* CONFIG_X86_32 */ #define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) @@ -241,64 +114,36 @@ void enable_sep_cpu(void) put_cpu(); } -static struct vm_area_struct gate_vma; - -static int __init gate_vma_init(void) -{ - gate_vma.vm_mm = NULL; - gate_vma.vm_start = FIXADDR_USER_START; - gate_vma.vm_end = FIXADDR_USER_END; - gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; - gate_vma.vm_page_prot = __P101; - - return 0; -} - -#define compat_uses_vma 0 - -static void map_compat_vdso(int map) -{ - static int vdso_mapped; - - if (map == vdso_mapped) - return; - - vdso_mapped = map; - - __set_fixmap(FIX_VDSO, page_to_pfn(vdso32_pages[0]) << PAGE_SHIFT, - map ? PAGE_READONLY_EXEC : PAGE_NONE); - - /* flush stray tlbs */ - flush_tlb_all(); -} - #endif /* CONFIG_X86_64 */ int __init sysenter_setup(void) { - void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); - const void *vsyscall; - size_t vsyscall_len; - - vdso32_pages[0] = virt_to_page(syscall_page); - -#ifdef CONFIG_X86_32 - gate_vma_init(); -#endif + char *vdso32_start, *vdso32_end; + int npages, i; +#ifdef CONFIG_COMPAT if (vdso32_syscall()) { - vsyscall = &vdso32_syscall_start; - vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start; - } else if (vdso32_sysenter()){ - vsyscall = &vdso32_sysenter_start; - vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start; + vdso32_start = vdso32_syscall_start; + vdso32_end = vdso32_syscall_end; + vdso32_pages = vdso32_syscall_pages; + } else +#endif + if (vdso32_sysenter()) { + vdso32_start = vdso32_sysenter_start; + vdso32_end = vdso32_sysenter_end; + vdso32_pages = vdso32_sysenter_pages; } else { - vsyscall = &vdso32_int80_start; - vsyscall_len = &vdso32_int80_end - &vdso32_int80_start; + vdso32_start = vdso32_int80_start; + vdso32_end = vdso32_int80_end; + vdso32_pages = vdso32_int80_pages; } - memcpy(syscall_page, vsyscall, vsyscall_len); - relocate_vdso(syscall_page); + npages = ((vdso32_end - vdso32_start) + PAGE_SIZE - 1) / PAGE_SIZE; + vdso32_size = npages << PAGE_SHIFT; + for (i = 0; i < npages; i++) + vdso32_pages[i] = virt_to_page(vdso32_start + i*PAGE_SIZE); + + patch_vdso32(vdso32_start, vdso32_size); return 0; } @@ -309,48 +154,73 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) struct mm_struct *mm = current->mm; unsigned long addr; int ret = 0; - bool compat; + struct vm_area_struct *vma; #ifdef CONFIG_X86_X32_ABI if (test_thread_flag(TIF_X32)) return x32_setup_additional_pages(bprm, uses_interp); #endif - if (vdso_enabled == VDSO_DISABLED) + if (vdso_enabled != 1) /* Other values all mean "disabled" */ return 0; down_write(&mm->mmap_sem); - /* Test compat mode once here, in case someone - changes it via sysctl */ - compat = (vdso_enabled == VDSO_COMPAT); + addr = get_unmapped_area(NULL, 0, vdso32_size + VDSO_OFFSET(VDSO_PREV_PAGES), 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; + } - map_compat_vdso(compat); + addr += VDSO_OFFSET(VDSO_PREV_PAGES); - if (compat) - addr = VDSO_HIGH_BASE; - else { - addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); - if (IS_ERR_VALUE(addr)) { - ret = addr; - goto up_fail; - } + current->mm->context.vdso = (void *)addr; + + /* + * MAYWRITE to allow gdb to COW and set breakpoints + */ + ret = install_special_mapping(mm, + addr, + vdso32_size, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + vdso32_pages); + + if (ret) + goto up_fail; + + vma = _install_special_mapping(mm, + addr - VDSO_OFFSET(VDSO_PREV_PAGES), + VDSO_OFFSET(VDSO_PREV_PAGES), + VM_READ, + NULL); + + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto up_fail; } - current->mm->context.vdso = (void *)addr; + ret = remap_pfn_range(vma, + addr - VDSO_OFFSET(VDSO_VVAR_PAGE), + __pa_symbol(&__vvar_page) >> PAGE_SHIFT, + PAGE_SIZE, + PAGE_READONLY); + + if (ret) + goto up_fail; - if (compat_uses_vma || !compat) { - /* - * MAYWRITE to allow gdb to COW and set breakpoints - */ - ret = install_special_mapping(mm, addr, PAGE_SIZE, - VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - vdso32_pages); +#ifdef CONFIG_HPET_TIMER + if (hpet_address) { + ret = io_remap_pfn_range(vma, + addr - VDSO_OFFSET(VDSO_HPET_PAGE), + hpet_address >> PAGE_SHIFT, + PAGE_SIZE, + pgprot_noncached(PAGE_READONLY)); if (ret) goto up_fail; } +#endif current_thread_info()->sysenter_return = VDSO32_SYMBOL(addr, SYSENTER_RETURN); @@ -380,6 +250,13 @@ static struct ctl_table abi_table2[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "ldt16", + .data = &sysctl_ldt16, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, {} }; @@ -411,20 +288,12 @@ const char *arch_vma_name(struct vm_area_struct *vma) struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { - /* - * Check to see if the corresponding task was created in compat vdso - * mode. - */ - if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE) - return &gate_vma; return NULL; } int in_gate_area(struct mm_struct *mm, unsigned long addr) { - const struct vm_area_struct *vma = get_gate_vma(mm); - - return vma && addr >= vma->vm_start && addr < vma->vm_end; + return 0; } int in_gate_area_no_mm(unsigned long addr) diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S index 2ce5f82c333..018bcd9f97b 100644 --- a/arch/x86/vdso/vdso32.S +++ b/arch/x86/vdso/vdso32.S @@ -1,22 +1,9 @@ -#include <linux/init.h> +#include <asm/vdso.h> -__INITDATA +DEFINE_VDSO_IMAGE(vdso32_int80, "arch/x86/vdso/vdso32-int80.so") - .globl vdso32_int80_start, vdso32_int80_end -vdso32_int80_start: - .incbin "arch/x86/vdso/vdso32-int80.so" -vdso32_int80_end: - - .globl vdso32_syscall_start, vdso32_syscall_end -vdso32_syscall_start: #ifdef CONFIG_COMPAT - .incbin "arch/x86/vdso/vdso32-syscall.so" +DEFINE_VDSO_IMAGE(vdso32_syscall, "arch/x86/vdso/vdso32-syscall.so") #endif -vdso32_syscall_end: - - .globl vdso32_sysenter_start, vdso32_sysenter_end -vdso32_sysenter_start: - .incbin "arch/x86/vdso/vdso32-sysenter.so" -vdso32_sysenter_end: -__FINIT +DEFINE_VDSO_IMAGE(vdso32_sysenter, "arch/x86/vdso/vdso32-sysenter.so") diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c b/arch/x86/vdso/vdso32/vclock_gettime.c new file mode 100644 index 00000000000..175cc72c0f6 --- /dev/null +++ b/arch/x86/vdso/vdso32/vclock_gettime.c @@ -0,0 +1,30 @@ +#define BUILD_VDSO32 + +#ifndef CONFIG_CC_OPTIMIZE_FOR_SIZE +#undef CONFIG_OPTIMIZE_INLINING +#endif + +#undef CONFIG_X86_PPRO_FENCE + +#ifdef CONFIG_X86_64 + +/* + * in case of a 32 bit VDSO for a 64 bit kernel fake a 32 bit kernel + * configuration + */ +#undef CONFIG_64BIT +#undef CONFIG_X86_64 +#undef CONFIG_ILLEGAL_POINTER_VALUE +#undef CONFIG_SPARSEMEM_VMEMMAP +#undef CONFIG_NR_CPUS + +#define CONFIG_X86_32 1 +#define CONFIG_PAGE_OFFSET 0 +#define CONFIG_ILLEGAL_POINTER_VALUE 0 +#define CONFIG_NR_CPUS 1 + +#define BUILD_VDSO32_64 + +#endif + +#include "../vclock_gettime.c" diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/vdso/vdso32/vdso32.lds.S index 976124bb5f9..aadb8b9994c 100644 --- a/arch/x86/vdso/vdso32/vdso32.lds.S +++ b/arch/x86/vdso/vdso32/vdso32.lds.S @@ -8,7 +8,11 @@ * values visible using the asm-x86/vdso.h macros from the kernel proper. */ +#include <asm/page.h> + +#define BUILD_VDSO32 #define VDSO_PRELINK 0 + #include "../vdso-layout.lds.S" /* The ELF entry point can be used to set the AT_SYSINFO value. */ @@ -19,6 +23,13 @@ ENTRY(__kernel_vsyscall); */ VERSION { + LINUX_2.6 { + global: + __vdso_clock_gettime; + __vdso_gettimeofday; + __vdso_time; + }; + LINUX_2.5 { global: __kernel_vsyscall; @@ -31,7 +42,9 @@ VERSION /* * Symbols we define here called VDSO* get their values into vdso32-syms.h. */ -VDSO32_PRELINK = VDSO_PRELINK; VDSO32_vsyscall = __kernel_vsyscall; VDSO32_sigreturn = __kernel_sigreturn; VDSO32_rt_sigreturn = __kernel_rt_sigreturn; +VDSO32_clock_gettime = clock_gettime; +VDSO32_gettimeofday = gettimeofday; +VDSO32_time = time; diff --git a/arch/x86/vdso/vdsox32.S b/arch/x86/vdso/vdsox32.S index 295f1c7543d..f4aa34e7f37 100644 --- a/arch/x86/vdso/vdsox32.S +++ b/arch/x86/vdso/vdsox32.S @@ -1,21 +1,3 @@ -#include <asm/page_types.h> -#include <linux/linkage.h> +#include <asm/vdso.h> -__PAGE_ALIGNED_DATA - - .globl vdsox32_start, vdsox32_end - .align PAGE_SIZE -vdsox32_start: - .incbin "arch/x86/vdso/vdsox32.so" -vdsox32_end: - .align PAGE_SIZE /* extra data here leaks to userspace. */ - -.previous - - .globl vdsox32_pages - .bss - .align 8 - .type vdsox32_pages, @object -vdsox32_pages: - .zero (vdsox32_end - vdsox32_start + PAGE_SIZE - 1) / PAGE_SIZE * 8 - .size vdsox32_pages, .-vdsox32_pages +DEFINE_VDSO_IMAGE(vdsox32, "arch/x86/vdso/vdsox32.so") diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 431e8754441..1ad10261312 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -16,20 +16,22 @@ #include <asm/vdso.h> #include <asm/page.h> +#if defined(CONFIG_X86_64) unsigned int __read_mostly vdso_enabled = 1; -extern char vdso_start[], vdso_end[]; +DECLARE_VDSO_IMAGE(vdso); extern unsigned short vdso_sync_cpuid; - -extern struct page *vdso_pages[]; static unsigned vdso_size; #ifdef CONFIG_X86_X32_ABI -extern char vdsox32_start[], vdsox32_end[]; -extern struct page *vdsox32_pages[]; +DECLARE_VDSO_IMAGE(vdsox32); static unsigned vdsox32_size; +#endif +#endif -static void __init patch_vdsox32(void *vdso, size_t len) +#if defined(CONFIG_X86_32) || defined(CONFIG_X86_X32_ABI) || \ + defined(CONFIG_COMPAT) +void __init patch_vdso32(void *vdso, size_t len) { Elf32_Ehdr *hdr = vdso; Elf32_Shdr *sechdrs, *alt_sec = 0; @@ -52,7 +54,7 @@ static void __init patch_vdsox32(void *vdso, size_t len) } /* If we get here, it's probably a bug. */ - pr_warning("patch_vdsox32: .altinstructions not found\n"); + pr_warning("patch_vdso32: .altinstructions not found\n"); return; /* nothing to patch */ found: @@ -61,6 +63,7 @@ found: } #endif +#if defined(CONFIG_X86_64) static void __init patch_vdso64(void *vdso, size_t len) { Elf64_Ehdr *hdr = vdso; @@ -104,7 +107,7 @@ static int __init init_vdso(void) vdso_pages[i] = virt_to_page(vdso_start + i*PAGE_SIZE); #ifdef CONFIG_X86_X32_ABI - patch_vdsox32(vdsox32_start, vdsox32_end - vdsox32_start); + patch_vdso32(vdsox32_start, vdsox32_end - vdsox32_start); npages = (vdsox32_end - vdsox32_start + PAGE_SIZE - 1) / PAGE_SIZE; vdsox32_size = npages << PAGE_SHIFT; for (i = 0; i < npages; i++) @@ -204,3 +207,4 @@ static __init int vdso_setup(char *s) return 0; } __setup("vdso=", vdso_setup); +#endif diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 01b90261fa3..e88fda867a3 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -7,7 +7,7 @@ config XEN depends on PARAVIRT select PARAVIRT_CLOCK select XEN_HAVE_PVMMU - depends on X86_64 || (X86_32 && X86_PAE && !X86_VISWS) + depends on X86_64 || (X86_32 && X86_PAE) depends on X86_TSC help This is the Linux Xen port. Enabling this will allow the @@ -19,11 +19,6 @@ config XEN_DOM0 depends on XEN && PCI_XEN && SWIOTLB_XEN depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI -# Dummy symbol since people have come to rely on the PRIVILEGED_GUEST -# name in tools. -config XEN_PRIVILEGED_GUEST - def_bool XEN_DOM0 - config XEN_PVHVM def_bool y depends on XEN && PCI && X86_LOCAL_APIC diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 201d09a7c46..f17b29210ac 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1339,6 +1339,7 @@ xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) static struct notifier_block xen_panic_block = { .notifier_call= xen_panic_event, + .priority = INT_MIN }; int xen_panic_handler_init(void) @@ -1515,7 +1516,7 @@ static void __init xen_pvh_early_guest_init(void) } /* First C function to be called on Xen boot */ -asmlinkage void __init xen_start_kernel(void) +asmlinkage __visible void __init xen_start_kernel(void) { struct physdev_set_iopl set_iopl; int rc; diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 08f763de26f..a1207cb6472 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -23,7 +23,7 @@ void xen_force_evtchn_callback(void) (void)HYPERVISOR_xen_version(0, NULL); } -asmlinkage unsigned long xen_save_fl(void) +asmlinkage __visible unsigned long xen_save_fl(void) { struct vcpu_info *vcpu; unsigned long flags; @@ -63,7 +63,7 @@ __visible void xen_restore_fl(unsigned long flags) } PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl); -asmlinkage void xen_irq_disable(void) +asmlinkage __visible void xen_irq_disable(void) { /* There's a one instruction preempt window here. We need to make sure we're don't switch CPUs between getting the vcpu @@ -74,7 +74,7 @@ asmlinkage void xen_irq_disable(void) } PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable); -asmlinkage void xen_irq_enable(void) +asmlinkage __visible void xen_irq_enable(void) { struct vcpu_info *vcpu; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 2423ef04ffe..6f6e15d2846 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2058,7 +2058,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) case FIX_RO_IDT: #ifdef CONFIG_X86_32 case FIX_WP_TEST: - case FIX_VDSO: # ifdef CONFIG_HIGHMEM case FIX_KMAP_BEGIN ... FIX_KMAP_END: # endif @@ -2511,6 +2510,95 @@ void __init xen_hvm_init_mmu_ops(void) } #endif +#ifdef CONFIG_XEN_PVH +/* + * Map foreign gfn (fgfn), to local pfn (lpfn). This for the user + * space creating new guest on pvh dom0 and needing to map domU pages. + */ +static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn, + unsigned int domid) +{ + int rc, err = 0; + xen_pfn_t gpfn = lpfn; + xen_ulong_t idx = fgfn; + + struct xen_add_to_physmap_range xatp = { + .domid = DOMID_SELF, + .foreign_domid = domid, + .size = 1, + .space = XENMAPSPACE_gmfn_foreign, + }; + set_xen_guest_handle(xatp.idxs, &idx); + set_xen_guest_handle(xatp.gpfns, &gpfn); + set_xen_guest_handle(xatp.errs, &err); + + rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp); + if (rc < 0) + return rc; + return err; +} + +static int xlate_remove_from_p2m(unsigned long spfn, int count) +{ + struct xen_remove_from_physmap xrp; + int i, rc; + + for (i = 0; i < count; i++) { + xrp.domid = DOMID_SELF; + xrp.gpfn = spfn+i; + rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp); + if (rc) + break; + } + return rc; +} + +struct xlate_remap_data { + unsigned long fgfn; /* foreign domain's gfn */ + pgprot_t prot; + domid_t domid; + int index; + struct page **pages; +}; + +static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, + void *data) +{ + int rc; + struct xlate_remap_data *remap = data; + unsigned long pfn = page_to_pfn(remap->pages[remap->index++]); + pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot)); + + rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid); + if (rc) + return rc; + native_set_pte(ptep, pteval); + + return 0; +} + +static int xlate_remap_gfn_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long mfn, + int nr, pgprot_t prot, unsigned domid, + struct page **pages) +{ + int err; + struct xlate_remap_data pvhdata; + + BUG_ON(!pages); + + pvhdata.fgfn = mfn; + pvhdata.prot = prot; + pvhdata.domid = domid; + pvhdata.index = 0; + pvhdata.pages = pages; + err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT, + xlate_map_pte_fn, &pvhdata); + flush_tlb_all(); + return err; +} +#endif + #define REMAP_BATCH_SIZE 16 struct remap_data { @@ -2523,7 +2611,7 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, void *data) { struct remap_data *rmd = data; - pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot)); + pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot)); rmd->mmu_update->ptr = virt_to_machine(ptep).maddr; rmd->mmu_update->val = pte_val_ma(pte); @@ -2545,13 +2633,18 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, unsigned long range; int err = 0; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return -EINVAL; - - prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); - BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); + if (xen_feature(XENFEAT_auto_translated_physmap)) { +#ifdef CONFIG_XEN_PVH + /* We need to update the local page tables and the xen HAP */ + return xlate_remap_gfn_range(vma, addr, mfn, nr, prot, + domid, pages); +#else + return -EINVAL; +#endif + } + rmd.mfn = mfn; rmd.prot = prot; @@ -2589,6 +2682,25 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) return 0; +#ifdef CONFIG_XEN_PVH + while (numpgs--) { + /* + * The mmu has already cleaned up the process mmu + * resources at this point (lookup_address will return + * NULL). + */ + unsigned long pfn = page_to_pfn(pages[numpgs]); + + xlate_remove_from_p2m(pfn, 1); + } + /* + * We don't need to flush tlbs because as part of + * xlate_remove_from_p2m, the hypervisor will do tlb flushes + * after removing the p2m entries from the EPT/NPT + */ + return 0; +#else return -EINVAL; +#endif } EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 696c694986d..9bb3d82ffec 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -36,7 +36,7 @@ * pfn_to_mfn(0xc0000)=0xc0000 * * The benefit of this is, that we can assume for non-RAM regions (think - * PCI BARs, or ACPI spaces), we can create mappings easily b/c we + * PCI BARs, or ACPI spaces), we can create mappings easily because we * get the PFN value to match the MFN. * * For this to work efficiently we have one new page p2m_identity and @@ -60,7 +60,7 @@ * There is also a digram of the P2M at the end that can help. * Imagine your E820 looking as so: * - * 1GB 2GB + * 1GB 2GB 4GB * /-------------------+---------\/----\ /----------\ /---+-----\ * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM | * \-------------------+---------/\----/ \----------/ \---+-----/ @@ -77,9 +77,8 @@ * of the PFN and the end PFN (263424 and 512256 respectively). The first step * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page * covers 512^2 of page estate (1GB) and in case the start or end PFN is not - * aligned on 512^2*PAGE_SIZE (1GB) we loop on aligned 1GB PFNs from start pfn - * to end pfn. We reserve_brk top leaf pages if they are missing (means they - * point to p2m_mid_missing). + * aligned on 512^2*PAGE_SIZE (1GB) we reserve_brk new middle and leaf pages as + * required to split any existing p2m_mid_missing middle pages. * * With the E820 example above, 263424 is not 1GB aligned so we allocate a * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000. @@ -88,7 +87,7 @@ * Next stage is to determine if we need to do a more granular boundary check * on the 4MB (or 2MB depending on architecture) off the start and end pfn's. * We check if the start pfn and end pfn violate that boundary check, and if - * so reserve_brk a middle (p2m[x][y]) leaf page. This way we have a much finer + * so reserve_brk a (p2m[x][y]) leaf page. This way we have a much finer * granularity of setting which PFNs are missing and which ones are identity. * In our example 263424 and 512256 both fail the check so we reserve_brk two * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing" @@ -102,9 +101,10 @@ * * The next step is to walk from the start pfn to the end pfn setting * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity. - * If we find that the middle leaf is pointing to p2m_missing we can swap it - * over to p2m_identity - this way covering 4MB (or 2MB) PFN space. At this - * point we do not need to worry about boundary aligment (so no need to + * If we find that the middle entry is pointing to p2m_missing we can swap it + * over to p2m_identity - this way covering 4MB (or 2MB) PFN space (and + * similarly swapping p2m_mid_missing for p2m_mid_identity for larger regions). + * At this point we do not need to worry about boundary aligment (so no need to * reserve_brk a middle page, figure out which PFNs are "missing" and which * ones are identity), as that has been done earlier. If we find that the * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference @@ -118,6 +118,9 @@ * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511] * contain the INVALID_P2M_ENTRY value and are considered "missing." * + * Finally, the region beyond the end of of the E820 (4 GB in this example) + * is set to be identity (in case there are MMIO regions placed here). + * * This is what the p2m ends up looking (for the E820 above) with this * fabulous drawing: * @@ -129,21 +132,27 @@ * |-----| \ | [p2m_identity]+\\ | .... | * | 2 |--\ \-------------------->| ... | \\ \----------------/ * |-----| \ \---------------/ \\ - * | 3 |\ \ \\ p2m_identity - * |-----| \ \-------------------->/---------------\ /-----------------\ - * | .. +->+ | [p2m_identity]+-->| ~0, ~0, ~0, ... | - * \-----/ / | [p2m_identity]+-->| ..., ~0 | - * / /---------------\ | .... | \-----------------/ - * / | IDENTITY[@0] | /-+-[x], ~0, ~0.. | - * / | IDENTITY[@256]|<----/ \---------------/ - * / | ~0, ~0, .... | - * | \---------------/ - * | - * p2m_mid_missing p2m_missing - * /-----------------\ /------------\ - * | [p2m_missing] +---->| ~0, ~0, ~0 | - * | [p2m_missing] +---->| ..., ~0 | - * \-----------------/ \------------/ + * | 3 |-\ \ \\ p2m_identity [1] + * |-----| \ \-------------------->/---------------\ /-----------------\ + * | .. |\ | | [p2m_identity]+-->| ~0, ~0, ~0, ... | + * \-----/ | | | [p2m_identity]+-->| ..., ~0 | + * | | | .... | \-----------------/ + * | | +-[x], ~0, ~0.. +\ + * | | \---------------/ \ + * | | \-> /---------------\ + * | V p2m_mid_missing p2m_missing | IDENTITY[@0] | + * | /-----------------\ /------------\ | IDENTITY[@256]| + * | | [p2m_missing] +---->| ~0, ~0, ...| | ~0, ~0, .... | + * | | [p2m_missing] +---->| ..., ~0 | \---------------/ + * | | ... | \------------/ + * | \-----------------/ + * | + * | p2m_mid_identity + * | /-----------------\ + * \-->| [p2m_identity] +---->[1] + * | [p2m_identity] +---->[1] + * | ... | + * \-----------------/ * * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT) */ @@ -187,13 +196,15 @@ static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_identity, P2M_MID_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_identity_mfn, P2M_MID_PER_PAGE); RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); /* We might hit two boundary violations at the start and end, at max each * boundary violation will require three middle nodes. */ -RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3); +RESERVE_BRK(p2m_mid_extra, PAGE_SIZE * 2 * 3); /* When we populate back during bootup, the amount of pages can vary. The * max we have is seen is 395979, but that does not mean it can't be more. @@ -242,20 +253,20 @@ static void p2m_top_mfn_p_init(unsigned long **top) top[i] = p2m_mid_missing_mfn; } -static void p2m_mid_init(unsigned long **mid) +static void p2m_mid_init(unsigned long **mid, unsigned long *leaf) { unsigned i; for (i = 0; i < P2M_MID_PER_PAGE; i++) - mid[i] = p2m_missing; + mid[i] = leaf; } -static void p2m_mid_mfn_init(unsigned long *mid) +static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf) { unsigned i; for (i = 0; i < P2M_MID_PER_PAGE; i++) - mid[i] = virt_to_mfn(p2m_missing); + mid[i] = virt_to_mfn(leaf); } static void p2m_init(unsigned long *p2m) @@ -286,7 +297,9 @@ void __ref xen_build_mfn_list_list(void) /* Pre-initialize p2m_top_mfn to be completely missing */ if (p2m_top_mfn == NULL) { p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(p2m_mid_missing_mfn); + p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); + p2m_mid_identity_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity); p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_top_mfn_p_init(p2m_top_mfn_p); @@ -295,7 +308,8 @@ void __ref xen_build_mfn_list_list(void) p2m_top_mfn_init(p2m_top_mfn); } else { /* Reinitialise, mfn's all change after migration */ - p2m_mid_mfn_init(p2m_mid_missing_mfn); + p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); + p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity); } for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { @@ -327,7 +341,7 @@ void __ref xen_build_mfn_list_list(void) * it too late. */ mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(mid_mfn_p); + p2m_mid_mfn_init(mid_mfn_p, p2m_missing); p2m_top_mfn_p[topidx] = mid_mfn_p; } @@ -365,16 +379,17 @@ void __init xen_build_dynamic_phys_to_machine(void) p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_init(p2m_missing); + p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_init(p2m_identity); p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_init(p2m_mid_missing); + p2m_mid_init(p2m_mid_missing, p2m_missing); + p2m_mid_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(p2m_mid_identity, p2m_identity); p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_top_init(p2m_top); - p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_init(p2m_identity); - /* * The domain builder gives us a pre-constructed p2m array in * mfn_list for all the pages initially given to us, so we just @@ -386,7 +401,7 @@ void __init xen_build_dynamic_phys_to_machine(void) if (p2m_top[topidx] == p2m_mid_missing) { unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_init(mid); + p2m_mid_init(mid, p2m_missing); p2m_top[topidx] = mid; } @@ -492,7 +507,7 @@ unsigned long get_phys_to_machine(unsigned long pfn) unsigned topidx, mididx, idx; if (unlikely(pfn >= MAX_P2M_PFN)) - return INVALID_P2M_ENTRY; + return IDENTITY_FRAME(pfn); topidx = p2m_top_index(pfn); mididx = p2m_mid_index(pfn); @@ -545,7 +560,7 @@ static bool alloc_p2m(unsigned long pfn) if (!mid) return false; - p2m_mid_init(mid); + p2m_mid_init(mid, p2m_missing); if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) free_p2m_page(mid); @@ -565,7 +580,7 @@ static bool alloc_p2m(unsigned long pfn) if (!mid_mfn) return false; - p2m_mid_mfn_init(mid_mfn); + p2m_mid_mfn_init(mid_mfn, p2m_missing); missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); mid_mfn_mfn = virt_to_mfn(mid_mfn); @@ -596,7 +611,7 @@ static bool alloc_p2m(unsigned long pfn) return true; } -static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary) +static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary) { unsigned topidx, mididx, idx; unsigned long *p2m; @@ -638,7 +653,7 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary return true; } -static bool __init early_alloc_p2m(unsigned long pfn) +static bool __init early_alloc_p2m_middle(unsigned long pfn) { unsigned topidx = p2m_top_index(pfn); unsigned long *mid_mfn_p; @@ -649,7 +664,7 @@ static bool __init early_alloc_p2m(unsigned long pfn) if (mid == p2m_mid_missing) { mid = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_init(mid); + p2m_mid_init(mid, p2m_missing); p2m_top[topidx] = mid; @@ -658,12 +673,12 @@ static bool __init early_alloc_p2m(unsigned long pfn) /* And the save/restore P2M tables.. */ if (mid_mfn_p == p2m_mid_missing_mfn) { mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(mid_mfn_p); + p2m_mid_mfn_init(mid_mfn_p, p2m_missing); p2m_top_mfn_p[topidx] = mid_mfn_p; p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); /* Note: we don't set mid_mfn_p[midix] here, - * look in early_alloc_p2m_middle */ + * look in early_alloc_p2m() */ } return true; } @@ -739,7 +754,7 @@ found: /* This shouldn't happen */ if (WARN_ON(p2m_top[topidx] == p2m_mid_missing)) - early_alloc_p2m(set_pfn); + early_alloc_p2m_middle(set_pfn); if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing)) return false; @@ -754,13 +769,13 @@ found: bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) { if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - if (!early_alloc_p2m(pfn)) + if (!early_alloc_p2m_middle(pfn)) return false; if (early_can_reuse_p2m_middle(pfn, mfn)) return __set_phys_to_machine(pfn, mfn); - if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/)) + if (!early_alloc_p2m(pfn, false /* boundary crossover OK!*/)) return false; if (!__set_phys_to_machine(pfn, mfn)) @@ -769,12 +784,30 @@ bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) return true; } + +static void __init early_split_p2m(unsigned long pfn) +{ + unsigned long mididx, idx; + + mididx = p2m_mid_index(pfn); + idx = p2m_index(pfn); + + /* + * Allocate new middle and leaf pages if this pfn lies in the + * middle of one. + */ + if (mididx || idx) + early_alloc_p2m_middle(pfn); + if (idx) + early_alloc_p2m(pfn, false); +} + unsigned long __init set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e) { unsigned long pfn; - if (unlikely(pfn_s >= MAX_P2M_PFN || pfn_e >= MAX_P2M_PFN)) + if (unlikely(pfn_s >= MAX_P2M_PFN)) return 0; if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) @@ -783,19 +816,30 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s, if (pfn_s > pfn_e) return 0; - for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1)); - pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE)); - pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE) - { - WARN_ON(!early_alloc_p2m(pfn)); - } + if (pfn_e > MAX_P2M_PFN) + pfn_e = MAX_P2M_PFN; - early_alloc_p2m_middle(pfn_s, true); - early_alloc_p2m_middle(pfn_e, true); + early_split_p2m(pfn_s); + early_split_p2m(pfn_e); + + for (pfn = pfn_s; pfn < pfn_e;) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); - for (pfn = pfn_s; pfn < pfn_e; pfn++) if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn))) break; + pfn++; + + /* + * If the PFN was set to a middle or leaf identity + * page the remainder must also be identity, so skip + * ahead to the next middle or leaf entry. + */ + if (p2m_top[topidx] == p2m_mid_identity) + pfn = ALIGN(pfn, P2M_MID_PER_PAGE * P2M_PER_PAGE); + else if (p2m_top[topidx][mididx] == p2m_identity) + pfn = ALIGN(pfn, P2M_PER_PAGE); + } if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s), "Identity mapping failed. We are %ld short of 1-1 mappings!\n", @@ -825,8 +869,22 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) /* For sparse holes were the p2m leaf has real PFN along with * PCI holes, stick in the PFN as the MFN value. + * + * set_phys_range_identity() will have allocated new middle + * and leaf pages as required so an existing p2m_mid_missing + * or p2m_missing mean that whole range will be identity so + * these can be switched to p2m_mid_identity or p2m_identity. */ if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) { + if (p2m_top[topidx] == p2m_mid_identity) + return true; + + if (p2m_top[topidx] == p2m_mid_missing) { + WARN_ON(cmpxchg(&p2m_top[topidx], p2m_mid_missing, + p2m_mid_identity) != p2m_mid_missing); + return true; + } + if (p2m_top[topidx][mididx] == p2m_identity) return true; @@ -881,6 +939,65 @@ static unsigned long mfn_hash(unsigned long mfn) return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT); } +int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, + struct gnttab_map_grant_ref *kmap_ops, + struct page **pages, unsigned int count) +{ + int i, ret = 0; + bool lazy = false; + pte_t *pte; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + if (kmap_ops && + !in_interrupt() && + paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { + arch_enter_lazy_mmu_mode(); + lazy = true; + } + + for (i = 0; i < count; i++) { + unsigned long mfn, pfn; + + /* Do not add to override if the map failed. */ + if (map_ops[i].status) + continue; + + if (map_ops[i].flags & GNTMAP_contains_pte) { + pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + + (map_ops[i].host_addr & ~PAGE_MASK)); + mfn = pte_mfn(*pte); + } else { + mfn = PFN_DOWN(map_ops[i].dev_bus_addr); + } + pfn = page_to_pfn(pages[i]); + + WARN_ON(PagePrivate(pages[i])); + SetPagePrivate(pages[i]); + set_page_private(pages[i], mfn); + pages[i]->index = pfn_to_mfn(pfn); + + if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) { + ret = -ENOMEM; + goto out; + } + + if (kmap_ops) { + ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]); + if (ret) + goto out; + } + } + +out: + if (lazy) + arch_leave_lazy_mmu_mode(); + + return ret; +} +EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping); + /* Add an MFN override for a particular page */ int m2p_add_override(unsigned long mfn, struct page *page, struct gnttab_map_grant_ref *kmap_op) @@ -899,13 +1016,6 @@ int m2p_add_override(unsigned long mfn, struct page *page, "m2p_add_override: pfn %lx not mapped", pfn)) return -EINVAL; } - WARN_ON(PagePrivate(page)); - SetPagePrivate(page); - set_page_private(page, mfn); - page->index = pfn_to_mfn(pfn); - - if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) - return -ENOMEM; if (kmap_op != NULL) { if (!PageHighMem(page)) { @@ -943,20 +1053,62 @@ int m2p_add_override(unsigned long mfn, struct page *page, return 0; } EXPORT_SYMBOL_GPL(m2p_add_override); + +int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, + struct gnttab_map_grant_ref *kmap_ops, + struct page **pages, unsigned int count) +{ + int i, ret = 0; + bool lazy = false; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + if (kmap_ops && + !in_interrupt() && + paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { + arch_enter_lazy_mmu_mode(); + lazy = true; + } + + for (i = 0; i < count; i++) { + unsigned long mfn = get_phys_to_machine(page_to_pfn(pages[i])); + unsigned long pfn = page_to_pfn(pages[i]); + + if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) { + ret = -EINVAL; + goto out; + } + + set_page_private(pages[i], INVALID_P2M_ENTRY); + WARN_ON(!PagePrivate(pages[i])); + ClearPagePrivate(pages[i]); + set_phys_to_machine(pfn, pages[i]->index); + + if (kmap_ops) + ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn); + if (ret) + goto out; + } + +out: + if (lazy) + arch_leave_lazy_mmu_mode(); + return ret; +} +EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping); + int m2p_remove_override(struct page *page, - struct gnttab_map_grant_ref *kmap_op) + struct gnttab_map_grant_ref *kmap_op, + unsigned long mfn) { unsigned long flags; - unsigned long mfn; unsigned long pfn; unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; pfn = page_to_pfn(page); - mfn = get_phys_to_machine(pfn); - if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) - return -EINVAL; if (!PageHighMem(page)) { address = (unsigned long)__va(pfn << PAGE_SHIFT); @@ -970,10 +1122,7 @@ int m2p_remove_override(struct page *page, spin_lock_irqsave(&m2p_override_lock, flags); list_del(&page->lru); spin_unlock_irqrestore(&m2p_override_lock, flags); - WARN_ON(!PagePrivate(page)); - ClearPagePrivate(page); - set_phys_to_machine(pfn, page->index); if (kmap_op != NULL) { if (!PageHighMem(page)) { struct multicall_space mcs; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 0982233b9b8..210426a26cc 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -89,10 +89,10 @@ static void __init xen_add_extra_mem(u64 start, u64 size) for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) { unsigned long mfn = pfn_to_mfn(pfn); - if (WARN(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn)) + if (WARN_ONCE(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn)) continue; - WARN(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n", - pfn, mfn); + WARN_ONCE(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n", + pfn, mfn); __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } @@ -469,6 +469,15 @@ char * __init xen_memory_setup(void) } /* + * Set the rest as identity mapped, in case PCI BARs are + * located here. + * + * PFNs above MAX_P2M_PFN are considered identity mapped as + * well. + */ + set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul); + + /* * In domU, the ISA region is normal, usable memory, but we * reserve ISA memory anyway because too many things poke * about in there. diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index a18eadd8bb4..7005974c3ff 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -441,10 +441,11 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) irq_ctx_init(cpu); #else clear_tsk_thread_flag(idle, TIF_FORK); +#endif per_cpu(kernel_stack, cpu) = (unsigned long)task_stack_page(idle) - KERNEL_STACK_OFFSET + THREAD_SIZE; -#endif + xen_setup_runstate_info(cpu); xen_setup_timer(cpu); xen_init_lock_cpu(cpu); diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 581521c843a..0ba5f3b967f 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -183,7 +183,7 @@ __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) local_irq_save(flags); - kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); + kstat_incr_irq_this_cpu(irq); out: cpumask_clear_cpu(cpu, &waiting_cpus); w->lock = NULL; @@ -274,7 +274,7 @@ void __init xen_init_spinlocks(void) printk(KERN_DEBUG "xen: PV spinlocks disabled\n"); return; } - + printk(KERN_DEBUG "xen: PV spinlocks enabled\n"); pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning); pv_lock_ops.unlock_kick = xen_unlock_kick; } @@ -290,6 +290,9 @@ static __init int xen_init_spinlocks_jump(void) if (!xen_pvspin) return 0; + if (!xen_domain()) + return 0; + static_key_slow_inc(¶virt_ticketlocks_enabled); return 0; } diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 45329c8c226..c4df9dbd63b 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -12,8 +12,10 @@ #include "xen-ops.h" #include "mmu.h" -void xen_arch_pre_suspend(void) +static void xen_pv_pre_suspend(void) { + xen_mm_pin_all(); + xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); xen_start_info->console.domU.mfn = mfn_to_pfn(xen_start_info->console.domU.mfn); @@ -26,7 +28,7 @@ void xen_arch_pre_suspend(void) BUG(); } -void xen_arch_hvm_post_suspend(int suspend_cancelled) +static void xen_hvm_post_suspend(int suspend_cancelled) { #ifdef CONFIG_XEN_PVHVM int cpu; @@ -41,7 +43,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled) #endif } -void xen_arch_post_suspend(int suspend_cancelled) +static void xen_pv_post_suspend(int suspend_cancelled) { xen_build_mfn_list_list(); @@ -60,6 +62,21 @@ void xen_arch_post_suspend(int suspend_cancelled) xen_vcpu_restore(); } + xen_mm_unpin_all(); +} + +void xen_arch_pre_suspend(void) +{ + if (xen_pv_domain()) + xen_pv_pre_suspend(); +} + +void xen_arch_post_suspend(int cancelled) +{ + if (xen_pv_domain()) + xen_pv_post_suspend(cancelled); + else + xen_hvm_post_suspend(cancelled); } static void xen_vcpu_notify_restore(void *data) diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index 33ca6e42a4c..fd92a64d748 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -75,6 +75,17 @@ ENDPROC(xen_sysexit) * stack state in whatever form its in, we keep things simple by only * using a single register which is pushed/popped on the stack. */ + +.macro POP_FS +1: + popw %fs +.pushsection .fixup, "ax" +2: movw $0, (%esp) + jmp 1b +.popsection + _ASM_EXTABLE(1b,2b) +.endm + ENTRY(xen_iret) /* test eflags for special cases */ testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp) @@ -83,15 +94,13 @@ ENTRY(xen_iret) push %eax ESP_OFFSET=4 # bytes pushed onto stack - /* - * Store vcpu_info pointer for easy access. Do it this way to - * avoid having to reload %fs - */ + /* Store vcpu_info pointer for easy access */ #ifdef CONFIG_SMP - GET_THREAD_INFO(%eax) - movl %ss:TI_cpu(%eax), %eax - movl %ss:__per_cpu_offset(,%eax,4), %eax - mov %ss:xen_vcpu(%eax), %eax + pushw %fs + movl $(__KERNEL_PERCPU), %eax + movl %eax, %fs + movl %fs:xen_vcpu, %eax + POP_FS #else movl %ss:xen_vcpu, %eax #endif diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 1cb6f4c3730..c834d4b231f 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -31,6 +31,8 @@ void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_reserve_top(void); extern unsigned long xen_max_p2m_pfn; +void xen_mm_pin_all(void); +void xen_mm_unpin_all(void); void xen_set_pat(u64); char * __init xen_memory_setup(void); |