diff options
Diffstat (limited to 'arch/x86')
131 files changed, 2807 insertions, 1158 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f2327e88e07..666ac6651c1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -142,6 +142,10 @@ config INSTRUCTION_DECODER def_bool y depends on KPROBES || PERF_EVENTS || UPROBES +config PERF_EVENTS_INTEL_UNCORE + def_bool y + depends on PERF_EVENTS && CPU_SUP_INTEL && PCI + config OUTPUT_FORMAT string default "elf32-i386" if X86_32 @@ -244,6 +248,10 @@ config HAVE_INTEL_TXT def_bool y depends on INTEL_IOMMU && ACPI +config X86_INTEL_MPX + def_bool y + depends on CPU_SUP_INTEL + config X86_32_SMP def_bool y depends on X86_32 && SMP diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 704f58aa79c..d999398928b 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -3,6 +3,18 @@ # # create a compressed vmlinux image from the original vmlinux # +# vmlinuz is: +# decompression code (*.o) +# asm globals (piggy.S), including: +# vmlinux.bin.(gz|bz2|lzma|...) +# +# vmlinux.bin is: +# vmlinux stripped of debugging and comments +# vmlinux.bin.all is: +# vmlinux.bin + vmlinux.relocs +# vmlinux.bin.(gz|bz2|lzma|...) is: +# (see scripts/Makefile.lib size_append) +# compressed vmlinux.bin.all + u32 size of vmlinux.bin.all targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 @@ -35,7 +47,8 @@ vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/aslr.o $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone -vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o +vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \ + $(objtree)/drivers/firmware/efi/libstub/lib.a $(obj)/vmlinux: $(vmlinux-objs-y) FORCE $(call if_changed,ld) @@ -76,8 +89,10 @@ suffix-$(CONFIG_KERNEL_XZ) := xz suffix-$(CONFIG_KERNEL_LZO) := lzo suffix-$(CONFIG_KERNEL_LZ4) := lz4 +RUN_SIZE = $(shell $(OBJDUMP) -h vmlinux | \ + perl $(srctree)/arch/x86/tools/calc_run_size.pl) quiet_cmd_mkpiggy = MKPIGGY $@ - cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) + cmd_mkpiggy = $(obj)/mkpiggy $< $(RUN_SIZE) > $@ || ( rm -f $@ ; false ) targets += piggy.S $(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index de8eebd6f67..92b9a5f2aed 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -21,8 +21,10 @@ static efi_system_table_t *sys_table; static struct efi_config *efi_early; -#define efi_call_early(f, ...) \ - efi_early->call(efi_early->f, __VA_ARGS__); +__pure const struct efi_config *__efi_early(void) +{ + return efi_early; +} #define BOOT_SERVICES(bits) \ static void setup_boot_services##bits(struct efi_config *c) \ @@ -285,8 +287,6 @@ void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str) } } -#include "../../../../drivers/firmware/efi/libstub/efi-stub-helper.c" - static void find_bits(unsigned long mask, u8 *pos, u8 *size) { u8 first, len; @@ -330,8 +330,10 @@ __setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom) size = pci->romsize + sizeof(*rom); status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "Failed to alloc mem for rom\n"); return status; + } memset(rom, 0, sizeof(*rom)); @@ -344,14 +346,18 @@ __setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom) status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16, PCI_VENDOR_ID, 1, &(rom->vendor)); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "Failed to read rom->vendor\n"); goto free_struct; + } status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16, PCI_DEVICE_ID, 1, &(rom->devid)); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "Failed to read rom->devid\n"); goto free_struct; + } status = efi_early->call(pci->get_location, pci, &(rom->segment), &(rom->bus), &(rom->device), &(rom->function)); @@ -432,8 +438,10 @@ __setup_efi_pci64(efi_pci_io_protocol_64 *pci, struct pci_setup_rom **__rom) size = pci->romsize + sizeof(*rom); status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "Failed to alloc mem for rom\n"); return status; + } rom->data.type = SETUP_PCI; rom->data.len = size - sizeof(struct setup_data); @@ -444,14 +452,18 @@ __setup_efi_pci64(efi_pci_io_protocol_64 *pci, struct pci_setup_rom **__rom) status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16, PCI_VENDOR_ID, 1, &(rom->vendor)); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "Failed to read rom->vendor\n"); goto free_struct; + } status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16, PCI_DEVICE_ID, 1, &(rom->devid)); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "Failed to read rom->devid\n"); goto free_struct; + } status = efi_early->call(pci->get_location, pci, &(rom->segment), &(rom->bus), &(rom->device), &(rom->function)); @@ -538,8 +550,10 @@ static void setup_efi_pci(struct boot_params *params) EFI_LOADER_DATA, size, (void **)&pci_handle); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "Failed to alloc mem for pci_handle\n"); return; + } status = efi_call_early(locate_handle, EFI_LOCATE_BY_PROTOCOL, &pci_proto, @@ -1105,6 +1119,10 @@ struct boot_params *make_boot_params(struct efi_config *c) memset(sdt, 0, sizeof(*sdt)); + status = efi_parse_options(cmdline_ptr); + if (status != EFI_SUCCESS) + goto fail2; + status = handle_cmdline_files(sys_table, image, (char *)(unsigned long)hdr->cmd_line_ptr, "initrd=", hdr->initrd_addr_max, diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h index c88c31ecad1..d487e727f1e 100644 --- a/arch/x86/boot/compressed/eboot.h +++ b/arch/x86/boot/compressed/eboot.h @@ -103,20 +103,4 @@ struct efi_uga_draw_protocol { void *blt; }; -struct efi_config { - u64 image_handle; - u64 table; - u64 allocate_pool; - u64 allocate_pages; - u64 get_memory_map; - u64 free_pool; - u64 free_pages; - u64 locate_handle; - u64 handle_protocol; - u64 exit_boot_services; - u64 text_output; - efi_status_t (*call)(unsigned long, ...); - bool is64; -} __packed; - #endif /* BOOT_COMPRESSED_EBOOT_H */ diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index cbed1407a5c..1d7fbbcc196 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -207,7 +207,8 @@ relocated: * Do the decompression, and jump to the new kernel.. */ /* push arguments for decompress_kernel: */ - pushl $z_output_len /* decompressed length */ + pushl $z_run_size /* size of kernel with .bss and .brk */ + pushl $z_output_len /* decompressed length, end of relocs */ leal z_extract_offset_negative(%ebx), %ebp pushl %ebp /* output address */ pushl $z_input_len /* input_len */ @@ -217,7 +218,7 @@ relocated: pushl %eax /* heap area */ pushl %esi /* real mode pointer */ call decompress_kernel /* returns kernel location in %eax */ - addl $24, %esp + addl $28, %esp /* * Jump to the decompressed kernel. diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 2884e0c3e8a..6b1766c6c08 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -402,13 +402,16 @@ relocated: * Do the decompression, and jump to the new kernel.. */ pushq %rsi /* Save the real mode argument */ + movq $z_run_size, %r9 /* size of kernel with .bss and .brk */ + pushq %r9 movq %rsi, %rdi /* real mode address */ leaq boot_heap(%rip), %rsi /* malloc area for uncompression */ leaq input_data(%rip), %rdx /* input_data */ movl $z_input_len, %ecx /* input_len */ movq %rbp, %r8 /* output target address */ - movq $z_output_len, %r9 /* decompressed length */ + movq $z_output_len, %r9 /* decompressed length, end of relocs */ call decompress_kernel /* returns kernel location in %rax */ + popq %r9 popq %rsi /* diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 57ab74df7ee..dcc1c536cc2 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -260,7 +260,7 @@ static void handle_relocations(void *output, unsigned long output_len) /* * Process relocations: 32 bit relocations first then 64 bit after. - * Two sets of binary relocations are added to the end of the kernel + * Three sets of binary relocations are added to the end of the kernel * before compression. Each relocation table entry is the kernel * address of the location which needs to be updated stored as a * 32-bit value which is sign extended to 64 bits. @@ -270,6 +270,8 @@ static void handle_relocations(void *output, unsigned long output_len) * kernel bits... * 0 - zero terminator for 64 bit relocations * 64 bit relocation repeated + * 0 - zero terminator for inverse 32 bit relocations + * 32 bit inverse relocation repeated * 0 - zero terminator for 32 bit relocations * 32 bit relocation repeated * @@ -286,6 +288,16 @@ static void handle_relocations(void *output, unsigned long output_len) *(uint32_t *)ptr += delta; } #ifdef CONFIG_X86_64 + while (*--reloc) { + long extended = *reloc; + extended += map; + + ptr = (unsigned long)extended; + if (ptr < min_addr || ptr > max_addr) + error("inverse 32-bit relocation outside of kernel!\n"); + + *(int32_t *)ptr -= delta; + } for (reloc--; *reloc; reloc--) { long extended = *reloc; extended += map; @@ -358,7 +370,8 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, unsigned char *input_data, unsigned long input_len, unsigned char *output, - unsigned long output_len) + unsigned long output_len, + unsigned long run_size) { real_mode = rmode; @@ -381,8 +394,14 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; - output = choose_kernel_location(input_data, input_len, - output, output_len); + /* + * The memory hole needed for the kernel is the larger of either + * the entire decompressed kernel plus relocation table, or the + * entire decompressed kernel plus .bss and .brk sections. + */ + output = choose_kernel_location(input_data, input_len, output, + output_len > run_size ? output_len + : run_size); /* Validate memory location choices. */ if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1)) diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c index b669ab65bf6..d8222f21318 100644 --- a/arch/x86/boot/compressed/mkpiggy.c +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -36,11 +36,13 @@ int main(int argc, char *argv[]) uint32_t olen; long ilen; unsigned long offs; + unsigned long run_size; FILE *f = NULL; int retval = 1; - if (argc < 2) { - fprintf(stderr, "Usage: %s compressed_file\n", argv[0]); + if (argc < 3) { + fprintf(stderr, "Usage: %s compressed_file run_size\n", + argv[0]); goto bail; } @@ -74,6 +76,7 @@ int main(int argc, char *argv[]) offs += olen >> 12; /* Add 8 bytes for each 32K block */ offs += 64*1024 + 128; /* Add 64K + 128 bytes slack */ offs = (offs+4095) & ~4095; /* Round to a 4K boundary */ + run_size = atoi(argv[2]); printf(".section \".rodata..compressed\",\"a\",@progbits\n"); printf(".globl z_input_len\n"); @@ -85,6 +88,8 @@ int main(int argc, char *argv[]) /* z_extract_offset_negative allows simplification of head_32.S */ printf(".globl z_extract_offset_negative\n"); printf("z_extract_offset_negative = -0x%lx\n", offs); + printf(".globl z_run_size\n"); + printf("z_run_size = %lu\n", run_size); printf(".globl input_data, input_data_end\n"); printf("input_data:\n"); diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 32d2e7056c8..419819d6dab 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -8,6 +8,7 @@ CONFIG_TASKSTATS=y CONFIG_TASK_DELAY_ACCT=y CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_FHANDLE=y CONFIG_AUDIT=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index a481dd4755d..4c311ddd973 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -7,6 +7,7 @@ CONFIG_TASKSTATS=y CONFIG_TASK_DELAY_ACCT=y CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_FHANDLE=y CONFIG_AUDIT=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 8ffba18395c..ffe71228fc1 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -157,7 +157,7 @@ ENTRY(ia32_sysenter_target) * ourselves. To save a few cycles, we can check whether * NT was set instead of doing an unconditional popfq. */ - testl $X86_EFLAGS_NT,EFLAGS(%rsp) /* saved EFLAGS match cpu */ + testl $X86_EFLAGS_NT,EFLAGS-ARGOFFSET(%rsp) jnz sysenter_fix_flags sysenter_flags_fixed: diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 9863ee3747d..47c8e32f621 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -5,65 +5,6 @@ #include <asm-generic/cacheflush.h> #include <asm/special_insns.h> -#ifdef CONFIG_X86_PAT -/* - * X86 PAT uses page flags WC and Uncached together to keep track of - * memory type of pages that have backing page struct. X86 PAT supports 3 - * different memory types, _PAGE_CACHE_WB, _PAGE_CACHE_WC and - * _PAGE_CACHE_UC_MINUS and fourth state where page's memory type has not - * been changed from its default (value of -1 used to denote this). - * Note we do not support _PAGE_CACHE_UC here. - */ - -#define _PGMT_DEFAULT 0 -#define _PGMT_WC (1UL << PG_arch_1) -#define _PGMT_UC_MINUS (1UL << PG_uncached) -#define _PGMT_WB (1UL << PG_uncached | 1UL << PG_arch_1) -#define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1) -#define _PGMT_CLEAR_MASK (~_PGMT_MASK) - -static inline unsigned long get_page_memtype(struct page *pg) -{ - unsigned long pg_flags = pg->flags & _PGMT_MASK; - - if (pg_flags == _PGMT_DEFAULT) - return -1; - else if (pg_flags == _PGMT_WC) - return _PAGE_CACHE_WC; - else if (pg_flags == _PGMT_UC_MINUS) - return _PAGE_CACHE_UC_MINUS; - else - return _PAGE_CACHE_WB; -} - -static inline void set_page_memtype(struct page *pg, unsigned long memtype) -{ - unsigned long memtype_flags = _PGMT_DEFAULT; - unsigned long old_flags; - unsigned long new_flags; - - switch (memtype) { - case _PAGE_CACHE_WC: - memtype_flags = _PGMT_WC; - break; - case _PAGE_CACHE_UC_MINUS: - memtype_flags = _PGMT_UC_MINUS; - break; - case _PAGE_CACHE_WB: - memtype_flags = _PGMT_WB; - break; - } - - do { - old_flags = pg->flags; - new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags; - } while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags); -} -#else -static inline unsigned long get_page_memtype(struct page *pg) { return -1; } -static inline void set_page_memtype(struct page *pg, unsigned long memtype) { } -#endif - /* * The set_memory_* API can be used to change various attributes of a virtual * address range. The attributes include: diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 97534a7d38e..f226df06466 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -10,6 +10,12 @@ * cpu_feature_enabled(). */ +#ifdef CONFIG_X86_INTEL_MPX +# define DISABLE_MPX 0 +#else +# define DISABLE_MPX (1<<(X86_FEATURE_MPX & 31)) +#endif + #ifdef CONFIG_X86_64 # define DISABLE_VME (1<<(X86_FEATURE_VME & 31)) # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) @@ -34,6 +40,6 @@ #define DISABLED_MASK6 0 #define DISABLED_MASK7 0 #define DISABLED_MASK8 0 -#define DISABLED_MASK9 0 +#define DISABLED_MASK9 (DISABLE_MPX) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 0ec241ede5a..25bce45c6fc 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -81,24 +81,23 @@ extern u64 asmlinkage efi_call(void *fp, ...); */ #define __efi_call_virt(f, args...) efi_call_virt(f, args) -extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, - u32 type, u64 attribute); +extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size, + u32 type, u64 attribute); #endif /* CONFIG_X86_32 */ -extern int add_efi_memmap; extern struct efi_scratch efi_scratch; -extern void efi_set_executable(efi_memory_desc_t *md, bool executable); -extern int efi_memblock_x86_reserve_range(void); -extern void efi_call_phys_prelog(void); -extern void efi_call_phys_epilog(void); -extern void efi_unmap_memmap(void); -extern void efi_memory_uc(u64 addr, unsigned long size); +extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable); +extern int __init efi_memblock_x86_reserve_range(void); +extern void __init efi_call_phys_prolog(void); +extern void __init efi_call_phys_epilog(void); +extern void __init efi_unmap_memmap(void); +extern void __init efi_memory_uc(u64 addr, unsigned long size); extern void __init efi_map_region(efi_memory_desc_t *md); extern void __init efi_map_region_fixed(efi_memory_desc_t *md); extern void efi_sync_low_kernel_mappings(void); -extern int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages); -extern void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages); +extern int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages); +extern void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages); extern void __init old_map_region(efi_memory_desc_t *md); extern void __init runtime_code_page_mkexec(void); extern void __init efi_runtime_mkexec(void); @@ -159,19 +158,33 @@ static inline efi_status_t efi_thunk_set_virtual_address_map( } #endif /* CONFIG_EFI_MIXED */ + +/* arch specific definitions used by the stub code */ + +struct efi_config { + u64 image_handle; + u64 table; + u64 allocate_pool; + u64 allocate_pages; + u64 get_memory_map; + u64 free_pool; + u64 free_pages; + u64 locate_handle; + u64 handle_protocol; + u64 exit_boot_services; + u64 text_output; + efi_status_t (*call)(unsigned long, ...); + bool is64; +} __packed; + +__pure const struct efi_config *__efi_early(void); + +#define efi_call_early(f, ...) \ + __efi_early()->call(__efi_early()->f, __VA_ARGS__); + extern bool efi_reboot_required(void); #else -/* - * IF EFI is not configured, have the EFI calls return -ENOSYS. - */ -#define efi_call0(_f) (-ENOSYS) -#define efi_call1(_f, _a1) (-ENOSYS) -#define efi_call2(_f, _a1, _a2) (-ENOSYS) -#define efi_call3(_f, _a1, _a2, _a3) (-ENOSYS) -#define efi_call4(_f, _a1, _a2, _a3, _a4) (-ENOSYS) -#define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS) -#define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS) static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {} static inline bool efi_reboot_required(void) { diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h index 2519d0679d9..c3dd5e71f43 100644 --- a/arch/x86/include/asm/fb.h +++ b/arch/x86/include/asm/fb.h @@ -8,8 +8,12 @@ static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma, unsigned long off) { + unsigned long prot; + + prot = pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK; if (boot_cpu_data.x86 > 3) - pgprot_val(vma->vm_page_prot) |= _PAGE_PCD; + pgprot_val(vma->vm_page_prot) = + prot | cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS); } extern int fb_is_primary_device(struct fb_info *info); diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index ffb1733ac91..bf728e49c53 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -136,9 +136,7 @@ enum fixed_addresses { extern void reserve_top_address(unsigned long reserve); #define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) -#define FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) -#define FIXADDR_BOOT_START (FIXADDR_TOP - FIXADDR_BOOT_SIZE) extern int fixmaps_set; diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 302a323b3f6..04e9d023168 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -38,17 +38,20 @@ extern unsigned long highstart_pfn, highend_pfn; /* * Ordering is: * - * FIXADDR_TOP - * fixed_addresses - * FIXADDR_START - * temp fixed addresses - * FIXADDR_BOOT_START - * Persistent kmap area - * PKMAP_BASE - * VMALLOC_END - * Vmalloc area - * VMALLOC_START - * high_memory + * high memory on: high_memory off: + * FIXADDR_TOP FIXADDR_TOP + * fixed addresses fixed addresses + * FIXADDR_START FIXADDR_START + * temp fixed addresses/persistent kmap area VMALLOC_END + * PKMAP_BASE temp fixed addresses/vmalloc area + * VMALLOC_END VMALLOC_START + * vmalloc area high_memory + * VMALLOC_START + * high_memory + * + * The temp fixed area is only used during boot for early_ioremap(), and + * it is unused when the ioremap() is functional. vmalloc/pkmap area become + * available after early boot so the temp fixed area is available for re-use. */ #define LAST_PKMAP_MASK (LAST_PKMAP-1) #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 48eb30a8606..47f29b1d184 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -65,6 +65,7 @@ struct insn { unsigned char x86_64; const insn_byte_t *kaddr; /* kernel address of insn to analyze */ + const insn_byte_t *end_kaddr; /* kernel address of last insn in buffer */ const insn_byte_t *next_byte; }; @@ -96,7 +97,7 @@ struct insn { #define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ -extern void insn_init(struct insn *insn, const void *kaddr, int x86_64); +extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); extern void insn_get_prefixes(struct insn *insn); extern void insn_get_opcode(struct insn *insn); extern void insn_get_modrm(struct insn *insn); @@ -115,12 +116,13 @@ static inline void insn_get_attribute(struct insn *insn) extern int insn_rip_relative(struct insn *insn); /* Init insn for kernel text */ -static inline void kernel_insn_init(struct insn *insn, const void *kaddr) +static inline void kernel_insn_init(struct insn *insn, + const void *kaddr, int buf_len) { #ifdef CONFIG_X86_64 - insn_init(insn, kaddr, 1); + insn_init(insn, kaddr, buf_len, 1); #else /* CONFIG_X86_32 */ - insn_init(insn, kaddr, 0); + insn_init(insn, kaddr, buf_len, 0); #endif } diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index b8237d8a1e0..34a5b93704d 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -74,6 +74,9 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #define __raw_readw __readw #define __raw_readl __readl +#define writeb_relaxed(v, a) __writeb(v, a) +#define writew_relaxed(v, a) __writew(v, a) +#define writel_relaxed(v, a) __writel(v, a) #define __raw_writeb __writeb #define __raw_writew __writew #define __raw_writel __writel @@ -86,6 +89,7 @@ build_mmio_read(readq, "q", unsigned long, "=r", :"memory") build_mmio_write(writeq, "q", unsigned long, "r", :"memory") #define readq_relaxed(a) readq(a) +#define writeq_relaxed(v, a) writeq(v, a) #define __raw_readq(a) readq(a) #define __raw_writeq(val, addr) writeq(val, addr) @@ -310,11 +314,11 @@ BUILDIO(b, b, char) BUILDIO(w, w, short) BUILDIO(l, , int) -extern void *xlate_dev_mem_ptr(unsigned long phys); -extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr); +extern void *xlate_dev_mem_ptr(phys_addr_t phys); +extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, - unsigned long prot_val); + enum page_cache_mode pcm); extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); extern bool is_early_ioremap_ptep(pte_t *ptep); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7d603a71ab3..6ed0c30d6a0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -989,6 +989,20 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); } +static inline u64 get_canonical(u64 la) +{ + return ((int64_t)la << 16) >> 16; +} + +static inline bool is_noncanonical_address(u64 la) +{ +#ifdef CONFIG_X86_64 + return get_canonical(la) != la; +#else + return false; +#endif +} + #define TSS_IOPB_BASE_OFFSET 0x66 #define TSS_BASE_SIZE 0x68 #define TSS_IOPB_SIZE (65536 / 8) @@ -1050,7 +1064,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, unsigned long address); void kvm_define_shared_msr(unsigned index, u32 msr); -void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); +int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 166af2a8e86..40269a2bf6f 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -10,9 +10,8 @@ #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/paravirt.h> +#include <asm/mpx.h> #ifndef CONFIG_PARAVIRT -#include <asm-generic/mm_hooks.h> - static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) { @@ -53,7 +52,16 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* Stop flush ipis for the previous mm */ cpumask_clear_cpu(cpu, mm_cpumask(prev)); - /* Load the LDT, if the LDT is different: */ + /* + * Load the LDT, if the LDT is different. + * + * It's possible leave_mm(prev) has been called. If so, + * then prev->context.ldt could be out of sync with the + * LDT descriptor or the LDT register. This can only happen + * if prev->context.ldt is non-null, since we never free + * an LDT. But LDTs can't be shared across mms, so + * prev->context.ldt won't be equal to next->context.ldt. + */ if (unlikely(prev->context.ldt != next->context.ldt)) load_LDT_nolock(&next->context); } @@ -102,4 +110,27 @@ do { \ } while (0) #endif +static inline void arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) +{ + paravirt_arch_dup_mmap(oldmm, mm); +} + +static inline void arch_exit_mmap(struct mm_struct *mm) +{ + paravirt_arch_exit_mmap(mm); +} + +static inline void arch_bprm_mm_init(struct mm_struct *mm, + struct vm_area_struct *vma) +{ + mpx_mm_init(mm); +} + +static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + mpx_notify_unmap(mm, vma, start, end); +} + #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h new file mode 100644 index 00000000000..a952a13d59a --- /dev/null +++ b/arch/x86/include/asm/mpx.h @@ -0,0 +1,103 @@ +#ifndef _ASM_X86_MPX_H +#define _ASM_X86_MPX_H + +#include <linux/types.h> +#include <asm/ptrace.h> +#include <asm/insn.h> + +/* + * NULL is theoretically a valid place to put the bounds + * directory, so point this at an invalid address. + */ +#define MPX_INVALID_BOUNDS_DIR ((void __user *)-1) +#define MPX_BNDCFG_ENABLE_FLAG 0x1 +#define MPX_BD_ENTRY_VALID_FLAG 0x1 + +#ifdef CONFIG_X86_64 + +/* upper 28 bits [47:20] of the virtual address in 64-bit used to + * index into bounds directory (BD). + */ +#define MPX_BD_ENTRY_OFFSET 28 +#define MPX_BD_ENTRY_SHIFT 3 +/* bits [19:3] of the virtual address in 64-bit used to index into + * bounds table (BT). + */ +#define MPX_BT_ENTRY_OFFSET 17 +#define MPX_BT_ENTRY_SHIFT 5 +#define MPX_IGN_BITS 3 +#define MPX_BD_ENTRY_TAIL 3 + +#else + +#define MPX_BD_ENTRY_OFFSET 20 +#define MPX_BD_ENTRY_SHIFT 2 +#define MPX_BT_ENTRY_OFFSET 10 +#define MPX_BT_ENTRY_SHIFT 4 +#define MPX_IGN_BITS 2 +#define MPX_BD_ENTRY_TAIL 2 + +#endif + +#define MPX_BD_SIZE_BYTES (1UL<<(MPX_BD_ENTRY_OFFSET+MPX_BD_ENTRY_SHIFT)) +#define MPX_BT_SIZE_BYTES (1UL<<(MPX_BT_ENTRY_OFFSET+MPX_BT_ENTRY_SHIFT)) + +#define MPX_BNDSTA_TAIL 2 +#define MPX_BNDCFG_TAIL 12 +#define MPX_BNDSTA_ADDR_MASK (~((1UL<<MPX_BNDSTA_TAIL)-1)) +#define MPX_BNDCFG_ADDR_MASK (~((1UL<<MPX_BNDCFG_TAIL)-1)) +#define MPX_BT_ADDR_MASK (~((1UL<<MPX_BD_ENTRY_TAIL)-1)) + +#define MPX_BNDCFG_ADDR_MASK (~((1UL<<MPX_BNDCFG_TAIL)-1)) +#define MPX_BNDSTA_ERROR_CODE 0x3 + +#define MPX_BD_ENTRY_MASK ((1<<MPX_BD_ENTRY_OFFSET)-1) +#define MPX_BT_ENTRY_MASK ((1<<MPX_BT_ENTRY_OFFSET)-1) +#define MPX_GET_BD_ENTRY_OFFSET(addr) ((((addr)>>(MPX_BT_ENTRY_OFFSET+ \ + MPX_IGN_BITS)) & MPX_BD_ENTRY_MASK) << MPX_BD_ENTRY_SHIFT) +#define MPX_GET_BT_ENTRY_OFFSET(addr) ((((addr)>>MPX_IGN_BITS) & \ + MPX_BT_ENTRY_MASK) << MPX_BT_ENTRY_SHIFT) + +#ifdef CONFIG_X86_INTEL_MPX +siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, + struct xsave_struct *xsave_buf); +int mpx_handle_bd_fault(struct xsave_struct *xsave_buf); +static inline int kernel_managing_mpx_tables(struct mm_struct *mm) +{ + return (mm->bd_addr != MPX_INVALID_BOUNDS_DIR); +} +static inline void mpx_mm_init(struct mm_struct *mm) +{ + /* + * NULL is theoretically a valid place to put the bounds + * directory, so point this at an invalid address. + */ + mm->bd_addr = MPX_INVALID_BOUNDS_DIR; +} +void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long start, unsigned long end); +#else +static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, + struct xsave_struct *xsave_buf) +{ + return NULL; +} +static inline int mpx_handle_bd_fault(struct xsave_struct *xsave_buf) +{ + return -EINVAL; +} +static inline int kernel_managing_mpx_tables(struct mm_struct *mm) +{ + return 0; +} +static inline void mpx_mm_init(struct mm_struct *mm) +{ +} +static inline void mpx_notify_unmap(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ +} +#endif /* CONFIG_X86_INTEL_MPX */ + +#endif /* _ASM_X86_MPX_H */ diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index f48b17df422..3a52ee0e726 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -20,7 +20,6 @@ #define THREAD_SIZE_ORDER 1 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) -#define STACKFAULT_STACK 0 #define DOUBLEFAULT_STACK 1 #define NMI_STACK 0 #define DEBUG_STACK 0 diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 678205195ae..75450b2c7be 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -14,12 +14,11 @@ #define IRQ_STACK_ORDER 2 #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) -#define STACKFAULT_STACK 1 -#define DOUBLEFAULT_STACK 2 -#define NMI_STACK 3 -#define DEBUG_STACK 4 -#define MCE_STACK 5 -#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ +#define DOUBLEFAULT_STACK 1 +#define NMI_STACK 2 +#define DEBUG_STACK 3 +#define MCE_STACK 4 +#define N_EXCEPTION_STACKS 4 /* hw limit: 7 */ #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index cd6e1610e29..32444ae939c 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -330,13 +330,13 @@ static inline void paravirt_activate_mm(struct mm_struct *prev, PVOP_VCALL2(pv_mmu_ops.activate_mm, prev, next); } -static inline void arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) +static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) { PVOP_VCALL2(pv_mmu_ops.dup_mmap, oldmm, mm); } -static inline void arch_exit_mmap(struct mm_struct *mm) +static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { PVOP_VCALL1(pv_mmu_ops.exit_mmap, mm); } @@ -986,5 +986,15 @@ extern void default_banner(void); #endif /* __ASSEMBLY__ */ #else /* CONFIG_PARAVIRT */ # define default_banner x86_init_noop +#ifndef __ASSEMBLY__ +static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) +{ +} + +static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) +{ +} +#endif /* __ASSEMBLY__ */ #endif /* !CONFIG_PARAVIRT */ #endif /* _ASM_X86_PARAVIRT_H */ diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index e2c1668dde7..91bc4ba95f9 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -11,16 +11,17 @@ static const int pat_enabled; #endif extern void pat_init(void); +void pat_init_cache_modes(void); extern int reserve_memtype(u64 start, u64 end, - unsigned long req_type, unsigned long *ret_type); + enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); extern int free_memtype(u64 start, u64 end); extern int kernel_map_sync_memtype(u64 base, unsigned long size, - unsigned long flag); + enum page_cache_mode pcm); int io_reserve_memtype(resource_size_t start, resource_size_t end, - unsigned long *type); + enum page_cache_mode *pcm); void io_free_memtype(resource_size_t start, resource_size_t end); diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index fd472181a1d..e0ba66ca68c 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -64,7 +64,7 @@ #define __percpu_prefix "" #endif -#define __percpu_arg(x) __percpu_prefix "%P" #x +#define __percpu_arg(x) __percpu_prefix "%" #x /* * Initialized pointers to per-cpu variables needed for the boot @@ -179,29 +179,58 @@ do { \ } \ } while (0) -#define percpu_from_op(op, var, constraint) \ +#define percpu_from_op(op, var) \ ({ \ typeof(var) pfo_ret__; \ switch (sizeof(var)) { \ case 1: \ asm(op "b "__percpu_arg(1)",%0" \ : "=q" (pfo_ret__) \ - : constraint); \ + : "m" (var)); \ break; \ case 2: \ asm(op "w "__percpu_arg(1)",%0" \ : "=r" (pfo_ret__) \ - : constraint); \ + : "m" (var)); \ break; \ case 4: \ asm(op "l "__percpu_arg(1)",%0" \ : "=r" (pfo_ret__) \ - : constraint); \ + : "m" (var)); \ break; \ case 8: \ asm(op "q "__percpu_arg(1)",%0" \ : "=r" (pfo_ret__) \ - : constraint); \ + : "m" (var)); \ + break; \ + default: __bad_percpu_size(); \ + } \ + pfo_ret__; \ +}) + +#define percpu_stable_op(op, var) \ +({ \ + typeof(var) pfo_ret__; \ + switch (sizeof(var)) { \ + case 1: \ + asm(op "b "__percpu_arg(P1)",%0" \ + : "=q" (pfo_ret__) \ + : "p" (&(var))); \ + break; \ + case 2: \ + asm(op "w "__percpu_arg(P1)",%0" \ + : "=r" (pfo_ret__) \ + : "p" (&(var))); \ + break; \ + case 4: \ + asm(op "l "__percpu_arg(P1)",%0" \ + : "=r" (pfo_ret__) \ + : "p" (&(var))); \ + break; \ + case 8: \ + asm(op "q "__percpu_arg(P1)",%0" \ + : "=r" (pfo_ret__) \ + : "p" (&(var))); \ break; \ default: __bad_percpu_size(); \ } \ @@ -359,11 +388,11 @@ do { \ * per-thread variables implemented as per-cpu variables and thus * stable for the duration of the respective task. */ -#define this_cpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var))) +#define this_cpu_read_stable(var) percpu_stable_op("mov", var) -#define raw_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define raw_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define raw_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define raw_cpu_read_1(pcp) percpu_from_op("mov", pcp) +#define raw_cpu_read_2(pcp) percpu_from_op("mov", pcp) +#define raw_cpu_read_4(pcp) percpu_from_op("mov", pcp) #define raw_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) #define raw_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) @@ -381,9 +410,9 @@ do { \ #define raw_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val) #define raw_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val) -#define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define this_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define this_cpu_read_1(pcp) percpu_from_op("mov", pcp) +#define this_cpu_read_2(pcp) percpu_from_op("mov", pcp) +#define this_cpu_read_4(pcp) percpu_from_op("mov", pcp) #define this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) #define this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) #define this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) @@ -435,7 +464,7 @@ do { \ * 32 bit must fall back to generic operations. */ #ifdef CONFIG_X86_64 -#define raw_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define raw_cpu_read_8(pcp) percpu_from_op("mov", pcp) #define raw_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) #define raw_cpu_add_8(pcp, val) percpu_add_op((pcp), val) #define raw_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) @@ -444,7 +473,7 @@ do { \ #define raw_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) #define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define this_cpu_read_8(pcp) percpu_from_op("mov", pcp) #define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) #define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) #define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) @@ -522,7 +551,7 @@ static inline int x86_this_cpu_variable_test_bit(int nr, #include <asm-generic/percpu.h> /* We can use this directly for local CPU (faster). */ -DECLARE_PER_CPU(unsigned long, this_cpu_off); +DECLARE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off); #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 8dfc9fd094a..dc0f6ed35b0 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -177,6 +177,9 @@ struct x86_pmu_capability { #define IBS_CAPS_BRNTRGT (1U<<5) #define IBS_CAPS_OPCNTEXT (1U<<6) #define IBS_CAPS_RIPINVALIDCHK (1U<<7) +#define IBS_CAPS_OPBRNFUSE (1U<<8) +#define IBS_CAPS_FETCHCTLEXTD (1U<<9) +#define IBS_CAPS_OPDATA4 (1U<<10) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index aa97a070f09..c112ea63f40 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -9,9 +9,10 @@ /* * Macro to mark a page protection value as UC- */ -#define pgprot_noncached(prot) \ - ((boot_cpu_data.x86 > 3) \ - ? (__pgprot(pgprot_val(prot) | _PAGE_CACHE_UC_MINUS)) \ +#define pgprot_noncached(prot) \ + ((boot_cpu_data.x86 > 3) \ + ? (__pgprot(pgprot_val(prot) | \ + cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \ : (prot)) #ifndef __ASSEMBLY__ @@ -404,8 +405,8 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define canon_pgprot(p) __pgprot(massage_pgprot(p)) static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, - unsigned long flags, - unsigned long new_flags) + enum page_cache_mode pcm, + enum page_cache_mode new_pcm) { /* * PAT type is always WB for untracked ranges, so no need to check. @@ -419,10 +420,10 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, * - request is uncached, return cannot be write-back * - request is write-combine, return cannot be write-back */ - if ((flags == _PAGE_CACHE_UC_MINUS && - new_flags == _PAGE_CACHE_WB) || - (flags == _PAGE_CACHE_WC && - new_flags == _PAGE_CACHE_WB)) { + if ((pcm == _PAGE_CACHE_MODE_UC_MINUS && + new_pcm == _PAGE_CACHE_MODE_WB) || + (pcm == _PAGE_CACHE_MODE_WC && + new_pcm == _PAGE_CACHE_MODE_WB)) { return 0; } diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index ed5903be26f..9fb2f2bc824 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -37,7 +37,7 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ #define LAST_PKMAP 1024 #endif -#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \ +#define PKMAP_BASE ((FIXADDR_START - PAGE_SIZE * (LAST_PKMAP + 1)) \ & PMD_MASK) #ifdef CONFIG_HIGHMEM diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 7166e25ecb5..602b6028c5b 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -63,6 +63,8 @@ typedef struct { pteval_t pte; } pte_t; #define MODULES_LEN (MODULES_END - MODULES_VADDR) #define ESPFIX_PGD_ENTRY _AC(-2, UL) #define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT) +#define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) +#define EFI_VA_END (-68 * (_AC(1, UL) << 30)) #define EARLY_DYNAMIC_PAGE_TABLES 64 diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 07789647bf3..af447f95e3b 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -128,11 +128,28 @@ _PAGE_SOFT_DIRTY | _PAGE_NUMA) #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA) -#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) -#define _PAGE_CACHE_WB (0) -#define _PAGE_CACHE_WC (_PAGE_PWT) -#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD) -#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT) +/* + * The cache modes defined here are used to translate between pure SW usage + * and the HW defined cache mode bits and/or PAT entries. + * + * The resulting bits for PWT, PCD and PAT should be chosen in a way + * to have the WB mode at index 0 (all bits clear). This is the default + * right now and likely would break too much if changed. + */ +#ifndef __ASSEMBLY__ +enum page_cache_mode { + _PAGE_CACHE_MODE_WB = 0, + _PAGE_CACHE_MODE_WC = 1, + _PAGE_CACHE_MODE_UC_MINUS = 2, + _PAGE_CACHE_MODE_UC = 3, + _PAGE_CACHE_MODE_WT = 4, + _PAGE_CACHE_MODE_WP = 5, + _PAGE_CACHE_MODE_NUM = 8 +}; +#endif + +#define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) +#define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ @@ -156,41 +173,27 @@ #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) -#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT) -#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC) -#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) -#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) +#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_NOCACHE) #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) -#define __PAGE_KERNEL_VVAR_NOCACHE (__PAGE_KERNEL_VVAR | _PAGE_PCD | _PAGE_PWT) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) -#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) #define __PAGE_KERNEL_IO (__PAGE_KERNEL) #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) -#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS) -#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC) #define PAGE_KERNEL __pgprot(__PAGE_KERNEL) #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) #define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) -#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC) #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS) -#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE) #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) -#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE) #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) #define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) -#define PAGE_KERNEL_VVAR_NOCACHE __pgprot(__PAGE_KERNEL_VVAR_NOCACHE) #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) #define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) -#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS) -#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC) /* xwr */ #define __P000 PAGE_NONE @@ -341,6 +344,59 @@ static inline pmdval_t pmdnuma_flags(pmd_t pmd) #define pgprot_val(x) ((x).pgprot) #define __pgprot(x) ((pgprot_t) { (x) } ) +extern uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM]; +extern uint8_t __pte2cachemode_tbl[8]; + +#define __pte2cm_idx(cb) \ + ((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) | \ + (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) | \ + (((cb) >> _PAGE_BIT_PWT) & 1)) +#define __cm_idx2pte(i) \ + ((((i) & 4) << (_PAGE_BIT_PAT - 2)) | \ + (((i) & 2) << (_PAGE_BIT_PCD - 1)) | \ + (((i) & 1) << _PAGE_BIT_PWT)) + +static inline unsigned long cachemode2protval(enum page_cache_mode pcm) +{ + if (likely(pcm == 0)) + return 0; + return __cachemode2pte_tbl[pcm]; +} +static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) +{ + return __pgprot(cachemode2protval(pcm)); +} +static inline enum page_cache_mode pgprot2cachemode(pgprot_t pgprot) +{ + unsigned long masked; + + masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK; + if (likely(masked == 0)) + return 0; + return __pte2cachemode_tbl[__pte2cm_idx(masked)]; +} +static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot) +{ + pgprot_t new; + unsigned long val; + + val = pgprot_val(pgprot); + pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | + ((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); + return new; +} +static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot) +{ + pgprot_t new; + unsigned long val; + + val = pgprot_val(pgprot); + pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | + ((val & _PAGE_PAT_LARGE) >> + (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); + return new; +} + typedef struct page *pgtable_t; diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 7024c12f7bf..8f327184253 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -30,9 +30,6 @@ static __always_inline void preempt_count_set(int pc) /* * must be macros to avoid header recursion hell */ -#define task_preempt_count(p) \ - (task_thread_info(p)->saved_preempt_count & ~PREEMPT_NEED_RESCHED) - #define init_task_preempt_count(p) do { \ task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \ } while (0) @@ -105,6 +102,7 @@ static __always_inline bool should_resched(void) # ifdef CONFIG_CONTEXT_TRACKING extern asmlinkage void ___preempt_schedule_context(void); # define __preempt_schedule_context() asm ("call ___preempt_schedule_context") + extern asmlinkage void preempt_schedule_context(void); # endif #endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index eb71ec79473..25b8de0f21c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -127,7 +127,7 @@ struct cpuinfo_x86 { /* Index into per_cpu list: */ u16 cpu_index; u32 microcode; -} __attribute__((__aligned__(SMP_CACHE_BYTES))); +}; #define X86_VENDOR_INTEL 0 #define X86_VENDOR_CYRIX 1 @@ -151,7 +151,7 @@ extern __u32 cpu_caps_cleared[NCAPINTS]; extern __u32 cpu_caps_set[NCAPINTS]; #ifdef CONFIG_SMP -DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); +DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); #define cpu_data(cpu) per_cpu(cpu_info, cpu) #else #define cpu_info boot_cpu_data @@ -374,13 +374,14 @@ struct lwp_struct { u8 reserved[128]; }; -struct bndregs_struct { - u64 bndregs[8]; +struct bndreg { + u64 lower_bound; + u64 upper_bound; } __packed; -struct bndcsr_struct { - u64 cfg_reg_u; - u64 status_reg; +struct bndcsr { + u64 bndcfgu; + u64 bndstatus; } __packed; struct xsave_hdr_struct { @@ -394,8 +395,8 @@ struct xsave_struct { struct xsave_hdr_struct xsave_hdr; struct ymmh_struct ymmh; struct lwp_struct lwp; - struct bndregs_struct bndregs; - struct bndcsr_struct bndcsr; + struct bndreg bndreg[4]; + struct bndcsr bndcsr; /* new processor state extensions will go here */ } __attribute__ ((packed, aligned (64))); @@ -953,6 +954,24 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, extern int get_tsc_mode(unsigned long adr); extern int set_tsc_mode(unsigned int val); +/* Register/unregister a process' MPX related resource */ +#define MPX_ENABLE_MANAGEMENT(tsk) mpx_enable_management((tsk)) +#define MPX_DISABLE_MANAGEMENT(tsk) mpx_disable_management((tsk)) + +#ifdef CONFIG_X86_INTEL_MPX +extern int mpx_enable_management(struct task_struct *tsk); +extern int mpx_disable_management(struct task_struct *tsk); +#else +static inline int mpx_enable_management(struct task_struct *tsk) +{ + return -EINVAL; +} +static inline int mpx_disable_management(struct task_struct *tsk) +{ + return -EINVAL; +} +#endif /* CONFIG_X86_INTEL_MPX */ + extern u16 amd_get_nb_id(int cpu); static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 8cd27e08e23..8cd1cc3bc83 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -150,6 +150,7 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) } void cpu_disable_common(void); +void cpu_die_common(unsigned int cpu); void native_smp_prepare_boot_cpu(void); void native_smp_prepare_cpus(unsigned int max_cpus); void native_smp_cpus_done(unsigned int max_cpus); diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 9295016485c..a4efe477cea 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -183,8 +183,20 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) { - while (arch_spin_is_locked(lock)) + __ticket_t head = ACCESS_ONCE(lock->tickets.head); + + for (;;) { + struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + /* + * We need to check "unlocked" in a loop, tmp.head == head + * can be false positive because of overflow. + */ + if (tmp.head == (tmp.tail & ~TICKET_SLOWPATH_FLAG) || + tmp.head != head) + break; + cpu_relax(); + } } /* diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index d7f3b3b78ac..751bf4b7bf1 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -79,12 +79,12 @@ do { \ #else /* CONFIG_X86_32 */ /* frame pointer must be last for get_wchan */ -#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" +#define SAVE_CONTEXT "pushq %%rbp ; movq %%rsi,%%rbp\n\t" +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\t" #define __EXTRA_CLOBBER \ , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ - "r12", "r13", "r14", "r15" + "r12", "r13", "r14", "r15", "flags" #ifdef CONFIG_CC_STACKPROTECTOR #define __switch_canary \ @@ -100,7 +100,11 @@ do { \ #define __switch_canary_iparam #endif /* CC_STACKPROTECTOR */ -/* Save restore flags to clear handle leaking NT */ +/* + * There is no need to save or restore flags, because flags are always + * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL + * has no effect. + */ #define switch_to(prev, next, last) \ asm volatile(SAVE_CONTEXT \ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 854053889d4..547e344a6dc 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -141,7 +141,7 @@ struct thread_info { /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \ - _TIF_USER_RETURN_NOTIFY) + _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index bc8352e7010..707adc6549d 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -39,6 +39,7 @@ asmlinkage void simd_coprocessor_error(void); #ifdef CONFIG_TRACING asmlinkage void trace_page_fault(void); +#define trace_stack_segment stack_segment #define trace_divide_error divide_error #define trace_bounds bounds #define trace_invalid_op invalid_op diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 2d60a7813df..fc808b83fcc 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -33,8 +33,8 @@ * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set). */ -#define MAX_CPUS_PER_UVHUB 64 -#define MAX_CPUS_PER_SOCKET 32 +#define MAX_CPUS_PER_UVHUB 128 +#define MAX_CPUS_PER_SOCKET 64 #define ADP_SZ 64 /* hardware-provided max. */ #define UV_CPUS_PER_AS 32 /* hardware-provided max. */ #define ITEMS_PER_DESC 8 diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index e45e4da96bf..f58a9c7a3c8 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -172,7 +172,6 @@ struct x86_platform_ops { struct pci_dev; struct msi_msg; -struct msi_desc; struct x86_msi_ops { int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type); @@ -183,8 +182,6 @@ struct x86_msi_ops { void (*teardown_msi_irqs)(struct pci_dev *dev); void (*restore_msi_irqs)(struct pci_dev *dev); int (*setup_hpet_msi)(unsigned int irq, unsigned int id); - u32 (*msi_mask_irq)(struct msi_desc *desc, u32 mask, u32 flag); - u32 (*msix_mask_irq)(struct msi_desc *desc, u32 flag); }; struct IO_APIC_route_entry; diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index e21331ce368..8f02f699075 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -206,6 +206,7 @@ #define MSR_AMD64_IBSOP_REG_MASK ((1UL<<MSR_AMD64_IBSOP_REG_COUNT)-1) #define MSR_AMD64_IBSCTL 0xc001103a #define MSR_AMD64_IBSBRTARGET 0xc001103b +#define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ /* Fam 16h MSRs */ diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 0e79420376e..990a2fe1588 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -67,6 +67,7 @@ #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_INVEPT 50 #define EXIT_REASON_PREEMPTION_TIMER 52 +#define EXIT_REASON_INVVPID 53 #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 #define EXIT_REASON_APIC_WRITE 56 @@ -114,6 +115,7 @@ { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ { EXIT_REASON_INVD, "INVD" }, \ + { EXIT_REASON_INVVPID, "INVVPID" }, \ { EXIT_REASON_INVPCID, "INVPCID" } #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index b436fc735aa..a142e77693e 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -397,7 +397,7 @@ static int mp_register_gsi(struct device *dev, u32 gsi, int trigger, /* Don't set up the ACPI SCI because it's already set up */ if (acpi_gbl_FADT.sci_interrupt == gsi) - return gsi; + return mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC); trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1; polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1; @@ -604,14 +604,18 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp) { - int irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC | IOAPIC_MAP_CHECK); + int irq; - if (irq >= 0) { + if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { + *irqp = gsi; + } else { + irq = mp_map_gsi_to_irq(gsi, + IOAPIC_MAP_ALLOC | IOAPIC_MAP_CHECK); + if (irq < 0) + return -1; *irqp = irq; - return 0; } - - return -1; + return 0; } EXPORT_SYMBOL_GPL(acpi_gsi_to_irq); diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index f04dbb3069b..5caed1dd7cc 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -21,6 +21,7 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, {} @@ -30,6 +31,7 @@ EXPORT_SYMBOL(amd_nb_misc_ids); static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, {} diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 5972b108f15..b708738d016 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -185,8 +185,6 @@ static void apbt_setup_irq(struct apbt_dev *adev) irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); - /* APB timer irqs are set up as mp_irqs, timer is edge type */ - __irq_set_handler(adev->irq, handle_edge_irq, 0, "edge"); } /* Should be called with per cpu */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 00853b254ab..ba6cc041edb 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1297,7 +1297,7 @@ void setup_local_APIC(void) unsigned int value, queued; int i, j, acked = 0; unsigned long long tsc = 0, ntsc; - long long max_loops = cpu_khz; + long long max_loops = cpu_khz ? cpu_khz : 1000000; if (cpu_has_tsc) rdtscll(tsc); @@ -1383,7 +1383,7 @@ void setup_local_APIC(void) break; } if (queued) { - if (cpu_has_tsc) { + if (cpu_has_tsc && cpu_khz) { rdtscll(ntsc); max_loops = (cpu_khz << 10) - (ntsc - tsc); } else diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 4128b5fcb55..c2fd21fed00 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -40,7 +40,7 @@ static unsigned int get_apic_id(unsigned long x) unsigned int id; rdmsrl(MSR_FAM10H_NODE_ID, value); - id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U); + id = ((x >> 24) & 0xffU) | ((value << 2) & 0xff00U); return id; } @@ -145,7 +145,7 @@ static void numachip_send_IPI_all(int vector) static void numachip_send_IPI_self(int vector) { - __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); + apic_write(APIC_SELF_IPI, vector); } static int __init numachip_probe(void) @@ -153,20 +153,8 @@ static int __init numachip_probe(void) return apic == &apic_numachip; } -static void __init map_csrs(void) -{ - printk(KERN_INFO "NumaChip: Mapping local CSR space (%016llx - %016llx)\n", - NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_BASE + NUMACHIP_LCSR_SIZE - 1); - init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE); - - printk(KERN_INFO "NumaChip: Mapping global CSR space (%016llx - %016llx)\n", - NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_BASE + NUMACHIP_GCSR_SIZE - 1); - init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE); -} - static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) { - if (c->phys_proc_id != node) { c->phys_proc_id = node; per_cpu(cpu_llc_id, smp_processor_id()) = node; @@ -175,19 +163,15 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) static int __init numachip_system_init(void) { - unsigned int val; - if (!numachip_system) return 0; + init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE); + init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE); + x86_cpuinit.fixup_cpu_id = fixup_cpu_id; x86_init.pci.arch_init = pci_numachip_init; - map_csrs(); - - val = read_lcsr(CSR_G0_NODE_IDS); - printk(KERN_INFO "NumaChip: Local NodeID = %08x\n", val); - return 0; } early_initcall(numachip_system_init); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1183d545da1..7ffe0a2b870 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3158,7 +3158,7 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); - __write_msi_msg(data->msi_desc, &msg); + __pci_write_msi_msg(data->msi_desc, &msg); return IRQ_SET_MASK_OK_NOCOPY; } @@ -3169,8 +3169,8 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) */ static struct irq_chip msi_chip = { .name = "PCI-MSI", - .irq_unmask = unmask_msi_irq, - .irq_mask = mask_msi_irq, + .irq_unmask = pci_msi_unmask_irq, + .irq_mask = pci_msi_mask_irq, .irq_ack = ack_apic_edge, .irq_set_affinity = msi_set_affinity, .irq_retrigger = ioapic_retrigger_irq, @@ -3196,7 +3196,7 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, * MSI message denotes a contiguous group of IRQs, written for 0th IRQ. */ if (!irq_offset) - write_msi_msg(irq, &msg); + pci_write_msi_msg(irq, &msg); setup_remapped_irq(irq, irq_cfg(irq), chip); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index e7c798b354f..4f9359f36bb 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -48,7 +48,6 @@ int main(void) #define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry) ENTRY(bx); - ENTRY(bx); ENTRY(cx); ENTRY(dx); ENTRY(sp); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 01d5453b550..e27b49d7c92 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -39,9 +39,12 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o perf_event_intel_uncore_snb.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore_snbep.o perf_event_intel_uncore_nhmex.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o + +obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ + perf_event_intel_uncore_snb.o \ + perf_event_intel_uncore_snbep.o \ + perf_event_intel_uncore_nhmex.o endif diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 813d29d00a1..15c5df92f74 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -566,6 +566,17 @@ static void init_amd_k8(struct cpuinfo_x86 *c) if (!c->x86_model_id[0]) strcpy(c->x86_model_id, "Hammer"); + +#ifdef CONFIG_SMP + /* + * Disable TLB flush filter by setting HWCR.FFDIS on K8 + * bit 6 of msr C001_0015 + * + * Errata 63 for SH-B3 steppings + * Errata 122 for all steppings (F+ have it disabled by default) + */ + msr_set_bit(MSR_K7_HWCR, 6); +#endif } static void init_amd_gh(struct cpuinfo_x86 *c) @@ -636,18 +647,6 @@ static void init_amd(struct cpuinfo_x86 *c) { u32 dummy; -#ifdef CONFIG_SMP - /* - * Disable TLB flush filter by setting HWCR.FFDIS on K8 - * bit 6 of msr C001_0015 - * - * Errata 63 for SH-B3 steppings - * Errata 122 for all steppings (F+ have it disabled by default) - */ - if (c->x86 == 0xf) - msr_set_bit(MSR_K7_HWCR, 6); -#endif - early_init_amd(c); /* diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4b4f78c9ba1..cfa9b5b2c27 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -146,6 +146,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); static int __init x86_xsave_setup(char *s) { + if (strlen(s)) + return 0; setup_clear_cpu_cap(X86_FEATURE_XSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); setup_clear_cpu_cap(X86_FEATURE_XSAVES); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 1ef45627317..9cc6b6f25f4 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -213,12 +213,13 @@ static void intel_workarounds(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_F00F_BUG /* - * All current models of Pentium and Pentium with MMX technology CPUs + * All models of Pentium and Pentium with MMX technology CPUs * have the F0 0F bug, which lets nonprivileged users lock up the * system. Announce that the fault handler will be checking for it. + * The Quark is also family 5, but does not have the same bug. */ clear_cpu_bug(c, X86_BUG_F00F); - if (!paravirt_enabled() && c->x86 == 5) { + if (!paravirt_enabled() && c->x86 == 5 && c->x86_model < 9) { static int f00f_workaround_enabled; set_cpu_bug(c, X86_BUG_F00F); diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c index 7aa1acc7978..06674473b0e 100644 --- a/arch/x86/kernel/cpu/microcode/amd_early.c +++ b/arch/x86/kernel/cpu/microcode/amd_early.c @@ -108,12 +108,13 @@ static size_t compute_container_size(u8 *data, u32 total_size) * load_microcode_amd() to save equivalent cpu table and microcode patches in * kernel heap memory. */ -static void apply_ucode_in_initrd(void *ucode, size_t size) +static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) { struct equiv_cpu_entry *eq; size_t *cont_sz; u32 *header; u8 *data, **cont; + u8 (*patch)[PATCH_MAX_SIZE]; u16 eq_id = 0; int offset, left; u32 rev, eax, ebx, ecx, edx; @@ -123,10 +124,12 @@ static void apply_ucode_in_initrd(void *ucode, size_t size) new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); cont_sz = (size_t *)__pa_nodebug(&container_size); cont = (u8 **)__pa_nodebug(&container); + patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); #else new_rev = &ucode_new_rev; cont_sz = &container_size; cont = &container; + patch = &amd_ucode_patch; #endif data = ucode; @@ -213,9 +216,9 @@ static void apply_ucode_in_initrd(void *ucode, size_t size) rev = mc->hdr.patch_id; *new_rev = rev; - /* save ucode patch */ - memcpy(amd_ucode_patch, mc, - min_t(u32, header[1], PATCH_MAX_SIZE)); + if (save_patch) + memcpy(patch, mc, + min_t(u32, header[1], PATCH_MAX_SIZE)); } } @@ -246,7 +249,7 @@ void __init load_ucode_amd_bsp(void) *data = cp.data; *size = cp.size; - apply_ucode_in_initrd(cp.data, cp.size); + apply_ucode_in_initrd(cp.data, cp.size, true); } #ifdef CONFIG_X86_32 @@ -263,7 +266,7 @@ void load_ucode_amd_ap(void) size_t *usize; void **ucode; - mc = (struct microcode_amd *)__pa(amd_ucode_patch); + mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { __apply_microcode_amd(mc); return; @@ -275,7 +278,7 @@ void load_ucode_amd_ap(void) if (!*ucode || !*usize) return; - apply_ucode_in_initrd(*ucode, *usize); + apply_ucode_in_initrd(*ucode, *usize, false); } static void __init collect_cpu_sig_on_bsp(void *arg) @@ -339,7 +342,7 @@ void load_ucode_amd_ap(void) * AP has a different equivalence ID than BSP, looks like * mixed-steppings silicon so go through the ucode blob anew. */ - apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size); + apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false); } } #endif @@ -347,7 +350,9 @@ void load_ucode_amd_ap(void) int __init save_microcode_in_initrd_amd(void) { unsigned long cont; + int retval = 0; enum ucode_state ret; + u8 *cont_va; u32 eax; if (!container) @@ -355,13 +360,15 @@ int __init save_microcode_in_initrd_amd(void) #ifdef CONFIG_X86_32 get_bsp_sig(); - cont = (unsigned long)container; + cont = (unsigned long)container; + cont_va = __va(container); #else /* * We need the physical address of the container for both bitness since * boot_params.hdr.ramdisk_image is a physical address. */ - cont = __pa(container); + cont = __pa(container); + cont_va = container; #endif /* @@ -372,6 +379,8 @@ int __init save_microcode_in_initrd_amd(void) if (relocated_ramdisk) container = (u8 *)(__va(relocated_ramdisk) + (cont - boot_params.hdr.ramdisk_image)); + else + container = cont_va; if (ucode_new_rev) pr_info("microcode: updated early to new patch_level=0x%08x\n", @@ -382,7 +391,7 @@ int __init save_microcode_in_initrd_amd(void) ret = load_microcode_amd(eax, container, container_size); if (ret != UCODE_OK) - return -EINVAL; + retval = -EINVAL; /* * This will be freed any msec now, stash patches for the current @@ -391,5 +400,5 @@ int __init save_microcode_in_initrd_amd(void) container = NULL; container_size = 0; - return 0; + return retval; } diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index dd9d6190b08..08fe6e8a726 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -465,6 +465,16 @@ static void mc_bp_resume(void) if (uci->valid && uci->mc) microcode_ops->apply_microcode(cpu); +#ifdef CONFIG_X86_64 + else if (!uci->mc) + /* + * We might resume and not have applied late microcode but still + * have a newer patch stashed from the early loader. We don't + * have it in uci->mc so we have to load it the same way we're + * applying patches early on the APs. + */ + load_ucode_ap(); +#endif } static struct syscore_ops mc_syscore_ops = { diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c index 5f28a64e71e..2c017f242a7 100644 --- a/arch/x86/kernel/cpu/microcode/core_early.c +++ b/arch/x86/kernel/cpu/microcode/core_early.c @@ -124,7 +124,7 @@ void __init load_ucode_bsp(void) static bool check_loader_disabled_ap(void) { #ifdef CONFIG_X86_32 - return __pa_nodebug(dis_ucode_ldr); + return *((bool *)__pa_nodebug(&dis_ucode_ldr)); #else return dis_ucode_ldr; #endif diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 1b8299dd3d9..143e5f5dc85 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -243,8 +243,9 @@ static bool check_hw_exists(void) msr_fail: printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); - printk(boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR - "Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new); + printk("%sFailed to access perfctr msr (MSR %x is %Lx)\n", + boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR, + reg, val_new); return false; } @@ -444,12 +445,6 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.type == PERF_TYPE_RAW) event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; - if (event->attr.sample_period && x86_pmu.limit_period) { - if (x86_pmu.limit_period(event, event->attr.sample_period) > - event->attr.sample_period) - return -EINVAL; - } - return x86_setup_perfctr(event); } @@ -987,9 +982,6 @@ int x86_perf_event_set_period(struct perf_event *event) if (left > x86_pmu.max_period) left = x86_pmu.max_period; - if (x86_pmu.limit_period) - left = x86_pmu.limit_period(event, left); - per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; /* diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index d98a34d435d..4e6cdb0ddc7 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -253,6 +253,10 @@ struct cpu_hw_events { #define INTEL_UEVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) +/* Like UEVENT_CONSTRAINT, but match flags too */ +#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) + #define INTEL_PLD_CONSTRAINT(c, n) \ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) @@ -445,7 +449,6 @@ struct x86_pmu { struct x86_pmu_quirk *quirks; int perfctr_second_write; bool late_ack; - unsigned (*limit_period)(struct perf_event *event, unsigned l); /* * sysfs attrs diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index cbb1be3ed9e..a61f5c6911d 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -565,6 +565,21 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) perf_ibs->offset_max, offset + 1); } while (offset < offset_max); + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + /* + * Read IbsBrTarget and IbsOpData4 separately + * depending on their availability. + * Can't add to offset_max as they are staggered + */ + if (ibs_caps & IBS_CAPS_BRNTRGT) { + rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++); + size++; + } + if (ibs_caps & IBS_CAPS_OPDATA4) { + rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++); + size++; + } + } ibs_data.size = sizeof(u64) * size; regs = *iregs; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index a73947c53b6..944bf019b74 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -220,15 +220,6 @@ static struct event_constraint intel_hsw_event_constraints[] = { EVENT_CONSTRAINT_END }; -static struct event_constraint intel_bdw_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_EVENT_CONSTRAINT(0xa3, 0x4), /* CYCLE_ACTIVITY.* */ - EVENT_CONSTRAINT_END -}; - static u64 intel_pmu_event_map(int hw_event) { return intel_perfmon_event_map[hw_event]; @@ -424,126 +415,6 @@ static __initconst const u64 snb_hw_cache_event_ids }; -static __initconst const u64 hsw_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ - [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ - [ C(RESULT_MISS) ] = 0x0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - /* OFFCORE_RESPONSE:ALL_DATA_RD|ALL_CODE_RD */ - [ C(RESULT_ACCESS) ] = 0x1b7, - /* OFFCORE_RESPONSE:ALL_DATA_RD|ALL_CODE_RD|SUPPLIER_NONE| - L3_MISS|ANY_SNOOP */ - [ C(RESULT_MISS) ] = 0x1b7, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE:ALL_RFO */ - /* OFFCORE_RESPONSE:ALL_RFO|SUPPLIER_NONE|L3_MISS|ANY_SNOOP */ - [ C(RESULT_MISS) ] = 0x1b7, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ - [ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ - [ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */ - [ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static __initconst const u64 hsw_hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(LL ) ] = { - [ C(OP_READ) ] = { - /* OFFCORE_RESPONSE:ALL_DATA_RD|ALL_CODE_RD */ - [ C(RESULT_ACCESS) ] = 0x2d5, - /* OFFCORE_RESPONSE:ALL_DATA_RD|ALL_CODE_RD|SUPPLIER_NONE| - L3_MISS|ANY_SNOOP */ - [ C(RESULT_MISS) ] = 0x3fbc0202d5ull, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x122, /* OFFCORE_RESPONSE:ALL_RFO */ - /* OFFCORE_RESPONSE:ALL_RFO|SUPPLIER_NONE|L3_MISS|ANY_SNOOP */ - [ C(RESULT_MISS) ] = 0x3fbc020122ull, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, -}; - static __initconst const u64 westmere_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -2034,24 +1905,6 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) return c; } -/* - * Broadwell: - * The INST_RETIRED.ALL period always needs to have lowest - * 6bits cleared (BDM57). It shall not use a period smaller - * than 100 (BDM11). We combine the two to enforce - * a min-period of 128. - */ -static unsigned bdw_limit_period(struct perf_event *event, unsigned left) -{ - if ((event->hw.config & INTEL_ARCH_EVENT_MASK) == - X86_CONFIG(.event=0xc0, .umask=0x01)) { - if (left < 128) - left = 128; - left &= ~0x3fu; - } - return left; -} - PMU_FORMAT_ATTR(event, "config:0-7" ); PMU_FORMAT_ATTR(umask, "config:8-15" ); PMU_FORMAT_ATTR(edge, "config:18" ); @@ -2692,8 +2545,8 @@ __init int intel_pmu_init(void) case 69: /* 22nm Haswell ULT */ case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ x86_pmu.late_ack = true; - memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); intel_pmu_lbr_init_snb(); @@ -2712,28 +2565,6 @@ __init int intel_pmu_init(void) pr_cont("Haswell events, "); break; - case 61: /* 14nm Broadwell Core-M */ - x86_pmu.late_ack = true; - memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); - - intel_pmu_lbr_init_snb(); - - x86_pmu.event_constraints = intel_bdw_event_constraints; - x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; - x86_pmu.extra_regs = intel_snbep_extra_regs; - x86_pmu.pebs_aliases = intel_pebs_aliases_snb; - /* all extra regs are per-cpu when HT is on */ - x86_pmu.er_flags |= ERF_HAS_RSP_1; - x86_pmu.er_flags |= ERF_NO_HT_SHARING; - - x86_pmu.hw_config = hsw_hw_config; - x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.cpu_events = hsw_events_attrs; - x86_pmu.limit_period = bdw_limit_period; - pr_cont("Broadwell events, "); - break; - default: switch (x86_pmu.version) { case 1: diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 46211bcc813..3c895d480cd 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -552,18 +552,18 @@ int intel_pmu_drain_bts_buffer(void) * PEBS */ struct event_constraint intel_core2_pebs_event_constraints[] = { - INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ - INTEL_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ - INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ - INTEL_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ - INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ EVENT_CONSTRAINT_END }; struct event_constraint intel_atom_pebs_event_constraints[] = { - INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ - INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ - INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ EVENT_CONSTRAINT_END }; @@ -577,36 +577,36 @@ struct event_constraint intel_slm_pebs_event_constraints[] = { struct event_constraint intel_nehalem_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ - INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */ INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */ - INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ - INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ EVENT_CONSTRAINT_END }; struct event_constraint intel_westmere_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ - INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ - INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ EVENT_CONSTRAINT_END }; struct event_constraint intel_snb_pebs_event_constraints[] = { - INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ @@ -617,7 +617,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = { }; struct event_constraint intel_ivb_pebs_event_constraints[] = { - INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ @@ -628,7 +628,7 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = { }; struct event_constraint intel_hsw_pebs_event_constraints[] = { - INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), @@ -724,6 +724,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) unsigned long ip = regs->ip; int is_64bit = 0; void *kaddr; + int size; /* * We don't need to fixup if the PEBS assist is fault like @@ -758,11 +759,12 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) return 1; } + size = ip - to; if (!kernel_ip(ip)) { - int size, bytes; + int bytes; u8 *buf = this_cpu_read(insn_buffer); - size = ip - to; /* Must fit our buffer, see above */ + /* 'size' must fit our buffer, see above */ bytes = copy_from_user_nmi(buf, (void __user *)to, size); if (bytes != 0) return 0; @@ -780,11 +782,20 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) #ifdef CONFIG_X86_64 is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32); #endif - insn_init(&insn, kaddr, is_64bit); + insn_init(&insn, kaddr, size, is_64bit); insn_get_length(&insn); + /* + * Make sure there was not a problem decoding the + * instruction and getting the length. This is + * doubly important because we have an infinite + * loop if insn.length=0. + */ + if (!insn.length) + break; to += insn.length; kaddr += insn.length; + size -= insn.length; } while (to < ip); if (to == ip) { @@ -886,6 +897,29 @@ static void __intel_pmu_pebs_event(struct perf_event *event, regs.bp = pebs->bp; regs.sp = pebs->sp; + if (sample_type & PERF_SAMPLE_REGS_INTR) { + regs.ax = pebs->ax; + regs.bx = pebs->bx; + regs.cx = pebs->cx; + regs.dx = pebs->dx; + regs.si = pebs->si; + regs.di = pebs->di; + regs.bp = pebs->bp; + regs.sp = pebs->sp; + + regs.flags = pebs->flags; +#ifndef CONFIG_X86_32 + regs.r8 = pebs->r8; + regs.r9 = pebs->r9; + regs.r10 = pebs->r10; + regs.r11 = pebs->r11; + regs.r12 = pebs->r12; + regs.r13 = pebs->r13; + regs.r14 = pebs->r14; + regs.r15 = pebs->r15; +#endif + } + if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) { regs.ip = pebs->real_ip; regs.flags |= PERF_EFLAGS_EXACT; diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 45fa730a528..58f1a94beaf 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -465,7 +465,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort) { struct insn insn; void *addr; - int bytes, size = MAX_INSN_SIZE; + int bytes_read, bytes_left; int ret = X86_BR_NONE; int ext, to_plm, from_plm; u8 buf[MAX_INSN_SIZE]; @@ -493,8 +493,10 @@ static int branch_type(unsigned long from, unsigned long to, int abort) return X86_BR_NONE; /* may fail if text not present */ - bytes = copy_from_user_nmi(buf, (void __user *)from, size); - if (bytes != 0) + bytes_left = copy_from_user_nmi(buf, (void __user *)from, + MAX_INSN_SIZE); + bytes_read = MAX_INSN_SIZE - bytes_left; + if (!bytes_read) return X86_BR_NONE; addr = buf; @@ -505,10 +507,19 @@ static int branch_type(unsigned long from, unsigned long to, int abort) * Ensure we don't blindy read any address by validating it is * a known text address. */ - if (kernel_text_address(from)) + if (kernel_text_address(from)) { addr = (void *)from; - else + /* + * Assume we can get the maximum possible size + * when grabbing kernel data. This is not + * _strictly_ true since we could possibly be + * executing up next to a memory hole, but + * it is very unlikely to be a problem. + */ + bytes_read = MAX_INSN_SIZE; + } else { return X86_BR_NONE; + } } /* @@ -518,8 +529,10 @@ static int branch_type(unsigned long from, unsigned long to, int abort) #ifdef CONFIG_X86_64 is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); #endif - insn_init(&insn, addr, is64); + insn_init(&insn, addr, bytes_read, is64); insn_get_opcode(&insn); + if (!insn.opcode.got) + return X86_BR_ABORT; switch (insn.opcode.bytes[0]) { case 0xf: diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index adf138eac85..745b158e9a6 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -449,7 +449,11 @@ static struct attribute *snbep_uncore_qpi_formats_attr[] = { static struct uncore_event_desc snbep_uncore_imc_events[] = { INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"), { /* end: all zeroes */ }, }; @@ -486,14 +490,17 @@ static struct attribute_group snbep_uncore_qpi_format_group = { .attrs = snbep_uncore_qpi_formats_attr, }; -#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \ - .init_box = snbep_uncore_msr_init_box, \ +#define __SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \ .disable_box = snbep_uncore_msr_disable_box, \ .enable_box = snbep_uncore_msr_enable_box, \ .disable_event = snbep_uncore_msr_disable_event, \ .enable_event = snbep_uncore_msr_enable_event, \ .read_counter = uncore_msr_read_counter +#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \ + __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), \ + .init_box = snbep_uncore_msr_init_box \ + static struct intel_uncore_ops snbep_uncore_msr_ops = { SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), }; @@ -1919,6 +1926,30 @@ static struct intel_uncore_type hswep_uncore_cbox = { .format_group = &hswep_uncore_cbox_format_group, }; +/* + * Write SBOX Initialization register bit by bit to avoid spurious #GPs + */ +static void hswep_uncore_sbox_msr_init_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + + if (msr) { + u64 init = SNBEP_PMON_BOX_CTL_INT; + u64 flags = 0; + int i; + + for_each_set_bit(i, (unsigned long *)&init, 64) { + flags |= (1ULL << i); + wrmsrl(msr, flags); + } + } +} + +static struct intel_uncore_ops hswep_uncore_sbox_msr_ops = { + __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .init_box = hswep_uncore_sbox_msr_init_box +}; + static struct attribute *hswep_uncore_sbox_formats_attr[] = { &format_attr_event.attr, &format_attr_umask.attr, @@ -1944,7 +1975,7 @@ static struct intel_uncore_type hswep_uncore_sbox = { .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL, .msr_offset = HSWEP_SBOX_MSR_OFFSET, - .ops = &snbep_uncore_msr_ops, + .ops = &hswep_uncore_sbox_msr_ops, .format_group = &hswep_uncore_sbox_format_group, }; @@ -2009,7 +2040,11 @@ static struct intel_uncore_type hswep_uncore_ha = { static struct uncore_event_desc hswep_uncore_imc_events[] = { INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x00,umask=0x00"), INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"), { /* end: all zeroes */ }, }; @@ -2025,13 +2060,27 @@ static struct intel_uncore_type hswep_uncore_imc = { SNBEP_UNCORE_PCI_COMMON_INIT(), }; +static unsigned hswep_uncore_irp_ctrs[] = {0xa0, 0xa8, 0xb0, 0xb8}; + +static u64 hswep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count = 0; + + pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx], (u32 *)&count); + pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1); + + return count; +} + static struct intel_uncore_ops hswep_uncore_irp_ops = { .init_box = snbep_uncore_pci_init_box, .disable_box = snbep_uncore_pci_disable_box, .enable_box = snbep_uncore_pci_enable_box, .disable_event = ivbep_uncore_irp_disable_event, .enable_event = ivbep_uncore_irp_enable_event, - .read_counter = ivbep_uncore_irp_read_counter, + .read_counter = hswep_uncore_irp_read_counter, }; static struct intel_uncore_type hswep_uncore_irp = { diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 5433658e598..e7d8c760847 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -72,7 +72,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) if (c->x86_mask || c->cpuid_level >= 0) seq_printf(m, "stepping\t: %d\n", c->x86_mask); else - seq_printf(m, "stepping\t: unknown\n"); + seq_puts(m, "stepping\t: unknown\n"); if (c->microcode) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); @@ -92,12 +92,12 @@ static int show_cpuinfo(struct seq_file *m, void *v) show_cpuinfo_core(m, c, cpu); show_cpuinfo_misc(m, c); - seq_printf(m, "flags\t\t:"); + seq_puts(m, "flags\t\t:"); for (i = 0; i < 32*NCAPINTS; i++) if (cpu_has(c, i) && x86_cap_flags[i] != NULL) seq_printf(m, " %s", x86_cap_flags[i]); - seq_printf(m, "\nbugs\t\t:"); + seq_puts(m, "\nbugs\t\t:"); for (i = 0; i < 32*NBUGINTS; i++) { unsigned int bug_bit = 32*NCAPINTS + i; @@ -118,7 +118,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", c->x86_phys_bits, c->x86_virt_bits); - seq_printf(m, "power management:"); + seq_puts(m, "power management:"); for (i = 0; i < 32; i++) { if (c->x86_power & (1 << i)) { if (i < ARRAY_SIZE(x86_power_flags) && @@ -131,7 +131,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) } } - seq_printf(m, "\n\n"); + seq_puts(m, "\n\n"); return 0; } diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 3225ae6c518..83741a71558 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -143,7 +143,7 @@ static int cpuid_device_create(int cpu) dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, "cpu%d", cpu); - return IS_ERR(dev) ? PTR_ERR(dev) : 0; + return PTR_ERR_OR_ZERO(dev); } static void cpuid_device_destroy(int cpu) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 1abcb50b48a..ff86f19b575 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -24,7 +24,6 @@ static char x86_stack_ids[][8] = { [ DEBUG_STACK-1 ] = "#DB", [ NMI_STACK-1 ] = "NMI", [ DOUBLEFAULT_STACK-1 ] = "#DF", - [ STACKFAULT_STACK-1 ] = "#SS", [ MCE_STACK-1 ] = "#MC", #if DEBUG_STKSZ > EXCEPTION_STKSZ [ N_EXCEPTION_STACKS ... diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index b553ed89e5f..344b63f18d1 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -447,15 +447,14 @@ sysenter_exit: sysenter_audit: testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) jnz syscall_trace_entry - addl $4,%esp - CFI_ADJUST_CFA_OFFSET -4 - movl %esi,4(%esp) /* 5th arg: 4th syscall arg */ - movl %edx,(%esp) /* 4th arg: 3rd syscall arg */ - /* %ecx already in %ecx 3rd arg: 2nd syscall arg */ - movl %ebx,%edx /* 2nd arg: 1st syscall arg */ - /* %eax already in %eax 1st arg: syscall number */ + /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */ + movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */ + /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */ + pushl_cfi PT_ESI(%esp) /* a3: 5th arg */ + pushl_cfi PT_EDX+4(%esp) /* a2: 4th arg */ call __audit_syscall_entry - pushl_cfi %ebx + popl_cfi %ecx /* get that remapped edx off the stack */ + popl_cfi %ecx /* get that remapped esi off the stack */ movl PT_EAX(%esp),%eax /* reload syscall number */ jmp sysenter_do_call diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index df088bb03fb..c0226ab5410 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -828,9 +828,15 @@ ENTRY(native_iret) jnz native_irq_return_ldt #endif +.global native_irq_return_iret native_irq_return_iret: + /* + * This may fault. Non-paranoid faults on return to userspace are + * handled by fixup_bad_iret. These include #SS, #GP, and #NP. + * Double-faults due to espfix64 are handled in do_double_fault. + * Other faults here are fatal. + */ iretq - _ASM_EXTABLE(native_irq_return_iret, bad_iret) #ifdef CONFIG_X86_ESPFIX64 native_irq_return_ldt: @@ -858,25 +864,6 @@ native_irq_return_ldt: jmp native_irq_return_iret #endif - .section .fixup,"ax" -bad_iret: - /* - * The iret traps when the %cs or %ss being restored is bogus. - * We've lost the original trap vector and error code. - * #GPF is the most likely one to get for an invalid selector. - * So pretend we completed the iret and took the #GPF in user mode. - * - * We are now running with the kernel GS after exception recovery. - * But error_entry expects us to have user GS to match the user %cs, - * so swap back. - */ - pushq $0 - - SWAPGS - jmp general_protection - - .previous - /* edi: workmask, edx: work */ retint_careful: CFI_RESTORE_STATE @@ -922,37 +909,6 @@ ENTRY(retint_kernel) CFI_ENDPROC END(common_interrupt) - /* - * If IRET takes a fault on the espfix stack, then we - * end up promoting it to a doublefault. In that case, - * modify the stack to make it look like we just entered - * the #GP handler from user space, similar to bad_iret. - */ -#ifdef CONFIG_X86_ESPFIX64 - ALIGN -__do_double_fault: - XCPT_FRAME 1 RDI+8 - movq RSP(%rdi),%rax /* Trap on the espfix stack? */ - sarq $PGDIR_SHIFT,%rax - cmpl $ESPFIX_PGD_ENTRY,%eax - jne do_double_fault /* No, just deliver the fault */ - cmpl $__KERNEL_CS,CS(%rdi) - jne do_double_fault - movq RIP(%rdi),%rax - cmpq $native_irq_return_iret,%rax - jne do_double_fault /* This shouldn't happen... */ - movq PER_CPU_VAR(kernel_stack),%rax - subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */ - movq %rax,RSP(%rdi) - movq $0,(%rax) /* Missing (lost) #GP error code */ - movq $general_protection,RIP(%rdi) - retq - CFI_ENDPROC -END(__do_double_fault) -#else -# define __do_double_fault do_double_fault -#endif - /* * APIC interrupts. */ @@ -1124,7 +1080,7 @@ idtentry overflow do_overflow has_error_code=0 idtentry bounds do_bounds has_error_code=0 idtentry invalid_op do_invalid_op has_error_code=0 idtentry device_not_available do_device_not_available has_error_code=0 -idtentry double_fault __do_double_fault has_error_code=1 paranoid=1 +idtentry double_fault do_double_fault has_error_code=1 paranoid=1 idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 idtentry invalid_TSS do_invalid_TSS has_error_code=1 idtentry segment_not_present do_segment_not_present has_error_code=1 @@ -1289,7 +1245,7 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK -idtentry stack_segment do_stack_segment has_error_code=1 paranoid=1 +idtentry stack_segment do_stack_segment has_error_code=1 #ifdef CONFIG_XEN idtentry xen_debug do_debug has_error_code=0 idtentry xen_int3 do_int3 has_error_code=0 @@ -1399,17 +1355,16 @@ error_sti: /* * There are two places in the kernel that can potentially fault with - * usergs. Handle them here. The exception handlers after iret run with - * kernel gs again, so don't set the user space flag. B stepping K8s - * sometimes report an truncated RIP for IRET exceptions returning to - * compat mode. Check for these here too. + * usergs. Handle them here. B stepping K8s sometimes report a + * truncated RIP for IRET exceptions returning to compat mode. Check + * for these here too. */ error_kernelspace: CFI_REL_OFFSET rcx, RCX+8 incl %ebx leaq native_irq_return_iret(%rip),%rcx cmpq %rcx,RIP+8(%rsp) - je error_swapgs + je error_bad_iret movl %ecx,%eax /* zero extend */ cmpq %rax,RIP+8(%rsp) je bstep_iret @@ -1420,7 +1375,15 @@ error_kernelspace: bstep_iret: /* Fix truncated RIP */ movq %rcx,RIP+8(%rsp) - jmp error_swapgs + /* fall through */ + +error_bad_iret: + SWAPGS + mov %rsp,%rdi + call fixup_bad_iret + mov %rax,%rsp + decl %ebx /* Return to usergs */ + jmp error_sti CFI_ENDPROC END(error_entry) diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 94d857fb103..f5d0730e7b0 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -122,9 +122,6 @@ static void init_espfix_random(void) void __init init_espfix_bsp(void) { pgd_t *pgd_p; - pteval_t ptemask; - - ptemask = __supported_pte_mask; /* Install the espfix pud into the kernel page directory */ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 8af817105e2..e7cc5370cd2 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -111,8 +111,7 @@ static void make_8259A_irq(unsigned int irq) { disable_irq_nosync(irq); io_apic_irqs &= ~(1<<irq); - irq_set_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, - i8259A_chip.name); + irq_set_chip_and_handler(irq, &i8259A_chip, handle_level_irq); enable_irq(irq); } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 922d2858102..6307a0f0cf1 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -59,78 +59,78 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%*s: ", prec, "NMI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); - seq_printf(p, " Non-maskable interrupts\n"); + seq_puts(p, " Non-maskable interrupts\n"); #ifdef CONFIG_X86_LOCAL_APIC seq_printf(p, "%*s: ", prec, "LOC"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); - seq_printf(p, " Local timer interrupts\n"); + seq_puts(p, " Local timer interrupts\n"); seq_printf(p, "%*s: ", prec, "SPU"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); - seq_printf(p, " Spurious interrupts\n"); + seq_puts(p, " Spurious interrupts\n"); seq_printf(p, "%*s: ", prec, "PMI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); - seq_printf(p, " Performance monitoring interrupts\n"); + seq_puts(p, " Performance monitoring interrupts\n"); seq_printf(p, "%*s: ", prec, "IWI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); - seq_printf(p, " IRQ work interrupts\n"); + seq_puts(p, " IRQ work interrupts\n"); seq_printf(p, "%*s: ", prec, "RTR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count); - seq_printf(p, " APIC ICR read retries\n"); + seq_puts(p, " APIC ICR read retries\n"); #endif if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis); - seq_printf(p, " Platform interrupts\n"); + seq_puts(p, " Platform interrupts\n"); } #ifdef CONFIG_SMP seq_printf(p, "%*s: ", prec, "RES"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); - seq_printf(p, " Rescheduling interrupts\n"); + seq_puts(p, " Rescheduling interrupts\n"); seq_printf(p, "%*s: ", prec, "CAL"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_call_count - irq_stats(j)->irq_tlb_count); - seq_printf(p, " Function call interrupts\n"); + seq_puts(p, " Function call interrupts\n"); seq_printf(p, "%*s: ", prec, "TLB"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); - seq_printf(p, " TLB shootdowns\n"); + seq_puts(p, " TLB shootdowns\n"); #endif #ifdef CONFIG_X86_THERMAL_VECTOR seq_printf(p, "%*s: ", prec, "TRM"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); - seq_printf(p, " Thermal event interrupts\n"); + seq_puts(p, " Thermal event interrupts\n"); #endif #ifdef CONFIG_X86_MCE_THRESHOLD seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); - seq_printf(p, " Threshold APIC interrupts\n"); + seq_puts(p, " Threshold APIC interrupts\n"); #endif #ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); - seq_printf(p, " Machine check exceptions\n"); + seq_puts(p, " Machine check exceptions\n"); seq_printf(p, "%*s: ", prec, "MCP"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); - seq_printf(p, " Machine check polls\n"); + seq_puts(p, " Machine check polls\n"); #endif #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count); - seq_printf(p, " Hypervisor callback interrupts\n"); + seq_puts(p, " Hypervisor callback interrupts\n"); #endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 44f1ed42fdf..4de73ee7836 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -70,7 +70,6 @@ int vector_used_by_percpu_irq(unsigned int vector) void __init init_ISA_irqs(void) { struct irq_chip *chip = legacy_pic->chip; - const char *name = chip->name; int i; #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) @@ -79,7 +78,7 @@ void __init init_ISA_irqs(void) legacy_pic->init(0); for (i = 0; i < nr_legacy_irqs(); i++) - irq_set_chip_and_handler_name(i, chip, handle_level_irq, name); + irq_set_chip_and_handler(i, chip, handle_level_irq); } void __init init_IRQ(void) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 67e6d19ef1b..f7e3cd50ece 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -285,7 +285,7 @@ static int can_probe(unsigned long paddr) * normally used, we just go through if there is no kprobe. */ __addr = recover_probed_instruction(buf, addr); - kernel_insn_init(&insn, (void *)__addr); + kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE); insn_get_length(&insn); /* @@ -330,8 +330,10 @@ int __copy_instruction(u8 *dest, u8 *src) { struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; + unsigned long recovered_insn = + recover_probed_instruction(buf, (unsigned long)src); - kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src)); + kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); insn_get_length(&insn); /* Another subsystem puts a breakpoint, failed to recover */ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) @@ -342,7 +344,7 @@ int __copy_instruction(u8 *dest, u8 *src) if (insn_rip_relative(&insn)) { s64 newdisp; u8 *disp; - kernel_insn_init(&insn, dest); + kernel_insn_init(&insn, dest, insn.length); insn_get_displacement(&insn); /* * The copied instruction uses the %rip-relative addressing diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index f1314d0bcf0..7c523bbf3dc 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -251,13 +251,15 @@ static int can_optimize(unsigned long paddr) /* Decode instructions */ addr = paddr - offset; while (addr < paddr - offset + size) { /* Decode until function end */ + unsigned long recovered_insn; if (search_exception_tables(addr)) /* * Since some fixup code will jumps into this function, * we can't optimize kprobe in this function. */ return 0; - kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr)); + recovered_insn = recover_probed_instruction(buf, addr); + kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); insn_get_length(&insn); /* Another subsystem puts a breakpoint */ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index c9603ac80de..113e7078485 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -22,6 +22,8 @@ * an SMP box will direct the access to CPU %d. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/types.h> @@ -50,11 +52,11 @@ static loff_t msr_seek(struct file *file, loff_t offset, int orig) mutex_lock(&inode->i_mutex); switch (orig) { - case 0: + case SEEK_SET: file->f_pos = offset; ret = file->f_pos; break; - case 1: + case SEEK_CUR: file->f_pos += offset; ret = file->f_pos; break; @@ -206,7 +208,7 @@ static int msr_device_create(int cpu) dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL, "msr%d", cpu); - return IS_ERR(dev) ? PTR_ERR(dev) : 0; + return PTR_ERR_OR_ZERO(dev); } static void msr_device_destroy(int cpu) @@ -248,8 +250,7 @@ static int __init msr_init(void) i = 0; if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) { - printk(KERN_ERR "msr: unable to get major %d for msr\n", - MSR_MAJOR); + pr_err("unable to get major %d for msr\n", MSR_MAJOR); err = -EBUSY; goto out; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 749b0e42341..e510618b2e9 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1484,7 +1484,7 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) */ if (work & _TIF_NOHZ) { user_exit(); - work &= ~TIF_NOHZ; + work &= ~_TIF_NOHZ; } #ifdef CONFIG_SECCOMP diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 235cfd39e0d..214245d6b99 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -960,6 +960,8 @@ void __init setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) _edata; init_mm.brk = _brk_end; + mpx_mm_init(&init_mm); + code_resource.start = __pa_symbol(_text); code_resource.end = __pa_symbol(_etext)-1; data_resource.start = __pa_symbol(_etext); @@ -1128,7 +1130,6 @@ void __init setup_arch(char **cmdline_p) setup_real_mode(); memblock_set_current_limit(get_max_mapped()); - dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT); /* * NOTE: On x86-32, only from this point on, fixmaps are ready for use. @@ -1159,6 +1160,7 @@ void __init setup_arch(char **cmdline_p) early_acpi_boot_init(); initmem_init(); + dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT); /* * Reserve memory for crash kernel after SRAT is parsed so that it diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 5cdff035774..e4fcb87ba7a 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -30,7 +30,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number); #define BOOT_PERCPU_OFFSET 0 #endif -DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; EXPORT_PER_CPU_SYMBOL(this_cpu_off); unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2d5200e5635..7a8f5845e8e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -99,11 +99,9 @@ EXPORT_PER_CPU_SYMBOL(cpu_core_map); DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); /* Per CPU bogomips and other parameters */ -DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); +DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); -static DEFINE_PER_CPU(struct completion, die_complete); - atomic_t init_deasserted; /* @@ -1305,10 +1303,14 @@ static void __ref remove_cpu_from_maps(int cpu) numa_remove_cpu(cpu); } +static DEFINE_PER_CPU(struct completion, die_complete); + void cpu_disable_common(void) { int cpu = smp_processor_id(); + init_completion(&per_cpu(die_complete, smp_processor_id())); + remove_siblinginfo(cpu); /* It's now safe to remove this processor from the online map */ @@ -1327,16 +1329,21 @@ int native_cpu_disable(void) return ret; clear_local_APIC(); - init_completion(&per_cpu(die_complete, smp_processor_id())); cpu_disable_common(); return 0; } +void cpu_die_common(unsigned int cpu) +{ + wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ); +} + void native_cpu_die(unsigned int cpu) { /* We don't do anything here: idle task is faking death itself. */ - wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ); + + cpu_die_common(cpu); /* They ack this in play_dead() by setting CPU_DEAD */ if (per_cpu(cpu_state, cpu) == CPU_DEAD) { diff --git a/arch/x86/kernel/sysfb.c b/arch/x86/kernel/sysfb.c index 193ec2ce46c..160386e9fc1 100644 --- a/arch/x86/kernel/sysfb.c +++ b/arch/x86/kernel/sysfb.c @@ -67,7 +67,7 @@ static __init int sysfb_init(void) pd = platform_device_register_resndata(NULL, name, 0, NULL, 0, si, sizeof(*si)); - return IS_ERR(pd) ? PTR_ERR(pd) : 0; + return PTR_ERR_OR_ZERO(pd); } /* must execute after PCI subsystem for EFI quirks */ diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c index 86179d40989..764a29f84de 100644 --- a/arch/x86/kernel/sysfb_simplefb.c +++ b/arch/x86/kernel/sysfb_simplefb.c @@ -88,8 +88,5 @@ __init int create_simplefb(const struct screen_info *si, pd = platform_device_register_resndata(NULL, "simple-framebuffer", 0, &res, 1, mode, sizeof(*mode)); - if (IS_ERR(pd)) - return PTR_ERR(pd); - - return 0; + return PTR_ERR_OR_ZERO(pd); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 0d0e922fafc..a9ae2057989 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -60,6 +60,7 @@ #include <asm/fixmap.h> #include <asm/mach_traps.h> #include <asm/alternative.h> +#include <asm/mpx.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> @@ -228,37 +229,44 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ DO_ERROR(X86_TRAP_DE, SIGFPE, "divide error", divide_error) DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) -DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op) DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",coprocessor_segment_overrun) DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) -#ifdef CONFIG_X86_32 DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) -#endif DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) #ifdef CONFIG_X86_64 /* Runs on IST stack */ -dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) -{ - enum ctx_state prev_state; - - prev_state = exception_enter(); - if (notify_die(DIE_TRAP, "stack segment", regs, error_code, - X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) { - preempt_conditional_sti(regs); - do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); - } - exception_exit(prev_state); -} - dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) { static const char str[] = "double fault"; struct task_struct *tsk = current; +#ifdef CONFIG_X86_ESPFIX64 + extern unsigned char native_irq_return_iret[]; + + /* + * If IRET takes a non-IST fault on the espfix64 stack, then we + * end up promoting it to a doublefault. In that case, modify + * the stack to make it look like we just entered the #GP + * handler from user space, similar to bad_iret. + */ + if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY && + regs->cs == __KERNEL_CS && + regs->ip == (unsigned long)native_irq_return_iret) + { + struct pt_regs *normal_regs = task_pt_regs(current); + + /* Fake a #GP(0) from userspace. */ + memmove(&normal_regs->ip, (void *)regs->sp, 5*8); + normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ + regs->ip = (unsigned long)general_protection; + regs->sp = (unsigned long)&normal_regs->orig_ax; + return; + } +#endif + exception_enter(); /* Return not checked because double check cannot be ignored */ notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); @@ -278,6 +286,89 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) } #endif +dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) +{ + struct task_struct *tsk = current; + struct xsave_struct *xsave_buf; + enum ctx_state prev_state; + struct bndcsr *bndcsr; + siginfo_t *info; + + prev_state = exception_enter(); + if (notify_die(DIE_TRAP, "bounds", regs, error_code, + X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) + goto exit; + conditional_sti(regs); + + if (!user_mode(regs)) + die("bounds", regs, error_code); + + if (!cpu_feature_enabled(X86_FEATURE_MPX)) { + /* The exception is not from Intel MPX */ + goto exit_trap; + } + + /* + * We need to look at BNDSTATUS to resolve this exception. + * It is not directly accessible, though, so we need to + * do an xsave and then pull it out of the xsave buffer. + */ + fpu_save_init(&tsk->thread.fpu); + xsave_buf = &(tsk->thread.fpu.state->xsave); + bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR); + if (!bndcsr) + goto exit_trap; + + /* + * The error code field of the BNDSTATUS register communicates status + * information of a bound range exception #BR or operation involving + * bound directory. + */ + switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) { + case 2: /* Bound directory has invalid entry. */ + if (mpx_handle_bd_fault(xsave_buf)) + goto exit_trap; + break; /* Success, it was handled */ + case 1: /* Bound violation. */ + info = mpx_generate_siginfo(regs, xsave_buf); + if (PTR_ERR(info)) { + /* + * We failed to decode the MPX instruction. Act as if + * the exception was not caused by MPX. + */ + goto exit_trap; + } + /* + * Success, we decoded the instruction and retrieved + * an 'info' containing the address being accessed + * which caused the exception. This information + * allows and application to possibly handle the + * #BR exception itself. + */ + do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, info); + kfree(info); + break; + case 0: /* No exception caused by Intel MPX operations. */ + goto exit_trap; + default: + die("bounds", regs, error_code); + } + +exit: + exception_exit(prev_state); + return; +exit_trap: + /* + * This path out is for all the cases where we could not + * handle the exception in some way (like allocating a + * table or telling userspace about it. We will also end + * up here if the kernel has MPX turned off at compile + * time.. + */ + do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL); + exception_exit(prev_state); +} + dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) { @@ -379,7 +470,7 @@ NOKPROBE_SYMBOL(do_int3); * for scheduling or signal handling. The actual stack switch is done in * entry.S */ -asmlinkage __visible struct pt_regs *sync_regs(struct pt_regs *eregs) +asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = eregs; /* Did already sync */ @@ -399,6 +490,36 @@ asmlinkage __visible struct pt_regs *sync_regs(struct pt_regs *eregs) return regs; } NOKPROBE_SYMBOL(sync_regs); + +struct bad_iret_stack { + void *error_entry_ret; + struct pt_regs regs; +}; + +asmlinkage __visible notrace +struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) +{ + /* + * This is called from entry_64.S early in handling a fault + * caused by a bad iret to user mode. To handle the fault + * correctly, we want move our stack frame to task_pt_regs + * and we want to pretend that the exception came from the + * iret target. + */ + struct bad_iret_stack *new_stack = + container_of(task_pt_regs(current), + struct bad_iret_stack, regs); + + /* Copy the IRET target to the new stack. */ + memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); + + /* Copy the remainder of the stack from the current stack. */ + memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); + + BUG_ON(!user_mode_vm(&new_stack->regs)); + return new_stack; +} +NOKPROBE_SYMBOL(fixup_bad_iret); #endif /* @@ -778,7 +899,7 @@ void __init trap_init(void) set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun); set_intr_gate(X86_TRAP_TS, invalid_TSS); set_intr_gate(X86_TRAP_NP, segment_not_present); - set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK); + set_intr_gate(X86_TRAP_SS, stack_segment); set_intr_gate(X86_TRAP_GP, general_protection); set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug); set_intr_gate(X86_TRAP_MF, coprocessor_error); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index b6025f9e36c..b7e50bba3bb 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1166,14 +1166,17 @@ void __init tsc_init(void) x86_init.timers.tsc_pre_init(); - if (!cpu_has_tsc) + if (!cpu_has_tsc) { + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; + } tsc_khz = x86_platform.calibrate_tsc(); cpu_khz = tsc_khz; if (!tsc_khz) { mark_tsc_unstable("could not calculate TSC khz"); + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; } diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 5d1cbfe4ae5..8b96a947021 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -219,7 +219,7 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool { u32 volatile *good_insns; - insn_init(insn, auprobe->insn, x86_64); + insn_init(insn, auprobe->insn, sizeof(auprobe->insn), x86_64); /* has the side-effect of processing the entire instruction */ insn_get_length(insn); if (WARN_ON_ONCE(!insn_complete(insn))) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 49edf2dd361..00bf300fd84 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -186,6 +186,8 @@ SECTIONS * start another segment - init. */ PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu) + ASSERT(SIZEOF(.data..percpu) < CONFIG_PHYSICAL_START, + "per-CPU data too large - increase CONFIG_PHYSICAL_START") #endif INIT_TEXT_SECTION(PAGE_SIZE) diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index e48b674639c..234b0722de5 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -116,8 +116,6 @@ struct x86_msi_ops x86_msi = { .teardown_msi_irqs = default_teardown_msi_irqs, .restore_msi_irqs = default_restore_msi_irqs, .setup_hpet_msi = default_setup_hpet_msi, - .msi_mask_irq = default_msi_mask_irq, - .msix_mask_irq = default_msix_mask_irq, }; /* MSI arch specific hooks */ @@ -140,14 +138,6 @@ void arch_restore_msi_irqs(struct pci_dev *dev) { x86_msi.restore_msi_irqs(dev); } -u32 arch_msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) -{ - return x86_msi.msi_mask_irq(desc, mask, flag); -} -u32 arch_msix_mask_irq(struct msi_desc *desc, u32 flag) -{ - return x86_msi.msix_mask_irq(desc, flag); -} #endif struct x86_io_apic_ops x86_io_apic_ops = { diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a46207a0583..9f8a2faf504 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -504,11 +504,6 @@ static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc); } -static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) -{ - register_address_increment(ctxt, &ctxt->_eip, rel); -} - static u32 desc_limit_scaled(struct desc_struct *desc) { u32 limit = get_desc_limit(desc); @@ -569,6 +564,40 @@ static int emulate_nm(struct x86_emulate_ctxt *ctxt) return emulate_exception(ctxt, NM_VECTOR, 0, false); } +static inline int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst, + int cs_l) +{ + switch (ctxt->op_bytes) { + case 2: + ctxt->_eip = (u16)dst; + break; + case 4: + ctxt->_eip = (u32)dst; + break; +#ifdef CONFIG_X86_64 + case 8: + if ((cs_l && is_noncanonical_address(dst)) || + (!cs_l && (dst >> 32) != 0)) + return emulate_gp(ctxt, 0); + ctxt->_eip = dst; + break; +#endif + default: + WARN(1, "unsupported eip assignment size\n"); + } + return X86EMUL_CONTINUE; +} + +static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst) +{ + return assign_eip_far(ctxt, dst, ctxt->mode == X86EMUL_MODE_PROT64); +} + +static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) +{ + return assign_eip_near(ctxt, ctxt->_eip + rel); +} + static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg) { u16 selector; @@ -614,7 +643,8 @@ static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size) static int __linearize(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, - unsigned size, bool write, bool fetch, + unsigned *max_size, unsigned size, + bool write, bool fetch, ulong *linear) { struct desc_struct desc; @@ -625,10 +655,15 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, unsigned cpl; la = seg_base(ctxt, addr.seg) + addr.ea; + *max_size = 0; switch (ctxt->mode) { case X86EMUL_MODE_PROT64: if (((signed long)la << 16) >> 16 != la) return emulate_gp(ctxt, 0); + + *max_size = min_t(u64, ~0u, (1ull << 48) - la); + if (size > *max_size) + goto bad; break; default: usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL, @@ -646,20 +681,25 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, if ((ctxt->mode == X86EMUL_MODE_REAL) && !fetch && (ctxt->d & NoBigReal)) { /* la is between zero and 0xffff */ - if (la > 0xffff || (u32)(la + size - 1) > 0xffff) + if (la > 0xffff) goto bad; + *max_size = 0x10000 - la; } else if ((desc.type & 8) || !(desc.type & 4)) { /* expand-up segment */ - if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) + if (addr.ea > lim) goto bad; + *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea); } else { /* expand-down segment */ - if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim) + if (addr.ea <= lim) goto bad; lim = desc.d ? 0xffffffff : 0xffff; - if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) + if (addr.ea > lim) goto bad; + *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea); } + if (size > *max_size) + goto bad; cpl = ctxt->ops->cpl(ctxt); if (!(desc.type & 8)) { /* data segment */ @@ -684,9 +724,9 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; bad: if (addr.seg == VCPU_SREG_SS) - return emulate_ss(ctxt, sel); + return emulate_ss(ctxt, 0); else - return emulate_gp(ctxt, sel); + return emulate_gp(ctxt, 0); } static int linearize(struct x86_emulate_ctxt *ctxt, @@ -694,7 +734,8 @@ static int linearize(struct x86_emulate_ctxt *ctxt, unsigned size, bool write, ulong *linear) { - return __linearize(ctxt, addr, size, write, false, linear); + unsigned max_size; + return __linearize(ctxt, addr, &max_size, size, write, false, linear); } @@ -719,17 +760,27 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) { int rc; - unsigned size; + unsigned size, max_size; unsigned long linear; int cur_size = ctxt->fetch.end - ctxt->fetch.data; struct segmented_address addr = { .seg = VCPU_SREG_CS, .ea = ctxt->eip + cur_size }; - size = 15UL ^ cur_size; - rc = __linearize(ctxt, addr, size, false, true, &linear); + /* + * We do not know exactly how many bytes will be needed, and + * __linearize is expensive, so fetch as much as possible. We + * just have to avoid going beyond the 15 byte limit, the end + * of the segment, or the end of the page. + * + * __linearize is called with size 0 so that it does not do any + * boundary check itself. Instead, we use max_size to check + * against op_size. + */ + rc = __linearize(ctxt, addr, &max_size, 0, false, true, &linear); if (unlikely(rc != X86EMUL_CONTINUE)) return rc; + size = min_t(unsigned, 15UL ^ cur_size, max_size); size = min_t(unsigned, size, PAGE_SIZE - offset_in_page(linear)); /* @@ -739,7 +790,8 @@ static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) * still, we must have hit the 15-byte boundary. */ if (unlikely(size < op_size)) - return X86EMUL_UNHANDLEABLE; + return emulate_gp(ctxt, 0); + rc = ctxt->ops->fetch(ctxt, linear, ctxt->fetch.end, size, &ctxt->exception); if (unlikely(rc != X86EMUL_CONTINUE)) @@ -751,8 +803,10 @@ static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, unsigned size) { - if (unlikely(ctxt->fetch.end - ctxt->fetch.ptr < size)) - return __do_insn_fetch_bytes(ctxt, size); + unsigned done_size = ctxt->fetch.end - ctxt->fetch.ptr; + + if (unlikely(done_size < size)) + return __do_insn_fetch_bytes(ctxt, size - done_size); else return X86EMUL_CONTINUE; } @@ -1416,7 +1470,9 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, /* Does not support long mode */ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, - u16 selector, int seg, u8 cpl, bool in_task_switch) + u16 selector, int seg, u8 cpl, + bool in_task_switch, + struct desc_struct *desc) { struct desc_struct seg_desc, old_desc; u8 dpl, rpl; @@ -1557,6 +1613,8 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, } load: ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg); + if (desc) + *desc = seg_desc; return X86EMUL_CONTINUE; exception: return emulate_exception(ctxt, err_vec, err_code, true); @@ -1566,7 +1624,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg) { u8 cpl = ctxt->ops->cpl(ctxt); - return __load_segment_descriptor(ctxt, selector, seg, cpl, false); + return __load_segment_descriptor(ctxt, selector, seg, cpl, false, NULL); } static void write_register_operand(struct operand *op) @@ -1960,17 +2018,31 @@ static int em_iret(struct x86_emulate_ctxt *ctxt) static int em_jmp_far(struct x86_emulate_ctxt *ctxt) { int rc; - unsigned short sel; + unsigned short sel, old_sel; + struct desc_struct old_desc, new_desc; + const struct x86_emulate_ops *ops = ctxt->ops; + u8 cpl = ctxt->ops->cpl(ctxt); + + /* Assignment of RIP may only fail in 64-bit mode */ + if (ctxt->mode == X86EMUL_MODE_PROT64) + ops->get_segment(ctxt, &old_sel, &old_desc, NULL, + VCPU_SREG_CS); memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); - rc = load_segment_descriptor(ctxt, sel, VCPU_SREG_CS); + rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, + &new_desc); if (rc != X86EMUL_CONTINUE) return rc; - ctxt->_eip = 0; - memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes); - return X86EMUL_CONTINUE; + rc = assign_eip_far(ctxt, ctxt->src.val, new_desc.l); + if (rc != X86EMUL_CONTINUE) { + WARN_ON(ctxt->mode != X86EMUL_MODE_PROT64); + /* assigning eip failed; restore the old cs */ + ops->set_segment(ctxt, old_sel, &old_desc, 0, VCPU_SREG_CS); + return rc; + } + return rc; } static int em_grp45(struct x86_emulate_ctxt *ctxt) @@ -1981,13 +2053,15 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt) case 2: /* call near abs */ { long int old_eip; old_eip = ctxt->_eip; - ctxt->_eip = ctxt->src.val; + rc = assign_eip_near(ctxt, ctxt->src.val); + if (rc != X86EMUL_CONTINUE) + break; ctxt->src.val = old_eip; rc = em_push(ctxt); break; } case 4: /* jmp abs */ - ctxt->_eip = ctxt->src.val; + rc = assign_eip_near(ctxt, ctxt->src.val); break; case 5: /* jmp far */ rc = em_jmp_far(ctxt); @@ -2022,30 +2096,47 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt) static int em_ret(struct x86_emulate_ctxt *ctxt) { - ctxt->dst.type = OP_REG; - ctxt->dst.addr.reg = &ctxt->_eip; - ctxt->dst.bytes = ctxt->op_bytes; - return em_pop(ctxt); + int rc; + unsigned long eip; + + rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + + return assign_eip_near(ctxt, eip); } static int em_ret_far(struct x86_emulate_ctxt *ctxt) { int rc; - unsigned long cs; + unsigned long eip, cs; + u16 old_cs; int cpl = ctxt->ops->cpl(ctxt); + struct desc_struct old_desc, new_desc; + const struct x86_emulate_ops *ops = ctxt->ops; - rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes); + if (ctxt->mode == X86EMUL_MODE_PROT64) + ops->get_segment(ctxt, &old_cs, &old_desc, NULL, + VCPU_SREG_CS); + + rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; - if (ctxt->op_bytes == 4) - ctxt->_eip = (u32)ctxt->_eip; rc = emulate_pop(ctxt, &cs, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; /* Outer-privilege level return is not implemented */ if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl) return X86EMUL_UNHANDLEABLE; - rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS); + rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, 0, false, + &new_desc); + if (rc != X86EMUL_CONTINUE) + return rc; + rc = assign_eip_far(ctxt, eip, new_desc.l); + if (rc != X86EMUL_CONTINUE) { + WARN_ON(ctxt->mode != X86EMUL_MODE_PROT64); + ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS); + } return rc; } @@ -2306,7 +2397,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) { const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct cs, ss; - u64 msr_data; + u64 msr_data, rcx, rdx; int usermode; u16 cs_sel = 0, ss_sel = 0; @@ -2322,6 +2413,9 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) else usermode = X86EMUL_MODE_PROT32; + rcx = reg_read(ctxt, VCPU_REGS_RCX); + rdx = reg_read(ctxt, VCPU_REGS_RDX); + cs.dpl = 3; ss.dpl = 3; ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); @@ -2339,6 +2433,9 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ss_sel = cs_sel + 8; cs.d = 0; cs.l = 1; + if (is_noncanonical_address(rcx) || + is_noncanonical_address(rdx)) + return emulate_gp(ctxt, 0); break; } cs_sel |= SELECTOR_RPL_MASK; @@ -2347,8 +2444,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); - ctxt->_eip = reg_read(ctxt, VCPU_REGS_RDX); - *reg_write(ctxt, VCPU_REGS_RSP) = reg_read(ctxt, VCPU_REGS_RCX); + ctxt->_eip = rdx; + *reg_write(ctxt, VCPU_REGS_RSP) = rcx; return X86EMUL_CONTINUE; } @@ -2466,19 +2563,24 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, * Now load segment descriptors. If fault happens at this stage * it is handled in a context of new task */ - ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; @@ -2603,25 +2705,32 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, * Now load segment descriptors. If fault happenes at this stage * it is handled in a context of new task */ - ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, + cpl, true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; @@ -2888,10 +2997,13 @@ static int em_aad(struct x86_emulate_ctxt *ctxt) static int em_call(struct x86_emulate_ctxt *ctxt) { + int rc; long rel = ctxt->src.val; ctxt->src.val = (unsigned long)ctxt->_eip; - jmp_rel(ctxt, rel); + rc = jmp_rel(ctxt, rel); + if (rc != X86EMUL_CONTINUE) + return rc; return em_push(ctxt); } @@ -2900,34 +3012,50 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) u16 sel, old_cs; ulong old_eip; int rc; + struct desc_struct old_desc, new_desc; + const struct x86_emulate_ops *ops = ctxt->ops; + int cpl = ctxt->ops->cpl(ctxt); - old_cs = get_segment_selector(ctxt, VCPU_SREG_CS); old_eip = ctxt->_eip; + ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS); memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); - if (load_segment_descriptor(ctxt, sel, VCPU_SREG_CS)) + rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, + &new_desc); + if (rc != X86EMUL_CONTINUE) return X86EMUL_CONTINUE; - ctxt->_eip = 0; - memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes); + rc = assign_eip_far(ctxt, ctxt->src.val, new_desc.l); + if (rc != X86EMUL_CONTINUE) + goto fail; ctxt->src.val = old_cs; rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) - return rc; + goto fail; ctxt->src.val = old_eip; - return em_push(ctxt); + rc = em_push(ctxt); + /* If we failed, we tainted the memory, but the very least we should + restore cs */ + if (rc != X86EMUL_CONTINUE) + goto fail; + return rc; +fail: + ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS); + return rc; + } static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) { int rc; + unsigned long eip; - ctxt->dst.type = OP_REG; - ctxt->dst.addr.reg = &ctxt->_eip; - ctxt->dst.bytes = ctxt->op_bytes; - rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes); + rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + rc = assign_eip_near(ctxt, eip); if (rc != X86EMUL_CONTINUE) return rc; rsp_increment(ctxt, ctxt->src.val); @@ -3254,20 +3382,24 @@ static int em_lmsw(struct x86_emulate_ctxt *ctxt) static int em_loop(struct x86_emulate_ctxt *ctxt) { + int rc = X86EMUL_CONTINUE; + register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1); if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) && (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags))) - jmp_rel(ctxt, ctxt->src.val); + rc = jmp_rel(ctxt, ctxt->src.val); - return X86EMUL_CONTINUE; + return rc; } static int em_jcxz(struct x86_emulate_ctxt *ctxt) { + int rc = X86EMUL_CONTINUE; + if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) - jmp_rel(ctxt, ctxt->src.val); + rc = jmp_rel(ctxt, ctxt->src.val); - return X86EMUL_CONTINUE; + return rc; } static int em_in(struct x86_emulate_ctxt *ctxt) @@ -3355,6 +3487,12 @@ static int em_bswap(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_clflush(struct x86_emulate_ctxt *ctxt) +{ + /* emulating clflush regardless of cpuid */ + return X86EMUL_CONTINUE; +} + static bool valid_cr(int nr) { switch (nr) { @@ -3693,6 +3831,16 @@ static const struct opcode group11[] = { X7(D(Undefined)), }; +static const struct gprefix pfx_0f_ae_7 = { + I(SrcMem | ByteOp, em_clflush), N, N, N, +}; + +static const struct group_dual group15 = { { + N, N, N, N, N, N, N, GP(0, &pfx_0f_ae_7), +}, { + N, N, N, N, N, N, N, N, +} }; + static const struct gprefix pfx_0f_6f_0f_7f = { I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), }; @@ -3901,10 +4049,11 @@ static const struct opcode twobyte_table[256] = { N, I(ImplicitOps | EmulateOnUD, em_syscall), II(ImplicitOps | Priv, em_clts, clts), N, DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, - N, D(ImplicitOps | ModRM), N, N, + N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, /* 0x10 - 0x1F */ N, N, N, N, N, N, N, N, - D(ImplicitOps | ModRM), N, N, N, N, N, N, D(ImplicitOps | ModRM), + D(ImplicitOps | ModRM | SrcMem | NoAccess), + N, N, N, N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 0x20 - 0x2F */ DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read), DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read), @@ -3956,7 +4105,7 @@ static const struct opcode twobyte_table[256] = { F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd), F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), - D(ModRM), F(DstReg | SrcMem | ModRM, em_imul), + GD(0, &group15), F(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg), I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), @@ -4138,6 +4287,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, fetch_register_operand(op); break; case OpCL: + op->type = OP_IMM; op->bytes = 1; op->val = reg_read(ctxt, VCPU_REGS_RCX) & 0xff; break; @@ -4145,6 +4295,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, rc = decode_imm(ctxt, op, 1, true); break; case OpOne: + op->type = OP_IMM; op->bytes = 1; op->val = 1; break; @@ -4203,21 +4354,27 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, ctxt->memop.bytes = ctxt->op_bytes + 2; goto mem_common; case OpES: + op->type = OP_IMM; op->val = VCPU_SREG_ES; break; case OpCS: + op->type = OP_IMM; op->val = VCPU_SREG_CS; break; case OpSS: + op->type = OP_IMM; op->val = VCPU_SREG_SS; break; case OpDS: + op->type = OP_IMM; op->val = VCPU_SREG_DS; break; case OpFS: + op->type = OP_IMM; op->val = VCPU_SREG_FS; break; case OpGS: + op->type = OP_IMM; op->val = VCPU_SREG_GS; break; case OpImplicit: @@ -4473,10 +4630,10 @@ done_prefixes: /* Decode and fetch the destination operand: register or memory. */ rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask); -done: if (ctxt->rip_relative) ctxt->memopp->addr.mem.ea += ctxt->_eip; +done: return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; } @@ -4726,7 +4883,7 @@ special_insn: break; case 0x70 ... 0x7f: /* jcc (short) */ if (test_cc(ctxt->b, ctxt->eflags)) - jmp_rel(ctxt, ctxt->src.val); + rc = jmp_rel(ctxt, ctxt->src.val); break; case 0x8d: /* lea r16/r32, m */ ctxt->dst.val = ctxt->src.addr.mem.ea; @@ -4756,7 +4913,7 @@ special_insn: break; case 0xe9: /* jmp rel */ case 0xeb: /* jmp rel short */ - jmp_rel(ctxt, ctxt->src.val); + rc = jmp_rel(ctxt, ctxt->src.val); ctxt->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xf4: /* hlt */ @@ -4881,13 +5038,11 @@ twobyte_insn: break; case 0x80 ... 0x8f: /* jnz rel, etc*/ if (test_cc(ctxt->b, ctxt->eflags)) - jmp_rel(ctxt, ctxt->src.val); + rc = jmp_rel(ctxt, ctxt->src.val); break; case 0x90 ... 0x9f: /* setcc r/m8 */ ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); break; - case 0xae: /* clflush */ - break; case 0xb6 ... 0xb7: /* movzx */ ctxt->dst.bytes = ctxt->op_bytes; ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 518d86471b7..298781d4cfb 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -262,8 +262,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) return; timer = &pit->pit_state.timer; + mutex_lock(&pit->pit_state.lock); if (hrtimer_cancel(timer)) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); + mutex_unlock(&pit->pit_state.lock); } static void destroy_pit_timer(struct kvm_pit *pit) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ac1c4de3a48..978f402006e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -630,7 +630,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep) * kvm mmu, before reclaiming the page, we should * unmap it from mmu first. */ - WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn))); + WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn))); if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); @@ -2461,7 +2461,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= PT_PAGE_SIZE_MASK; if (tdp_enabled) spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); + kvm_is_reserved_pfn(pfn)); if (host_writable) spte |= SPTE_HOST_WRITEABLE; @@ -2737,7 +2737,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, * PT_PAGE_TABLE_LEVEL and there would be no adjustment done * here. */ - if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && + if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompound(pfn_to_page(pfn)) && !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 806d58e3c32..fd49c867b25 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -298,7 +298,7 @@ retry_walk: } #endif walker->max_level = walker->level; - ASSERT(!is_long_mode(vcpu) && is_pae(vcpu)); + ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu))); accessed_dirty = PT_GUEST_ACCESSED_MASK; pt_access = pte_access = ACC_ALL; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 65510f624df..7527cefc5a4 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3251,7 +3251,7 @@ static int wrmsr_interception(struct vcpu_svm *svm) msr.host_initiated = false; svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - if (svm_set_msr(&svm->vcpu, &msr)) { + if (kvm_set_msr(&svm->vcpu, &msr)) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); } else { @@ -3551,9 +3551,9 @@ static int handle_exit(struct kvm_vcpu *vcpu) if (exit_code >= ARRAY_SIZE(svm_exit_handlers) || !svm_exit_handlers[exit_code]) { - kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - kvm_run->hw.hardware_exit_reason = exit_code; - return 0; + WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_code); + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; } return svm_exit_handlers[exit_code](svm); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0acac81f198..3e556c68351 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2659,12 +2659,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) default: msr = find_msr_entry(vmx, msr_index); if (msr) { + u64 old_msr_data = msr->data; msr->data = data; if (msr - vmx->guest_msrs < vmx->save_nmsrs) { preempt_disable(); - kvm_set_shared_msr(msr->index, msr->data, - msr->mask); + ret = kvm_set_shared_msr(msr->index, msr->data, + msr->mask); preempt_enable(); + if (ret) + msr->data = old_msr_data; } break; } @@ -4576,7 +4579,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write32(TPR_THRESHOLD, 0); } - kvm_vcpu_reload_apic_access_page(vcpu); + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); if (vmx_vm_has_apicv(vcpu->kvm)) memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); @@ -5291,7 +5294,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) msr.data = data; msr.index = ecx; msr.host_initiated = false; - if (vmx_set_msr(vcpu, &msr) != 0) { + if (kvm_set_msr(vcpu, &msr) != 0) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; @@ -6423,6 +6426,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) const unsigned long *fields = shadow_read_write_fields; const int num_fields = max_shadow_read_write_fields; + preempt_disable(); + vmcs_load(shadow_vmcs); for (i = 0; i < num_fields; i++) { @@ -6446,6 +6451,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) vmcs_clear(shadow_vmcs); vmcs_load(vmx->loaded_vmcs->vmcs); + + preempt_enable(); } static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) @@ -6743,6 +6750,12 @@ static int handle_invept(struct kvm_vcpu *vcpu) return 1; } +static int handle_invvpid(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -6788,6 +6801,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, [EXIT_REASON_INVEPT] = handle_invept, + [EXIT_REASON_INVVPID] = handle_invvpid, }; static const int kvm_vmx_max_exit_handlers = @@ -7023,7 +7037,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: - case EXIT_REASON_INVEPT: + case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: /* * VMX instructions trap unconditionally. This allows L1 to * emulate them for its L2 guest, i.e., allows 3-level nesting! @@ -7164,10 +7178,10 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu); else { - vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; - vcpu->run->hw.hardware_exit_reason = exit_reason; + WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason); + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; } - return 0; } static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 34c8f94331f..0033df32a74 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -229,20 +229,25 @@ static void kvm_shared_msr_cpu_online(void) shared_msr_update(i, shared_msrs_global.msrs[i]); } -void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) +int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) { unsigned int cpu = smp_processor_id(); struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + int err; if (((value ^ smsr->values[slot].curr) & mask) == 0) - return; + return 0; smsr->values[slot].curr = value; - wrmsrl(shared_msrs_global.msrs[slot], value); + err = wrmsrl_safe(shared_msrs_global.msrs[slot], value); + if (err) + return 1; + if (!smsr->registered) { smsr->urn.on_user_return = kvm_on_user_return; user_return_notifier_register(&smsr->urn); smsr->registered = true; } + return 0; } EXPORT_SYMBOL_GPL(kvm_set_shared_msr); @@ -987,7 +992,6 @@ void kvm_enable_efer_bits(u64 mask) } EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); - /* * Writes msr value into into the appropriate "register". * Returns 0 on success, non-0 otherwise. @@ -995,8 +999,34 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); */ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { + switch (msr->index) { + case MSR_FS_BASE: + case MSR_GS_BASE: + case MSR_KERNEL_GS_BASE: + case MSR_CSTAR: + case MSR_LSTAR: + if (is_noncanonical_address(msr->data)) + return 1; + break; + case MSR_IA32_SYSENTER_EIP: + case MSR_IA32_SYSENTER_ESP: + /* + * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if + * non-canonical address is written on Intel but not on + * AMD (which ignores the top 32-bits, because it does + * not implement 64-bit SYSENTER). + * + * 64-bit code should hence be able to write a non-canonical + * value on AMD. Making the address canonical ensures that + * vmentry does not fail on Intel after writing a non-canonical + * value, and that something deterministic happens if the guest + * invokes 64-bit SYSENTER. + */ + msr->data = get_canonical(msr->data); + } return kvm_x86_ops->set_msr(vcpu, msr); } +EXPORT_SYMBOL_GPL(kvm_set_msr); /* * Adapt set_msr() to msr_io()'s calling convention diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index 7609e0e421e..1318f75d56e 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -41,9 +41,8 @@ csum_partial_copy_from_user(const void __user *src, void *dst, while (((unsigned long)src & 6) && len >= 2) { __u16 val16; - *errp = __get_user(val16, (const __u16 __user *)src); - if (*errp) - return isum; + if (__get_user(val16, (const __u16 __user *)src)) + goto out_err; *(__u16 *)dst = val16; isum = (__force __wsum)add32_with_carry( diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 54fcffed28e..2480978b31c 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -28,7 +28,7 @@ /* Verify next sizeof(t) bytes can be on the same instruction */ #define validate_next(t, insn, n) \ - ((insn)->next_byte + sizeof(t) + n - (insn)->kaddr <= MAX_INSN_SIZE) + ((insn)->next_byte + sizeof(t) + n < (insn)->end_kaddr) #define __get_next(t, insn) \ ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; }) @@ -50,10 +50,11 @@ * @kaddr: address (in kernel memory) of instruction (or copy thereof) * @x86_64: !0 for 64-bit kernel or 64-bit app */ -void insn_init(struct insn *insn, const void *kaddr, int x86_64) +void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) { memset(insn, 0, sizeof(*insn)); insn->kaddr = kaddr; + insn->end_kaddr = kaddr + buf_len; insn->next_byte = kaddr; insn->x86_64 = x86_64 ? 1 : 0; insn->opnd_bytes = 4; diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 6a19ad9f370..ecfdc46a024 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -30,3 +30,5 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_MEMTEST) += memtest.o + +obj-$(CONFIG_X86_INTEL_MPX) += mpx.o diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 95a427e5788..f0cedf3395a 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -76,6 +76,9 @@ static struct addr_marker address_markers[] = { # ifdef CONFIG_X86_ESPFIX64 { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, # endif +# ifdef CONFIG_EFI + { EFI_VA_END, "EFI Runtime Services" }, +# endif { __START_KERNEL_map, "High Kernel Mapping" }, { MODULES_VADDR, "Modules" }, { MODULES_END, "End Modules" }, @@ -126,7 +129,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) if (!pgprot_val(prot)) { /* Not present */ - pt_dump_cont_printf(m, dmsg, " "); + pt_dump_cont_printf(m, dmsg, " "); } else { if (pr & _PAGE_USER) pt_dump_cont_printf(m, dmsg, "USR "); @@ -145,18 +148,16 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) else pt_dump_cont_printf(m, dmsg, " "); - /* Bit 9 has a different meaning on level 3 vs 4 */ - if (level <= 3) { - if (pr & _PAGE_PSE) - pt_dump_cont_printf(m, dmsg, "PSE "); - else - pt_dump_cont_printf(m, dmsg, " "); - } else { - if (pr & _PAGE_PAT) - pt_dump_cont_printf(m, dmsg, "pat "); - else - pt_dump_cont_printf(m, dmsg, " "); - } + /* Bit 7 has a different meaning on level 3 vs 4 */ + if (level <= 3 && pr & _PAGE_PSE) + pt_dump_cont_printf(m, dmsg, "PSE "); + else + pt_dump_cont_printf(m, dmsg, " "); + if ((level == 4 && pr & _PAGE_PAT) || + ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE)) + pt_dump_cont_printf(m, dmsg, "pat "); + else + pt_dump_cont_printf(m, dmsg, " "); if (pr & _PAGE_GLOBAL) pt_dump_cont_printf(m, dmsg, "GLB "); else diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 66dba36f234..82b41d56bb9 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -27,6 +27,35 @@ #include "mm_internal.h" +/* + * Tables translating between page_cache_type_t and pte encoding. + * Minimal supported modes are defined statically, modified if more supported + * cache modes are available. + * Index into __cachemode2pte_tbl is the cachemode. + * Index into __pte2cachemode_tbl are the caching attribute bits of the pte + * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2. + */ +uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { + [_PAGE_CACHE_MODE_WB] = 0, + [_PAGE_CACHE_MODE_WC] = _PAGE_PWT, + [_PAGE_CACHE_MODE_UC_MINUS] = _PAGE_PCD, + [_PAGE_CACHE_MODE_UC] = _PAGE_PCD | _PAGE_PWT, + [_PAGE_CACHE_MODE_WT] = _PAGE_PCD, + [_PAGE_CACHE_MODE_WP] = _PAGE_PCD, +}; +EXPORT_SYMBOL_GPL(__cachemode2pte_tbl); +uint8_t __pte2cachemode_tbl[8] = { + [__pte2cm_idx(0)] = _PAGE_CACHE_MODE_WB, + [__pte2cm_idx(_PAGE_PWT)] = _PAGE_CACHE_MODE_WC, + [__pte2cm_idx(_PAGE_PCD)] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD)] = _PAGE_CACHE_MODE_UC, + [__pte2cm_idx(_PAGE_PAT)] = _PAGE_CACHE_MODE_WB, + [__pte2cm_idx(_PAGE_PWT | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC, + [__pte2cm_idx(_PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, +}; +EXPORT_SYMBOL_GPL(__pte2cachemode_tbl); + static unsigned long __initdata pgt_buf_start; static unsigned long __initdata pgt_buf_end; static unsigned long __initdata pgt_buf_top; @@ -687,3 +716,11 @@ void __init zone_sizes_init(void) free_area_init_nodes(max_zone_pfns); } +void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) +{ + /* entry 0 MUST be WB (hardwired to speed up translations) */ + BUG_ON(!entry && cache != _PAGE_CACHE_MODE_WB); + + __cachemode2pte_tbl[cache] = __cm_idx2pte(entry); + __pte2cachemode_tbl[entry] = cache; +} diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 4cb8763868f..78e53c80fc1 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -52,7 +52,6 @@ #include <asm/numa.h> #include <asm/cacheflush.h> #include <asm/init.h> -#include <asm/uv/uv.h> #include <asm/setup.h> #include "mm_internal.h" @@ -338,12 +337,15 @@ pte_t * __init populate_extra_pte(unsigned long vaddr) * Create large page table mappings for a range of physical addresses. */ static void __init __init_extra_mapping(unsigned long phys, unsigned long size, - pgprot_t prot) + enum page_cache_mode cache) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; + pgprot_t prot; + pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) | + pgprot_val(pgprot_4k_2_large(cachemode2pgprot(cache))); BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK)); for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { pgd = pgd_offset_k((unsigned long)__va(phys)); @@ -366,12 +368,12 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size, void __init init_extra_mapping_wb(unsigned long phys, unsigned long size) { - __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE); + __init_extra_mapping(phys, size, _PAGE_CACHE_MODE_WB); } void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) { - __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE); + __init_extra_mapping(phys, size, _PAGE_CACHE_MODE_UC); } /* @@ -1123,7 +1125,7 @@ void mark_rodata_ro(void) unsigned long end = (unsigned long) &__end_rodata_hpage_align; unsigned long text_end = PFN_ALIGN(&__stop___ex_table); unsigned long rodata_end = PFN_ALIGN(&__end_rodata); - unsigned long all_end = PFN_ALIGN(&_end); + unsigned long all_end; printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -1134,7 +1136,16 @@ void mark_rodata_ro(void) /* * The rodata/data/bss/brk section (but not the kernel text!) * should also be not-executable. + * + * We align all_end to PMD_SIZE because the existing mapping + * is a full PMD. If we would align _brk_end to PAGE_SIZE we + * split the PMD and the reminder between _brk_end and the end + * of the PMD will remain mapped executable. + * + * Any PMD which was setup after the one which covers _brk_end + * has been zapped already via cleanup_highmem(). */ + all_end = roundup((unsigned long)_brk_end, PMD_SIZE); set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT); rodata_test(); @@ -1247,12 +1258,10 @@ static unsigned long probe_memory_block_size(void) /* start from 2g */ unsigned long bz = 1UL<<31; -#ifdef CONFIG_X86_UV - if (is_uv_system()) { - printk(KERN_INFO "UV: memory block size 2GB\n"); + if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) { + pr_info("Using 2GB memory block size for large-memory system\n"); return 2UL * 1024 * 1024 * 1024; } -#endif /* less than 64g installed */ if ((max_pfn << PAGE_SHIFT) < (16UL << 32)) diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 7b179b499fa..9ca35fc60cf 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -33,17 +33,17 @@ static int is_io_mapping_possible(resource_size_t base, unsigned long size) int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) { - unsigned long flag = _PAGE_CACHE_WC; + enum page_cache_mode pcm = _PAGE_CACHE_MODE_WC; int ret; if (!is_io_mapping_possible(base, size)) return -EINVAL; - ret = io_reserve_memtype(base, base + size, &flag); + ret = io_reserve_memtype(base, base + size, &pcm); if (ret) return ret; - *prot = __pgprot(__PAGE_KERNEL | flag); + *prot = __pgprot(__PAGE_KERNEL | cachemode2protval(pcm)); return 0; } EXPORT_SYMBOL_GPL(iomap_create_wc); @@ -82,8 +82,10 @@ iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) * MTRR is UC or WC. UC_MINUS gets the real intention, of the * user, which is "WC if the MTRR is WC, UC if you can't do that." */ - if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) - prot = PAGE_KERNEL_UC_MINUS; + if (!pat_enabled && pgprot_val(prot) == + (__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_WC))) + prot = __pgprot(__PAGE_KERNEL | + cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)); return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot); } diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index af78e50ca6c..fdf617c00e2 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -29,20 +29,20 @@ * conflicts. */ int ioremap_change_attr(unsigned long vaddr, unsigned long size, - unsigned long prot_val) + enum page_cache_mode pcm) { unsigned long nrpages = size >> PAGE_SHIFT; int err; - switch (prot_val) { - case _PAGE_CACHE_UC: + switch (pcm) { + case _PAGE_CACHE_MODE_UC: default: err = _set_memory_uc(vaddr, nrpages); break; - case _PAGE_CACHE_WC: + case _PAGE_CACHE_MODE_WC: err = _set_memory_wc(vaddr, nrpages); break; - case _PAGE_CACHE_WB: + case _PAGE_CACHE_MODE_WB: err = _set_memory_wb(vaddr, nrpages); break; } @@ -75,14 +75,14 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages, * caller shouldn't need to know that small detail. */ static void __iomem *__ioremap_caller(resource_size_t phys_addr, - unsigned long size, unsigned long prot_val, void *caller) + unsigned long size, enum page_cache_mode pcm, void *caller) { unsigned long offset, vaddr; resource_size_t pfn, last_pfn, last_addr; const resource_size_t unaligned_phys_addr = phys_addr; const unsigned long unaligned_size = size; struct vm_struct *area; - unsigned long new_prot_val; + enum page_cache_mode new_pcm; pgprot_t prot; int retval; void __iomem *ret_addr; @@ -134,38 +134,40 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, size = PAGE_ALIGN(last_addr+1) - phys_addr; retval = reserve_memtype(phys_addr, (u64)phys_addr + size, - prot_val, &new_prot_val); + pcm, &new_pcm); if (retval) { printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval); return NULL; } - if (prot_val != new_prot_val) { - if (!is_new_memtype_allowed(phys_addr, size, - prot_val, new_prot_val)) { + if (pcm != new_pcm) { + if (!is_new_memtype_allowed(phys_addr, size, pcm, new_pcm)) { printk(KERN_ERR - "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n", + "ioremap error for 0x%llx-0x%llx, requested 0x%x, got 0x%x\n", (unsigned long long)phys_addr, (unsigned long long)(phys_addr + size), - prot_val, new_prot_val); + pcm, new_pcm); goto err_free_memtype; } - prot_val = new_prot_val; + pcm = new_pcm; } - switch (prot_val) { - case _PAGE_CACHE_UC: + prot = PAGE_KERNEL_IO; + switch (pcm) { + case _PAGE_CACHE_MODE_UC: default: - prot = PAGE_KERNEL_IO_NOCACHE; + prot = __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_UC)); break; - case _PAGE_CACHE_UC_MINUS: - prot = PAGE_KERNEL_IO_UC_MINUS; + case _PAGE_CACHE_MODE_UC_MINUS: + prot = __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)); break; - case _PAGE_CACHE_WC: - prot = PAGE_KERNEL_IO_WC; + case _PAGE_CACHE_MODE_WC: + prot = __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_WC)); break; - case _PAGE_CACHE_WB: - prot = PAGE_KERNEL_IO; + case _PAGE_CACHE_MODE_WB: break; } @@ -178,7 +180,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, area->phys_addr = phys_addr; vaddr = (unsigned long) area->addr; - if (kernel_map_sync_memtype(phys_addr, size, prot_val)) + if (kernel_map_sync_memtype(phys_addr, size, pcm)) goto err_free_area; if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) @@ -227,14 +229,14 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) { /* * Ideally, this should be: - * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS; + * pat_enabled ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS; * * Till we fix all X drivers to use ioremap_wc(), we will use * UC MINUS. */ - unsigned long val = _PAGE_CACHE_UC_MINUS; + enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS; - return __ioremap_caller(phys_addr, size, val, + return __ioremap_caller(phys_addr, size, pcm, __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_nocache); @@ -252,7 +254,7 @@ EXPORT_SYMBOL(ioremap_nocache); void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) { if (pat_enabled) - return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC, __builtin_return_address(0)); else return ioremap_nocache(phys_addr, size); @@ -261,7 +263,7 @@ EXPORT_SYMBOL(ioremap_wc); void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) { - return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB, + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB, __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_cache); @@ -269,7 +271,8 @@ EXPORT_SYMBOL(ioremap_cache); void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, unsigned long prot_val) { - return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK), + return __ioremap_caller(phys_addr, size, + pgprot2cachemode(__pgprot(prot_val)), __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_prot); @@ -327,7 +330,7 @@ EXPORT_SYMBOL(iounmap); * Convert a physical pointer to a virtual kernel pointer for /dev/mem * access */ -void *xlate_dev_mem_ptr(unsigned long phys) +void *xlate_dev_mem_ptr(phys_addr_t phys) { void *addr; unsigned long start = phys & PAGE_MASK; @@ -343,7 +346,7 @@ void *xlate_dev_mem_ptr(unsigned long phys) return addr; } -void unxlate_dev_mem_ptr(unsigned long phys, void *addr) +void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) { if (page_is_ram(phys >> PAGE_SHIFT)) return; diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index 6b563a11889..62474ba66c8 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -16,4 +16,6 @@ void zone_sizes_init(void); extern int after_bootmem; +void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache); + #endif /* __X86_MM_INTERNAL_H */ diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c new file mode 100644 index 00000000000..67ebf575122 --- /dev/null +++ b/arch/x86/mm/mpx.c @@ -0,0 +1,928 @@ +/* + * mpx.c - Memory Protection eXtensions + * + * Copyright (c) 2014, Intel Corporation. + * Qiaowei Ren <qiaowei.ren@intel.com> + * Dave Hansen <dave.hansen@intel.com> + */ +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/syscalls.h> +#include <linux/sched/sysctl.h> + +#include <asm/i387.h> +#include <asm/insn.h> +#include <asm/mman.h> +#include <asm/mmu_context.h> +#include <asm/mpx.h> +#include <asm/processor.h> +#include <asm/fpu-internal.h> + +static const char *mpx_mapping_name(struct vm_area_struct *vma) +{ + return "[mpx]"; +} + +static struct vm_operations_struct mpx_vma_ops = { + .name = mpx_mapping_name, +}; + +static int is_mpx_vma(struct vm_area_struct *vma) +{ + return (vma->vm_ops == &mpx_vma_ops); +} + +/* + * This is really a simplified "vm_mmap". it only handles MPX + * bounds tables (the bounds directory is user-allocated). + * + * Later on, we use the vma->vm_ops to uniquely identify these + * VMAs. + */ +static unsigned long mpx_mmap(unsigned long len) +{ + unsigned long ret; + unsigned long addr, pgoff; + struct mm_struct *mm = current->mm; + vm_flags_t vm_flags; + struct vm_area_struct *vma; + + /* Only bounds table and bounds directory can be allocated here */ + if (len != MPX_BD_SIZE_BYTES && len != MPX_BT_SIZE_BYTES) + return -EINVAL; + + down_write(&mm->mmap_sem); + + /* Too many mappings? */ + if (mm->map_count > sysctl_max_map_count) { + ret = -ENOMEM; + goto out; + } + + /* Obtain the address to map to. we verify (or select) it and ensure + * that it represents a valid section of the address space. + */ + addr = get_unmapped_area(NULL, 0, len, 0, MAP_ANONYMOUS | MAP_PRIVATE); + if (addr & ~PAGE_MASK) { + ret = addr; + goto out; + } + + vm_flags = VM_READ | VM_WRITE | VM_MPX | + mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + + /* Set pgoff according to addr for anon_vma */ + pgoff = addr >> PAGE_SHIFT; + + ret = mmap_region(NULL, addr, len, vm_flags, pgoff); + if (IS_ERR_VALUE(ret)) + goto out; + + vma = find_vma(mm, ret); + if (!vma) { + ret = -ENOMEM; + goto out; + } + vma->vm_ops = &mpx_vma_ops; + + if (vm_flags & VM_LOCKED) { + up_write(&mm->mmap_sem); + mm_populate(ret, len); + return ret; + } + +out: + up_write(&mm->mmap_sem); + return ret; +} + +enum reg_type { + REG_TYPE_RM = 0, + REG_TYPE_INDEX, + REG_TYPE_BASE, +}; + +static int get_reg_offset(struct insn *insn, struct pt_regs *regs, + enum reg_type type) +{ + int regno = 0; + + static const int regoff[] = { + offsetof(struct pt_regs, ax), + offsetof(struct pt_regs, cx), + offsetof(struct pt_regs, dx), + offsetof(struct pt_regs, bx), + offsetof(struct pt_regs, sp), + offsetof(struct pt_regs, bp), + offsetof(struct pt_regs, si), + offsetof(struct pt_regs, di), +#ifdef CONFIG_X86_64 + offsetof(struct pt_regs, r8), + offsetof(struct pt_regs, r9), + offsetof(struct pt_regs, r10), + offsetof(struct pt_regs, r11), + offsetof(struct pt_regs, r12), + offsetof(struct pt_regs, r13), + offsetof(struct pt_regs, r14), + offsetof(struct pt_regs, r15), +#endif + }; + int nr_registers = ARRAY_SIZE(regoff); + /* + * Don't possibly decode a 32-bit instructions as + * reading a 64-bit-only register. + */ + if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64) + nr_registers -= 8; + + switch (type) { + case REG_TYPE_RM: + regno = X86_MODRM_RM(insn->modrm.value); + if (X86_REX_B(insn->rex_prefix.value) == 1) + regno += 8; + break; + + case REG_TYPE_INDEX: + regno = X86_SIB_INDEX(insn->sib.value); + if (X86_REX_X(insn->rex_prefix.value) == 1) + regno += 8; + break; + + case REG_TYPE_BASE: + regno = X86_SIB_BASE(insn->sib.value); + if (X86_REX_B(insn->rex_prefix.value) == 1) + regno += 8; + break; + + default: + pr_err("invalid register type"); + BUG(); + break; + } + + if (regno > nr_registers) { + WARN_ONCE(1, "decoded an instruction with an invalid register"); + return -EINVAL; + } + return regoff[regno]; +} + +/* + * return the address being referenced be instruction + * for rm=3 returning the content of the rm reg + * for rm!=3 calculates the address using SIB and Disp + */ +static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs) +{ + unsigned long addr, base, indx; + int addr_offset, base_offset, indx_offset; + insn_byte_t sib; + + insn_get_modrm(insn); + insn_get_sib(insn); + sib = insn->sib.value; + + if (X86_MODRM_MOD(insn->modrm.value) == 3) { + addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); + if (addr_offset < 0) + goto out_err; + addr = regs_get_register(regs, addr_offset); + } else { + if (insn->sib.nbytes) { + base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE); + if (base_offset < 0) + goto out_err; + + indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX); + if (indx_offset < 0) + goto out_err; + + base = regs_get_register(regs, base_offset); + indx = regs_get_register(regs, indx_offset); + addr = base + indx * (1 << X86_SIB_SCALE(sib)); + } else { + addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); + if (addr_offset < 0) + goto out_err; + addr = regs_get_register(regs, addr_offset); + } + addr += insn->displacement.value; + } + return (void __user *)addr; +out_err: + return (void __user *)-1; +} + +static int mpx_insn_decode(struct insn *insn, + struct pt_regs *regs) +{ + unsigned char buf[MAX_INSN_SIZE]; + int x86_64 = !test_thread_flag(TIF_IA32); + int not_copied; + int nr_copied; + + not_copied = copy_from_user(buf, (void __user *)regs->ip, sizeof(buf)); + nr_copied = sizeof(buf) - not_copied; + /* + * The decoder _should_ fail nicely if we pass it a short buffer. + * But, let's not depend on that implementation detail. If we + * did not get anything, just error out now. + */ + if (!nr_copied) + return -EFAULT; + insn_init(insn, buf, nr_copied, x86_64); + insn_get_length(insn); + /* + * copy_from_user() tries to get as many bytes as we could see in + * the largest possible instruction. If the instruction we are + * after is shorter than that _and_ we attempt to copy from + * something unreadable, we might get a short read. This is OK + * as long as the read did not stop in the middle of the + * instruction. Check to see if we got a partial instruction. + */ + if (nr_copied < insn->length) + return -EFAULT; + + insn_get_opcode(insn); + /* + * We only _really_ need to decode bndcl/bndcn/bndcu + * Error out on anything else. + */ + if (insn->opcode.bytes[0] != 0x0f) + goto bad_opcode; + if ((insn->opcode.bytes[1] != 0x1a) && + (insn->opcode.bytes[1] != 0x1b)) + goto bad_opcode; + + return 0; +bad_opcode: + return -EINVAL; +} + +/* + * If a bounds overflow occurs then a #BR is generated. This + * function decodes MPX instructions to get violation address + * and set this address into extended struct siginfo. + * + * Note that this is not a super precise way of doing this. + * Userspace could have, by the time we get here, written + * anything it wants in to the instructions. We can not + * trust anything about it. They might not be valid + * instructions or might encode invalid registers, etc... + * + * The caller is expected to kfree() the returned siginfo_t. + */ +siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, + struct xsave_struct *xsave_buf) +{ + struct bndreg *bndregs, *bndreg; + siginfo_t *info = NULL; + struct insn insn; + uint8_t bndregno; + int err; + + err = mpx_insn_decode(&insn, regs); + if (err) + goto err_out; + + /* + * We know at this point that we are only dealing with + * MPX instructions. + */ + insn_get_modrm(&insn); + bndregno = X86_MODRM_REG(insn.modrm.value); + if (bndregno > 3) { + err = -EINVAL; + goto err_out; + } + /* get the bndregs _area_ of the xsave structure */ + bndregs = get_xsave_addr(xsave_buf, XSTATE_BNDREGS); + if (!bndregs) { + err = -EINVAL; + goto err_out; + } + /* now go select the individual register in the set of 4 */ + bndreg = &bndregs[bndregno]; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + err = -ENOMEM; + goto err_out; + } + /* + * The registers are always 64-bit, but the upper 32 + * bits are ignored in 32-bit mode. Also, note that the + * upper bounds are architecturally represented in 1's + * complement form. + * + * The 'unsigned long' cast is because the compiler + * complains when casting from integers to different-size + * pointers. + */ + info->si_lower = (void __user *)(unsigned long)bndreg->lower_bound; + info->si_upper = (void __user *)(unsigned long)~bndreg->upper_bound; + info->si_addr_lsb = 0; + info->si_signo = SIGSEGV; + info->si_errno = 0; + info->si_code = SEGV_BNDERR; + info->si_addr = mpx_get_addr_ref(&insn, regs); + /* + * We were not able to extract an address from the instruction, + * probably because there was something invalid in it. + */ + if (info->si_addr == (void *)-1) { + err = -EINVAL; + goto err_out; + } + return info; +err_out: + /* info might be NULL, but kfree() handles that */ + kfree(info); + return ERR_PTR(err); +} + +static __user void *task_get_bounds_dir(struct task_struct *tsk) +{ + struct bndcsr *bndcsr; + + if (!cpu_feature_enabled(X86_FEATURE_MPX)) + return MPX_INVALID_BOUNDS_DIR; + + /* + * The bounds directory pointer is stored in a register + * only accessible if we first do an xsave. + */ + fpu_save_init(&tsk->thread.fpu); + bndcsr = get_xsave_addr(&tsk->thread.fpu.state->xsave, XSTATE_BNDCSR); + if (!bndcsr) + return MPX_INVALID_BOUNDS_DIR; + + /* + * Make sure the register looks valid by checking the + * enable bit. + */ + if (!(bndcsr->bndcfgu & MPX_BNDCFG_ENABLE_FLAG)) + return MPX_INVALID_BOUNDS_DIR; + + /* + * Lastly, mask off the low bits used for configuration + * flags, and return the address of the bounds table. + */ + return (void __user *)(unsigned long) + (bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK); +} + +int mpx_enable_management(struct task_struct *tsk) +{ + void __user *bd_base = MPX_INVALID_BOUNDS_DIR; + struct mm_struct *mm = tsk->mm; + int ret = 0; + + /* + * runtime in the userspace will be responsible for allocation of + * the bounds directory. Then, it will save the base of the bounds + * directory into XSAVE/XRSTOR Save Area and enable MPX through + * XRSTOR instruction. + * + * fpu_xsave() is expected to be very expensive. Storing the bounds + * directory here means that we do not have to do xsave in the unmap + * path; we can just use mm->bd_addr instead. + */ + bd_base = task_get_bounds_dir(tsk); + down_write(&mm->mmap_sem); + mm->bd_addr = bd_base; + if (mm->bd_addr == MPX_INVALID_BOUNDS_DIR) + ret = -ENXIO; + + up_write(&mm->mmap_sem); + return ret; +} + +int mpx_disable_management(struct task_struct *tsk) +{ + struct mm_struct *mm = current->mm; + + if (!cpu_feature_enabled(X86_FEATURE_MPX)) + return -ENXIO; + + down_write(&mm->mmap_sem); + mm->bd_addr = MPX_INVALID_BOUNDS_DIR; + up_write(&mm->mmap_sem); + return 0; +} + +/* + * With 32-bit mode, MPX_BT_SIZE_BYTES is 4MB, and the size of each + * bounds table is 16KB. With 64-bit mode, MPX_BT_SIZE_BYTES is 2GB, + * and the size of each bounds table is 4MB. + */ +static int allocate_bt(long __user *bd_entry) +{ + unsigned long expected_old_val = 0; + unsigned long actual_old_val = 0; + unsigned long bt_addr; + int ret = 0; + + /* + * Carve the virtual space out of userspace for the new + * bounds table: + */ + bt_addr = mpx_mmap(MPX_BT_SIZE_BYTES); + if (IS_ERR((void *)bt_addr)) + return PTR_ERR((void *)bt_addr); + /* + * Set the valid flag (kinda like _PAGE_PRESENT in a pte) + */ + bt_addr = bt_addr | MPX_BD_ENTRY_VALID_FLAG; + + /* + * Go poke the address of the new bounds table in to the + * bounds directory entry out in userspace memory. Note: + * we may race with another CPU instantiating the same table. + * In that case the cmpxchg will see an unexpected + * 'actual_old_val'. + * + * This can fault, but that's OK because we do not hold + * mmap_sem at this point, unlike some of the other part + * of the MPX code that have to pagefault_disable(). + */ + ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry, + expected_old_val, bt_addr); + if (ret) + goto out_unmap; + + /* + * The user_atomic_cmpxchg_inatomic() will only return nonzero + * for faults, *not* if the cmpxchg itself fails. Now we must + * verify that the cmpxchg itself completed successfully. + */ + /* + * We expected an empty 'expected_old_val', but instead found + * an apparently valid entry. Assume we raced with another + * thread to instantiate this table and desclare succecss. + */ + if (actual_old_val & MPX_BD_ENTRY_VALID_FLAG) { + ret = 0; + goto out_unmap; + } + /* + * We found a non-empty bd_entry but it did not have the + * VALID_FLAG set. Return an error which will result in + * a SEGV since this probably means that somebody scribbled + * some invalid data in to a bounds table. + */ + if (expected_old_val != actual_old_val) { + ret = -EINVAL; + goto out_unmap; + } + return 0; +out_unmap: + vm_munmap(bt_addr & MPX_BT_ADDR_MASK, MPX_BT_SIZE_BYTES); + return ret; +} + +/* + * When a BNDSTX instruction attempts to save bounds to a bounds + * table, it will first attempt to look up the table in the + * first-level bounds directory. If it does not find a table in + * the directory, a #BR is generated and we get here in order to + * allocate a new table. + * + * With 32-bit mode, the size of BD is 4MB, and the size of each + * bound table is 16KB. With 64-bit mode, the size of BD is 2GB, + * and the size of each bound table is 4MB. + */ +static int do_mpx_bt_fault(struct xsave_struct *xsave_buf) +{ + unsigned long bd_entry, bd_base; + struct bndcsr *bndcsr; + + bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR); + if (!bndcsr) + return -EINVAL; + /* + * Mask off the preserve and enable bits + */ + bd_base = bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK; + /* + * The hardware provides the address of the missing or invalid + * entry via BNDSTATUS, so we don't have to go look it up. + */ + bd_entry = bndcsr->bndstatus & MPX_BNDSTA_ADDR_MASK; + /* + * Make sure the directory entry is within where we think + * the directory is. + */ + if ((bd_entry < bd_base) || + (bd_entry >= bd_base + MPX_BD_SIZE_BYTES)) + return -EINVAL; + + return allocate_bt((long __user *)bd_entry); +} + +int mpx_handle_bd_fault(struct xsave_struct *xsave_buf) +{ + /* + * Userspace never asked us to manage the bounds tables, + * so refuse to help. + */ + if (!kernel_managing_mpx_tables(current->mm)) + return -EINVAL; + + if (do_mpx_bt_fault(xsave_buf)) { + force_sig(SIGSEGV, current); + /* + * The force_sig() is essentially "handling" this + * exception, so we do not pass up the error + * from do_mpx_bt_fault(). + */ + } + return 0; +} + +/* + * A thin wrapper around get_user_pages(). Returns 0 if the + * fault was resolved or -errno if not. + */ +static int mpx_resolve_fault(long __user *addr, int write) +{ + long gup_ret; + int nr_pages = 1; + int force = 0; + + gup_ret = get_user_pages(current, current->mm, (unsigned long)addr, + nr_pages, write, force, NULL, NULL); + /* + * get_user_pages() returns number of pages gotten. + * 0 means we failed to fault in and get anything, + * probably because 'addr' is bad. + */ + if (!gup_ret) + return -EFAULT; + /* Other error, return it */ + if (gup_ret < 0) + return gup_ret; + /* must have gup'd a page and gup_ret>0, success */ + return 0; +} + +/* + * Get the base of bounds tables pointed by specific bounds + * directory entry. + */ +static int get_bt_addr(struct mm_struct *mm, + long __user *bd_entry, unsigned long *bt_addr) +{ + int ret; + int valid_bit; + + if (!access_ok(VERIFY_READ, (bd_entry), sizeof(*bd_entry))) + return -EFAULT; + + while (1) { + int need_write = 0; + + pagefault_disable(); + ret = get_user(*bt_addr, bd_entry); + pagefault_enable(); + if (!ret) + break; + if (ret == -EFAULT) + ret = mpx_resolve_fault(bd_entry, need_write); + /* + * If we could not resolve the fault, consider it + * userspace's fault and error out. + */ + if (ret) + return ret; + } + + valid_bit = *bt_addr & MPX_BD_ENTRY_VALID_FLAG; + *bt_addr &= MPX_BT_ADDR_MASK; + + /* + * When the kernel is managing bounds tables, a bounds directory + * entry will either have a valid address (plus the valid bit) + * *OR* be completely empty. If we see a !valid entry *and* some + * data in the address field, we know something is wrong. This + * -EINVAL return will cause a SIGSEGV. + */ + if (!valid_bit && *bt_addr) + return -EINVAL; + /* + * Do we have an completely zeroed bt entry? That is OK. It + * just means there was no bounds table for this memory. Make + * sure to distinguish this from -EINVAL, which will cause + * a SEGV. + */ + if (!valid_bit) + return -ENOENT; + + return 0; +} + +/* + * Free the backing physical pages of bounds table 'bt_addr'. + * Assume start...end is within that bounds table. + */ +static int zap_bt_entries(struct mm_struct *mm, + unsigned long bt_addr, + unsigned long start, unsigned long end) +{ + struct vm_area_struct *vma; + unsigned long addr, len; + + /* + * Find the first overlapping vma. If vma->vm_start > start, there + * will be a hole in the bounds table. This -EINVAL return will + * cause a SIGSEGV. + */ + vma = find_vma(mm, start); + if (!vma || vma->vm_start > start) + return -EINVAL; + + /* + * A NUMA policy on a VM_MPX VMA could cause this bouds table to + * be split. So we need to look across the entire 'start -> end' + * range of this bounds table, find all of the VM_MPX VMAs, and + * zap only those. + */ + addr = start; + while (vma && vma->vm_start < end) { + /* + * We followed a bounds directory entry down + * here. If we find a non-MPX VMA, that's bad, + * so stop immediately and return an error. This + * probably results in a SIGSEGV. + */ + if (!is_mpx_vma(vma)) + return -EINVAL; + + len = min(vma->vm_end, end) - addr; + zap_page_range(vma, addr, len, NULL); + + vma = vma->vm_next; + addr = vma->vm_start; + } + + return 0; +} + +static int unmap_single_bt(struct mm_struct *mm, + long __user *bd_entry, unsigned long bt_addr) +{ + unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG; + unsigned long actual_old_val = 0; + int ret; + + while (1) { + int need_write = 1; + + pagefault_disable(); + ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry, + expected_old_val, 0); + pagefault_enable(); + if (!ret) + break; + if (ret == -EFAULT) + ret = mpx_resolve_fault(bd_entry, need_write); + /* + * If we could not resolve the fault, consider it + * userspace's fault and error out. + */ + if (ret) + return ret; + } + /* + * The cmpxchg was performed, check the results. + */ + if (actual_old_val != expected_old_val) { + /* + * Someone else raced with us to unmap the table. + * There was no bounds table pointed to by the + * directory, so declare success. Somebody freed + * it. + */ + if (!actual_old_val) + return 0; + /* + * Something messed with the bounds directory + * entry. We hold mmap_sem for read or write + * here, so it could not be a _new_ bounds table + * that someone just allocated. Something is + * wrong, so pass up the error and SIGSEGV. + */ + return -EINVAL; + } + + /* + * Note, we are likely being called under do_munmap() already. To + * avoid recursion, do_munmap() will check whether it comes + * from one bounds table through VM_MPX flag. + */ + return do_munmap(mm, bt_addr, MPX_BT_SIZE_BYTES); +} + +/* + * If the bounds table pointed by bounds directory 'bd_entry' is + * not shared, unmap this whole bounds table. Otherwise, only free + * those backing physical pages of bounds table entries covered + * in this virtual address region start...end. + */ +static int unmap_shared_bt(struct mm_struct *mm, + long __user *bd_entry, unsigned long start, + unsigned long end, bool prev_shared, bool next_shared) +{ + unsigned long bt_addr; + int ret; + + ret = get_bt_addr(mm, bd_entry, &bt_addr); + /* + * We could see an "error" ret for not-present bounds + * tables (not really an error), or actual errors, but + * stop unmapping either way. + */ + if (ret) + return ret; + + if (prev_shared && next_shared) + ret = zap_bt_entries(mm, bt_addr, + bt_addr+MPX_GET_BT_ENTRY_OFFSET(start), + bt_addr+MPX_GET_BT_ENTRY_OFFSET(end)); + else if (prev_shared) + ret = zap_bt_entries(mm, bt_addr, + bt_addr+MPX_GET_BT_ENTRY_OFFSET(start), + bt_addr+MPX_BT_SIZE_BYTES); + else if (next_shared) + ret = zap_bt_entries(mm, bt_addr, bt_addr, + bt_addr+MPX_GET_BT_ENTRY_OFFSET(end)); + else + ret = unmap_single_bt(mm, bd_entry, bt_addr); + + return ret; +} + +/* + * A virtual address region being munmap()ed might share bounds table + * with adjacent VMAs. We only need to free the backing physical + * memory of these shared bounds tables entries covered in this virtual + * address region. + */ +static int unmap_edge_bts(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + int ret; + long __user *bde_start, *bde_end; + struct vm_area_struct *prev, *next; + bool prev_shared = false, next_shared = false; + + bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start); + bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1); + + /* + * Check whether bde_start and bde_end are shared with adjacent + * VMAs. + * + * We already unliked the VMAs from the mm's rbtree so 'start' + * is guaranteed to be in a hole. This gets us the first VMA + * before the hole in to 'prev' and the next VMA after the hole + * in to 'next'. + */ + next = find_vma_prev(mm, start, &prev); + if (prev && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(prev->vm_end-1)) + == bde_start) + prev_shared = true; + if (next && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(next->vm_start)) + == bde_end) + next_shared = true; + + /* + * This virtual address region being munmap()ed is only + * covered by one bounds table. + * + * In this case, if this table is also shared with adjacent + * VMAs, only part of the backing physical memory of the bounds + * table need be freeed. Otherwise the whole bounds table need + * be unmapped. + */ + if (bde_start == bde_end) { + return unmap_shared_bt(mm, bde_start, start, end, + prev_shared, next_shared); + } + + /* + * If more than one bounds tables are covered in this virtual + * address region being munmap()ed, we need to separately check + * whether bde_start and bde_end are shared with adjacent VMAs. + */ + ret = unmap_shared_bt(mm, bde_start, start, end, prev_shared, false); + if (ret) + return ret; + ret = unmap_shared_bt(mm, bde_end, start, end, false, next_shared); + if (ret) + return ret; + + return 0; +} + +static int mpx_unmap_tables(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + int ret; + long __user *bd_entry, *bde_start, *bde_end; + unsigned long bt_addr; + + /* + * "Edge" bounds tables are those which are being used by the region + * (start -> end), but that may be shared with adjacent areas. If they + * turn out to be completely unshared, they will be freed. If they are + * shared, we will free the backing store (like an MADV_DONTNEED) for + * areas used by this region. + */ + ret = unmap_edge_bts(mm, start, end); + switch (ret) { + /* non-present tables are OK */ + case 0: + case -ENOENT: + /* Success, or no tables to unmap */ + break; + case -EINVAL: + case -EFAULT: + default: + return ret; + } + + /* + * Only unmap the bounds table that are + * 1. fully covered + * 2. not at the edges of the mapping, even if full aligned + */ + bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start); + bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1); + for (bd_entry = bde_start + 1; bd_entry < bde_end; bd_entry++) { + ret = get_bt_addr(mm, bd_entry, &bt_addr); + switch (ret) { + case 0: + break; + case -ENOENT: + /* No table here, try the next one */ + continue; + case -EINVAL: + case -EFAULT: + default: + /* + * Note: we are being strict here. + * Any time we run in to an issue + * unmapping tables, we stop and + * SIGSEGV. + */ + return ret; + } + + ret = unmap_single_bt(mm, bd_entry, bt_addr); + if (ret) + return ret; + } + + return 0; +} + +/* + * Free unused bounds tables covered in a virtual address region being + * munmap()ed. Assume end > start. + * + * This function will be called by do_munmap(), and the VMAs covering + * the virtual address region start...end have already been split if + * necessary, and the 'vma' is the first vma in this range (start -> end). + */ +void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + int ret; + + /* + * Refuse to do anything unless userspace has asked + * the kernel to help manage the bounds tables, + */ + if (!kernel_managing_mpx_tables(current->mm)) + return; + /* + * This will look across the entire 'start -> end' range, + * and find all of the non-VM_MPX VMAs. + * + * To avoid recursion, if a VM_MPX vma is found in the range + * (start->end), we will not continue follow-up work. This + * recursion represents having bounds tables for bounds tables, + * which should not occur normally. Being strict about it here + * helps ensure that we do not have an exploitable stack overflow. + */ + do { + if (vma->vm_flags & VM_MPX) + return; + vma = vma->vm_next; + } while (vma && vma->vm_start < end); + + ret = mpx_unmap_tables(mm, start, end); + if (ret) + force_sig(SIGSEGV, current); +} diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index ae242a7c11c..a3a5d46605d 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -409,7 +409,7 @@ phys_addr_t slow_virt_to_phys(void *__virt_addr) psize = page_level_size(level); pmask = page_level_mask(level); offset = virt_addr & ~pmask; - phys_addr = pte_pfn(*pte) << PAGE_SHIFT; + phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; return (phys_addr | offset); } EXPORT_SYMBOL_GPL(slow_virt_to_phys); @@ -485,14 +485,23 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, /* * We are safe now. Check whether the new pgprot is the same: + * Convert protection attributes to 4k-format, as cpa->mask* are set + * up accordingly. */ old_pte = *kpte; - old_prot = req_prot = pte_pgprot(old_pte); + old_prot = req_prot = pgprot_large_2_4k(pte_pgprot(old_pte)); pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); /* + * req_prot is in format of 4k pages. It must be converted to large + * page format: the caching mode includes the PAT bit located at + * different bit positions in the two formats. + */ + req_prot = pgprot_4k_2_large(req_prot); + + /* * Set the PSE and GLOBAL flags only if the PRESENT flag is * set otherwise pmd_present/pmd_huge will return true even on * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL @@ -585,13 +594,10 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, paravirt_alloc_pte(&init_mm, page_to_pfn(base)); ref_prot = pte_pgprot(pte_clrhuge(*kpte)); - /* - * If we ever want to utilize the PAT bit, we need to - * update this function to make sure it's converted from - * bit 12 to bit 7 when we cross from the 2MB level to - * the 4K level: - */ - WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE); + + /* promote PAT bit to correct position */ + if (level == PG_LEVEL_2M) + ref_prot = pgprot_large_2_4k(ref_prot); #ifdef CONFIG_X86_64 if (level == PG_LEVEL_1G) { @@ -879,6 +885,7 @@ static int populate_pmd(struct cpa_data *cpa, { unsigned int cur_pages = 0; pmd_t *pmd; + pgprot_t pmd_pgprot; /* * Not on a 2M boundary? @@ -910,6 +917,8 @@ static int populate_pmd(struct cpa_data *cpa, if (num_pages == cur_pages) return cur_pages; + pmd_pgprot = pgprot_4k_2_large(pgprot); + while (end - start >= PMD_SIZE) { /* @@ -921,7 +930,8 @@ static int populate_pmd(struct cpa_data *cpa, pmd = pmd_offset(pud, start); - set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot))); + set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | + massage_pgprot(pmd_pgprot))); start += PMD_SIZE; cpa->pfn += PMD_SIZE; @@ -949,6 +959,7 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, pud_t *pud; unsigned long end; int cur_pages = 0; + pgprot_t pud_pgprot; end = start + (cpa->numpages << PAGE_SHIFT); @@ -986,12 +997,14 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, return cur_pages; pud = pud_offset(pgd, start); + pud_pgprot = pgprot_4k_2_large(pgprot); /* * Map everything starting from the Gb boundary, possibly with 1G pages */ while (end - start >= PUD_SIZE) { - set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot))); + set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | + massage_pgprot(pud_pgprot))); start += PUD_SIZE; cpa->pfn += PUD_SIZE; @@ -1304,12 +1317,6 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) return 0; } -static inline int cache_attr(pgprot_t attr) -{ - return pgprot_val(attr) & - (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); -} - static int change_page_attr_set_clr(unsigned long *addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr, int force_split, int in_flag, @@ -1390,7 +1397,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, * No need to flush, when we did not set any of the caching * attributes: */ - cache = cache_attr(mask_set); + cache = !!pgprot2cachemode(mask_set); /* * On success we use CLFLUSH, when the CPU supports it to @@ -1445,7 +1452,8 @@ int _set_memory_uc(unsigned long addr, int numpages) * for now UC MINUS. see comments in ioremap_nocache() */ return change_page_attr_set(&addr, numpages, - __pgprot(_PAGE_CACHE_UC_MINUS), 0); + cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), + 0); } int set_memory_uc(unsigned long addr, int numpages) @@ -1456,7 +1464,7 @@ int set_memory_uc(unsigned long addr, int numpages) * for now UC MINUS. see comments in ioremap_nocache() */ ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, - _PAGE_CACHE_UC_MINUS, NULL); + _PAGE_CACHE_MODE_UC_MINUS, NULL); if (ret) goto out_err; @@ -1474,7 +1482,7 @@ out_err: EXPORT_SYMBOL(set_memory_uc); static int _set_memory_array(unsigned long *addr, int addrinarray, - unsigned long new_type) + enum page_cache_mode new_type) { int i, j; int ret; @@ -1490,11 +1498,13 @@ static int _set_memory_array(unsigned long *addr, int addrinarray, } ret = change_page_attr_set(addr, addrinarray, - __pgprot(_PAGE_CACHE_UC_MINUS), 1); + cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), + 1); - if (!ret && new_type == _PAGE_CACHE_WC) + if (!ret && new_type == _PAGE_CACHE_MODE_WC) ret = change_page_attr_set_clr(addr, addrinarray, - __pgprot(_PAGE_CACHE_WC), + cachemode2pgprot( + _PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), 0, CPA_ARRAY, NULL); if (ret) @@ -1511,13 +1521,13 @@ out_free: int set_memory_array_uc(unsigned long *addr, int addrinarray) { - return _set_memory_array(addr, addrinarray, _PAGE_CACHE_UC_MINUS); + return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); } EXPORT_SYMBOL(set_memory_array_uc); int set_memory_array_wc(unsigned long *addr, int addrinarray) { - return _set_memory_array(addr, addrinarray, _PAGE_CACHE_WC); + return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC); } EXPORT_SYMBOL(set_memory_array_wc); @@ -1527,10 +1537,12 @@ int _set_memory_wc(unsigned long addr, int numpages) unsigned long addr_copy = addr; ret = change_page_attr_set(&addr, numpages, - __pgprot(_PAGE_CACHE_UC_MINUS), 0); + cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), + 0); if (!ret) { ret = change_page_attr_set_clr(&addr_copy, numpages, - __pgprot(_PAGE_CACHE_WC), + cachemode2pgprot( + _PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), 0, 0, NULL); } @@ -1545,7 +1557,7 @@ int set_memory_wc(unsigned long addr, int numpages) return set_memory_uc(addr, numpages); ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, - _PAGE_CACHE_WC, NULL); + _PAGE_CACHE_MODE_WC, NULL); if (ret) goto out_err; @@ -1564,6 +1576,7 @@ EXPORT_SYMBOL(set_memory_wc); int _set_memory_wb(unsigned long addr, int numpages) { + /* WB cache mode is hard wired to all cache attribute bits being 0 */ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_CACHE_MASK), 0); } @@ -1586,6 +1599,7 @@ int set_memory_array_wb(unsigned long *addr, int addrinarray) int i; int ret; + /* WB cache mode is hard wired to all cache attribute bits being 0 */ ret = change_page_attr_clear(addr, addrinarray, __pgprot(_PAGE_CACHE_MASK), 1); if (ret) @@ -1648,7 +1662,7 @@ int set_pages_uc(struct page *page, int numpages) EXPORT_SYMBOL(set_pages_uc); static int _set_pages_array(struct page **pages, int addrinarray, - unsigned long new_type) + enum page_cache_mode new_type) { unsigned long start; unsigned long end; @@ -1666,10 +1680,11 @@ static int _set_pages_array(struct page **pages, int addrinarray, } ret = cpa_set_pages_array(pages, addrinarray, - __pgprot(_PAGE_CACHE_UC_MINUS)); - if (!ret && new_type == _PAGE_CACHE_WC) + cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS)); + if (!ret && new_type == _PAGE_CACHE_MODE_WC) ret = change_page_attr_set_clr(NULL, addrinarray, - __pgprot(_PAGE_CACHE_WC), + cachemode2pgprot( + _PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), 0, CPA_PAGES_ARRAY, pages); if (ret) @@ -1689,13 +1704,13 @@ err_out: int set_pages_array_uc(struct page **pages, int addrinarray) { - return _set_pages_array(pages, addrinarray, _PAGE_CACHE_UC_MINUS); + return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); } EXPORT_SYMBOL(set_pages_array_uc); int set_pages_array_wc(struct page **pages, int addrinarray) { - return _set_pages_array(pages, addrinarray, _PAGE_CACHE_WC); + return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC); } EXPORT_SYMBOL(set_pages_array_wc); @@ -1714,6 +1729,7 @@ int set_pages_array_wb(struct page **pages, int addrinarray) unsigned long end; int i; + /* WB cache mode is hard wired to all cache attribute bits being 0 */ retval = cpa_clear_pages_array(pages, addrinarray, __pgprot(_PAGE_CACHE_MASK)); if (retval) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 657438858e8..edf299c8ff6 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -31,6 +31,7 @@ #include <asm/io.h> #include "pat_internal.h" +#include "mm_internal.h" #ifdef CONFIG_X86_PAT int __read_mostly pat_enabled = 1; @@ -66,6 +67,75 @@ __setup("debugpat", pat_debug_setup); static u64 __read_mostly boot_pat_state; +#ifdef CONFIG_X86_PAT +/* + * X86 PAT uses page flags WC and Uncached together to keep track of + * memory type of pages that have backing page struct. X86 PAT supports 3 + * different memory types, _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC and + * _PAGE_CACHE_MODE_UC_MINUS and fourth state where page's memory type has not + * been changed from its default (value of -1 used to denote this). + * Note we do not support _PAGE_CACHE_MODE_UC here. + */ + +#define _PGMT_DEFAULT 0 +#define _PGMT_WC (1UL << PG_arch_1) +#define _PGMT_UC_MINUS (1UL << PG_uncached) +#define _PGMT_WB (1UL << PG_uncached | 1UL << PG_arch_1) +#define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1) +#define _PGMT_CLEAR_MASK (~_PGMT_MASK) + +static inline enum page_cache_mode get_page_memtype(struct page *pg) +{ + unsigned long pg_flags = pg->flags & _PGMT_MASK; + + if (pg_flags == _PGMT_DEFAULT) + return -1; + else if (pg_flags == _PGMT_WC) + return _PAGE_CACHE_MODE_WC; + else if (pg_flags == _PGMT_UC_MINUS) + return _PAGE_CACHE_MODE_UC_MINUS; + else + return _PAGE_CACHE_MODE_WB; +} + +static inline void set_page_memtype(struct page *pg, + enum page_cache_mode memtype) +{ + unsigned long memtype_flags; + unsigned long old_flags; + unsigned long new_flags; + + switch (memtype) { + case _PAGE_CACHE_MODE_WC: + memtype_flags = _PGMT_WC; + break; + case _PAGE_CACHE_MODE_UC_MINUS: + memtype_flags = _PGMT_UC_MINUS; + break; + case _PAGE_CACHE_MODE_WB: + memtype_flags = _PGMT_WB; + break; + default: + memtype_flags = _PGMT_DEFAULT; + break; + } + + do { + old_flags = pg->flags; + new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags; + } while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags); +} +#else +static inline enum page_cache_mode get_page_memtype(struct page *pg) +{ + return -1; +} +static inline void set_page_memtype(struct page *pg, + enum page_cache_mode memtype) +{ +} +#endif + enum { PAT_UC = 0, /* uncached */ PAT_WC = 1, /* Write combining */ @@ -75,6 +145,52 @@ enum { PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */ }; +#define CM(c) (_PAGE_CACHE_MODE_ ## c) + +static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg) +{ + enum page_cache_mode cache; + char *cache_mode; + + switch (pat_val) { + case PAT_UC: cache = CM(UC); cache_mode = "UC "; break; + case PAT_WC: cache = CM(WC); cache_mode = "WC "; break; + case PAT_WT: cache = CM(WT); cache_mode = "WT "; break; + case PAT_WP: cache = CM(WP); cache_mode = "WP "; break; + case PAT_WB: cache = CM(WB); cache_mode = "WB "; break; + case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break; + default: cache = CM(WB); cache_mode = "WB "; break; + } + + memcpy(msg, cache_mode, 4); + + return cache; +} + +#undef CM + +/* + * Update the cache mode to pgprot translation tables according to PAT + * configuration. + * Using lower indices is preferred, so we start with highest index. + */ +void pat_init_cache_modes(void) +{ + int i; + enum page_cache_mode cache; + char pat_msg[33]; + u64 pat; + + rdmsrl(MSR_IA32_CR_PAT, pat); + pat_msg[32] = 0; + for (i = 7; i >= 0; i--) { + cache = pat_get_cache_mode((pat >> (i * 8)) & 7, + pat_msg + 4 * i); + update_cache_mode_entry(i, cache); + } + pr_info("PAT configuration [0-7]: %s\n", pat_msg); +} + #define PAT(x, y) ((u64)PAT_ ## y << ((x)*8)) void pat_init(void) @@ -124,8 +240,7 @@ void pat_init(void) wrmsrl(MSR_IA32_CR_PAT, pat); if (boot_cpu) - printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n", - smp_processor_id(), boot_pat_state, pat); + pat_init_cache_modes(); } #undef PAT @@ -139,20 +254,21 @@ static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */ * The intersection is based on "Effective Memory Type" tables in IA-32 * SDM vol 3a */ -static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) +static unsigned long pat_x_mtrr_type(u64 start, u64 end, + enum page_cache_mode req_type) { /* * Look for MTRR hint to get the effective type in case where PAT * request is for WB. */ - if (req_type == _PAGE_CACHE_WB) { + if (req_type == _PAGE_CACHE_MODE_WB) { u8 mtrr_type; mtrr_type = mtrr_type_lookup(start, end); if (mtrr_type != MTRR_TYPE_WRBACK) - return _PAGE_CACHE_UC_MINUS; + return _PAGE_CACHE_MODE_UC_MINUS; - return _PAGE_CACHE_WB; + return _PAGE_CACHE_MODE_WB; } return req_type; @@ -207,25 +323,26 @@ static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) * - Find the memtype of all the pages in the range, look for any conflicts * - In case of no conflicts, set the new memtype for pages in the range */ -static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, - unsigned long *new_type) +static int reserve_ram_pages_type(u64 start, u64 end, + enum page_cache_mode req_type, + enum page_cache_mode *new_type) { struct page *page; u64 pfn; - if (req_type == _PAGE_CACHE_UC) { + if (req_type == _PAGE_CACHE_MODE_UC) { /* We do not support strong UC */ WARN_ON_ONCE(1); - req_type = _PAGE_CACHE_UC_MINUS; + req_type = _PAGE_CACHE_MODE_UC_MINUS; } for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { - unsigned long type; + enum page_cache_mode type; page = pfn_to_page(pfn); type = get_page_memtype(page); if (type != -1) { - printk(KERN_INFO "reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%lx, req 0x%lx\n", + pr_info("reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n", start, end - 1, type, req_type); if (new_type) *new_type = type; @@ -258,21 +375,21 @@ static int free_ram_pages_type(u64 start, u64 end) /* * req_type typically has one of the: - * - _PAGE_CACHE_WB - * - _PAGE_CACHE_WC - * - _PAGE_CACHE_UC_MINUS - * - _PAGE_CACHE_UC + * - _PAGE_CACHE_MODE_WB + * - _PAGE_CACHE_MODE_WC + * - _PAGE_CACHE_MODE_UC_MINUS + * - _PAGE_CACHE_MODE_UC * * If new_type is NULL, function will return an error if it cannot reserve the * region with req_type. If new_type is non-NULL, function will return * available type in new_type in case of no error. In case of any error * it will return a negative return value. */ -int reserve_memtype(u64 start, u64 end, unsigned long req_type, - unsigned long *new_type) +int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, + enum page_cache_mode *new_type) { struct memtype *new; - unsigned long actual_type; + enum page_cache_mode actual_type; int is_range_ram; int err = 0; @@ -281,10 +398,10 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (!pat_enabled) { /* This is identical to page table setting without PAT */ if (new_type) { - if (req_type == _PAGE_CACHE_WC) - *new_type = _PAGE_CACHE_UC_MINUS; + if (req_type == _PAGE_CACHE_MODE_WC) + *new_type = _PAGE_CACHE_MODE_UC_MINUS; else - *new_type = req_type & _PAGE_CACHE_MASK; + *new_type = req_type; } return 0; } @@ -292,7 +409,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, /* Low ISA region is always mapped WB in page table. No need to track */ if (x86_platform.is_untracked_pat_range(start, end)) { if (new_type) - *new_type = _PAGE_CACHE_WB; + *new_type = _PAGE_CACHE_MODE_WB; return 0; } @@ -302,7 +419,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, * tools and ACPI tools). Use WB request for WB memory and use * UC_MINUS otherwise. */ - actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK); + actual_type = pat_x_mtrr_type(start, end, req_type); if (new_type) *new_type = actual_type; @@ -394,12 +511,12 @@ int free_memtype(u64 start, u64 end) * * Only to be called when PAT is enabled * - * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or - * _PAGE_CACHE_UC + * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS + * or _PAGE_CACHE_MODE_UC */ -static unsigned long lookup_memtype(u64 paddr) +static enum page_cache_mode lookup_memtype(u64 paddr) { - int rettype = _PAGE_CACHE_WB; + enum page_cache_mode rettype = _PAGE_CACHE_MODE_WB; struct memtype *entry; if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE)) @@ -414,7 +531,7 @@ static unsigned long lookup_memtype(u64 paddr) * default state and not reserved, and hence of type WB */ if (rettype == -1) - rettype = _PAGE_CACHE_WB; + rettype = _PAGE_CACHE_MODE_WB; return rettype; } @@ -425,7 +542,7 @@ static unsigned long lookup_memtype(u64 paddr) if (entry != NULL) rettype = entry->type; else - rettype = _PAGE_CACHE_UC_MINUS; + rettype = _PAGE_CACHE_MODE_UC_MINUS; spin_unlock(&memtype_lock); return rettype; @@ -442,11 +559,11 @@ static unsigned long lookup_memtype(u64 paddr) * On failure, returns non-zero */ int io_reserve_memtype(resource_size_t start, resource_size_t end, - unsigned long *type) + enum page_cache_mode *type) { resource_size_t size = end - start; - unsigned long req_type = *type; - unsigned long new_type; + enum page_cache_mode req_type = *type; + enum page_cache_mode new_type; int ret; WARN_ON_ONCE(iomem_map_sanity_check(start, size)); @@ -520,13 +637,13 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot) { - unsigned long flags = _PAGE_CACHE_WB; + enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB; if (!range_is_allowed(pfn, size)) return 0; if (file->f_flags & O_DSYNC) - flags = _PAGE_CACHE_UC_MINUS; + pcm = _PAGE_CACHE_MODE_UC_MINUS; #ifdef CONFIG_X86_32 /* @@ -543,12 +660,12 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, boot_cpu_has(X86_FEATURE_CYRIX_ARR) || boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) && (pfn << PAGE_SHIFT) >= __pa(high_memory)) { - flags = _PAGE_CACHE_UC; + pcm = _PAGE_CACHE_MODE_UC; } #endif *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | - flags); + cachemode2protval(pcm)); return 1; } @@ -556,7 +673,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, * Change the memory type for the physial address range in kernel identity * mapping space if that range is a part of identity map. */ -int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) +int kernel_map_sync_memtype(u64 base, unsigned long size, + enum page_cache_mode pcm) { unsigned long id_sz; @@ -574,11 +692,11 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) __pa(high_memory) - base : size; - if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) { + if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) { printk(KERN_INFO "%s:%d ioremap_change_attr failed %s " "for [mem %#010Lx-%#010Lx]\n", current->comm, current->pid, - cattr_name(flags), + cattr_name(pcm), base, (unsigned long long)(base + size-1)); return -EINVAL; } @@ -595,8 +713,8 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, { int is_ram = 0; int ret; - unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK); - unsigned long flags = want_flags; + enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot); + enum page_cache_mode pcm = want_pcm; is_ram = pat_pagerange_is_ram(paddr, paddr + size); @@ -609,36 +727,36 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, if (!pat_enabled) return 0; - flags = lookup_memtype(paddr); - if (want_flags != flags) { + pcm = lookup_memtype(paddr); + if (want_pcm != pcm) { printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n", current->comm, current->pid, - cattr_name(want_flags), + cattr_name(want_pcm), (unsigned long long)paddr, (unsigned long long)(paddr + size - 1), - cattr_name(flags)); + cattr_name(pcm)); *vma_prot = __pgprot((pgprot_val(*vma_prot) & - (~_PAGE_CACHE_MASK)) | - flags); + (~_PAGE_CACHE_MASK)) | + cachemode2protval(pcm)); } return 0; } - ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); + ret = reserve_memtype(paddr, paddr + size, want_pcm, &pcm); if (ret) return ret; - if (flags != want_flags) { + if (pcm != want_pcm) { if (strict_prot || - !is_new_memtype_allowed(paddr, size, want_flags, flags)) { + !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) { free_memtype(paddr, paddr + size); printk(KERN_ERR "%s:%d map pfn expected mapping type %s" " for [mem %#010Lx-%#010Lx], got %s\n", current->comm, current->pid, - cattr_name(want_flags), + cattr_name(want_pcm), (unsigned long long)paddr, (unsigned long long)(paddr + size - 1), - cattr_name(flags)); + cattr_name(pcm)); return -EINVAL; } /* @@ -647,10 +765,10 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, */ *vma_prot = __pgprot((pgprot_val(*vma_prot) & (~_PAGE_CACHE_MASK)) | - flags); + cachemode2protval(pcm)); } - if (kernel_map_sync_memtype(paddr, size, flags) < 0) { + if (kernel_map_sync_memtype(paddr, size, pcm) < 0) { free_memtype(paddr, paddr + size); return -EINVAL; } @@ -709,7 +827,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, unsigned long size) { resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; - unsigned long flags; + enum page_cache_mode pcm; /* reserve the whole chunk starting from paddr */ if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) { @@ -728,18 +846,18 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, * For anything smaller than the vma size we set prot based on the * lookup. */ - flags = lookup_memtype(paddr); + pcm = lookup_memtype(paddr); /* Check memtype for the remaining pages */ while (size > PAGE_SIZE) { size -= PAGE_SIZE; paddr += PAGE_SIZE; - if (flags != lookup_memtype(paddr)) + if (pcm != lookup_memtype(paddr)) return -EINVAL; } *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | - flags); + cachemode2protval(pcm)); return 0; } @@ -747,15 +865,15 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn) { - unsigned long flags; + enum page_cache_mode pcm; if (!pat_enabled) return 0; /* Set prot based on lookup */ - flags = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT); + pcm = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT); *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | - flags); + cachemode2protval(pcm)); return 0; } @@ -791,7 +909,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, pgprot_t pgprot_writecombine(pgprot_t prot) { if (pat_enabled) - return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC); + return __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_WC)); else return pgprot_noncached(prot); } @@ -824,7 +943,7 @@ static void *memtype_seq_start(struct seq_file *seq, loff_t *pos) { if (*pos == 0) { ++*pos; - seq_printf(seq, "PAT memtype list:\n"); + seq_puts(seq, "PAT memtype list:\n"); } return memtype_get_idx(*pos); diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h index 77e5ba153fa..f6411620305 100644 --- a/arch/x86/mm/pat_internal.h +++ b/arch/x86/mm/pat_internal.h @@ -10,30 +10,32 @@ struct memtype { u64 start; u64 end; u64 subtree_max_end; - unsigned long type; + enum page_cache_mode type; struct rb_node rb; }; -static inline char *cattr_name(unsigned long flags) +static inline char *cattr_name(enum page_cache_mode pcm) { - switch (flags & _PAGE_CACHE_MASK) { - case _PAGE_CACHE_UC: return "uncached"; - case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; - case _PAGE_CACHE_WB: return "write-back"; - case _PAGE_CACHE_WC: return "write-combining"; - default: return "broken"; + switch (pcm) { + case _PAGE_CACHE_MODE_UC: return "uncached"; + case _PAGE_CACHE_MODE_UC_MINUS: return "uncached-minus"; + case _PAGE_CACHE_MODE_WB: return "write-back"; + case _PAGE_CACHE_MODE_WC: return "write-combining"; + case _PAGE_CACHE_MODE_WT: return "write-through"; + case _PAGE_CACHE_MODE_WP: return "write-protected"; + default: return "broken"; } } #ifdef CONFIG_X86_PAT extern int rbt_memtype_check_insert(struct memtype *new, - unsigned long *new_type); + enum page_cache_mode *new_type); extern struct memtype *rbt_memtype_erase(u64 start, u64 end); extern struct memtype *rbt_memtype_lookup(u64 addr); extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos); #else static inline int rbt_memtype_check_insert(struct memtype *new, - unsigned long *new_type) + enum page_cache_mode *new_type) { return 0; } static inline struct memtype *rbt_memtype_erase(u64 start, u64 end) { return NULL; } diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 415f6c4ced3..6582adcc8bd 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -122,11 +122,12 @@ static struct memtype *memtype_rb_exact_match(struct rb_root *root, static int memtype_rb_check_conflict(struct rb_root *root, u64 start, u64 end, - unsigned long reqtype, unsigned long *newtype) + enum page_cache_mode reqtype, + enum page_cache_mode *newtype) { struct rb_node *node; struct memtype *match; - int found_type = reqtype; + enum page_cache_mode found_type = reqtype; match = memtype_rb_lowest_match(&memtype_rbroot, start, end); if (match == NULL) @@ -187,7 +188,8 @@ static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata) rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb); } -int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type) +int rbt_memtype_check_insert(struct memtype *new, + enum page_cache_mode *ret_type) { int err = 0; diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 37c1435889c..9b18ef315a5 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -433,14 +433,14 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, return -EINVAL; if (pat_enabled && write_combine) - prot |= _PAGE_CACHE_WC; + prot |= cachemode2protval(_PAGE_CACHE_MODE_WC); else if (pat_enabled || boot_cpu_data.x86 > 3) /* * ioremap() and ioremap_nocache() defaults to UC MINUS for now. * To avoid attribute conflicts, request UC MINUS here * as well. */ - prot |= _PAGE_CACHE_UC_MINUS; + prot |= cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS); vma->vm_page_prot = __pgprot(prot); diff --git a/arch/x86/pci/numachip.c b/arch/x86/pci/numachip.c index 7307d9d12d1..2e565e65c89 100644 --- a/arch/x86/pci/numachip.c +++ b/arch/x86/pci/numachip.c @@ -103,7 +103,7 @@ static int pci_mmcfg_write_numachip(unsigned int seg, unsigned int bus, return 0; } -const struct pci_raw_ops pci_mmcfg_numachip = { +static const struct pci_raw_ops pci_mmcfg_numachip = { .read = pci_mmcfg_read_numachip, .write = pci_mmcfg_write_numachip, }; diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 093f5f4272d..1819a91bbb9 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -229,7 +229,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return 1; list_for_each_entry(msidesc, &dev->msi_list, list) { - __read_msi_msg(msidesc, &msg); + __pci_read_msi_msg(msidesc, &msg); pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) | ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff); if (msg.data != XEN_PIRQ_MSI_DATA || @@ -240,7 +240,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) goto error; } xen_msi_compose_msg(dev, pirq, &msg); - __write_msi_msg(msidesc, &msg); + __pci_write_msi_msg(msidesc, &msg); dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq); } else { dev_dbg(&dev->dev, @@ -394,14 +394,7 @@ static void xen_teardown_msi_irq(unsigned int irq) { xen_destroy_irq(irq); } -static u32 xen_nop_msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) -{ - return 0; -} -static u32 xen_nop_msix_mask_irq(struct msi_desc *desc, u32 flag) -{ - return 0; -} + #endif int __init pci_xen_init(void) @@ -425,8 +418,7 @@ int __init pci_xen_init(void) x86_msi.setup_msi_irqs = xen_setup_msi_irqs; x86_msi.teardown_msi_irq = xen_teardown_msi_irq; x86_msi.teardown_msi_irqs = xen_teardown_msi_irqs; - x86_msi.msi_mask_irq = xen_nop_msi_mask_irq; - x86_msi.msix_mask_irq = xen_nop_msix_mask_irq; + pci_msi_ignore_mask = 1; #endif return 0; } @@ -506,8 +498,7 @@ int __init pci_xen_initial_domain(void) x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs; x86_msi.teardown_msi_irq = xen_teardown_msi_irq; x86_msi.restore_msi_irqs = xen_initdom_restore_msi_irqs; - x86_msi.msi_mask_irq = xen_nop_msi_mask_irq; - x86_msi.msix_mask_irq = xen_nop_msix_mask_irq; + pci_msi_ignore_mask = 1; #endif xen_setup_acpi_sci(); __acpi_register_gsi = acpi_register_gsi_xen; diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c index f15103dff4b..d143d216d52 100644 --- a/arch/x86/platform/efi/efi-bgrt.c +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -40,20 +40,40 @@ void __init efi_bgrt_init(void) if (ACPI_FAILURE(status)) return; - if (bgrt_tab->header.length < sizeof(*bgrt_tab)) + if (bgrt_tab->header.length < sizeof(*bgrt_tab)) { + pr_err("Ignoring BGRT: invalid length %u (expected %zu)\n", + bgrt_tab->header.length, sizeof(*bgrt_tab)); return; - if (bgrt_tab->version != 1 || bgrt_tab->status != 1) + } + if (bgrt_tab->version != 1) { + pr_err("Ignoring BGRT: invalid version %u (expected 1)\n", + bgrt_tab->version); + return; + } + if (bgrt_tab->status != 1) { + pr_err("Ignoring BGRT: invalid status %u (expected 1)\n", + bgrt_tab->status); + return; + } + if (bgrt_tab->image_type != 0) { + pr_err("Ignoring BGRT: invalid image type %u (expected 0)\n", + bgrt_tab->image_type); return; - if (bgrt_tab->image_type != 0 || !bgrt_tab->image_address) + } + if (!bgrt_tab->image_address) { + pr_err("Ignoring BGRT: null image address\n"); return; + } image = efi_lookup_mapped_addr(bgrt_tab->image_address); if (!image) { image = early_memremap(bgrt_tab->image_address, sizeof(bmp_header)); ioremapped = true; - if (!image) + if (!image) { + pr_err("Ignoring BGRT: failed to map image header memory\n"); return; + } } memcpy_fromio(&bmp_header, image, sizeof(bmp_header)); @@ -61,14 +81,18 @@ void __init efi_bgrt_init(void) early_iounmap(image, sizeof(bmp_header)); bgrt_image_size = bmp_header.size; - bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL); - if (!bgrt_image) + bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL | __GFP_NOWARN); + if (!bgrt_image) { + pr_err("Ignoring BGRT: failed to allocate memory for image (wanted %zu bytes)\n", + bgrt_image_size); return; + } if (ioremapped) { image = early_memremap(bgrt_tab->image_address, bmp_header.size); if (!image) { + pr_err("Ignoring BGRT: failed to map image memory\n"); kfree(bgrt_image); bgrt_image = NULL; return; diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 850da94fef3..dbc8627a5cd 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -70,17 +70,7 @@ static efi_config_table_type_t arch_tables[] __initdata = { u64 efi_setup; /* efi setup_data physical address */ -static bool disable_runtime __initdata = false; -static int __init setup_noefi(char *arg) -{ - disable_runtime = true; - return 0; -} -early_param("noefi", setup_noefi); - -int add_efi_memmap; -EXPORT_SYMBOL(add_efi_memmap); - +static int add_efi_memmap __initdata; static int __init setup_add_efi_memmap(char *arg) { add_efi_memmap = 1; @@ -96,7 +86,7 @@ static efi_status_t __init phys_efi_set_virtual_address_map( { efi_status_t status; - efi_call_phys_prelog(); + efi_call_phys_prolog(); status = efi_call_phys(efi_phys.set_virtual_address_map, memory_map_size, descriptor_size, descriptor_version, virtual_map); @@ -210,9 +200,12 @@ static void __init print_efi_memmap(void) for (p = memmap.map, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) { + char buf[64]; + md = p; - pr_info("mem%02u: type=%u, attr=0x%llx, range=[0x%016llx-0x%016llx) (%lluMB)\n", - i, md->type, md->attribute, md->phys_addr, + pr_info("mem%02u: %s range=[0x%016llx-0x%016llx) (%lluMB)\n", + i, efi_md_typeattr_format(buf, sizeof(buf), md), + md->phys_addr, md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), (md->num_pages >> (20 - EFI_PAGE_SHIFT))); } @@ -344,9 +337,9 @@ static int __init efi_runtime_init32(void) } /* - * We will only need *early* access to the following two - * EFI runtime services before set_virtual_address_map - * is invoked. + * We will only need *early* access to the SetVirtualAddressMap + * EFI runtime service. All other runtime services will be called + * via the virtual mapping. */ efi_phys.set_virtual_address_map = (efi_set_virtual_address_map_t *) @@ -368,9 +361,9 @@ static int __init efi_runtime_init64(void) } /* - * We will only need *early* access to the following two - * EFI runtime services before set_virtual_address_map - * is invoked. + * We will only need *early* access to the SetVirtualAddressMap + * EFI runtime service. All other runtime services will be called + * via the virtual mapping. */ efi_phys.set_virtual_address_map = (efi_set_virtual_address_map_t *) @@ -492,7 +485,7 @@ void __init efi_init(void) if (!efi_runtime_supported()) pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n"); else { - if (disable_runtime || efi_runtime_init()) + if (efi_runtime_disabled() || efi_runtime_init()) return; } if (efi_memmap_init()) @@ -537,7 +530,7 @@ void __init runtime_code_page_mkexec(void) } } -void efi_memory_uc(u64 addr, unsigned long size) +void __init efi_memory_uc(u64 addr, unsigned long size) { unsigned long page_shift = 1UL << EFI_PAGE_SHIFT; u64 npages; @@ -732,6 +725,7 @@ static void __init kexec_enter_virtual_mode(void) */ if (!efi_is_native()) { efi_unmap_memmap(); + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return; } @@ -805,6 +799,7 @@ static void __init __efi_enter_virtual_mode(void) new_memmap = efi_map_regions(&count, &pg_shift); if (!new_memmap) { pr_err("Error reallocating memory, EFI runtime non-functional!\n"); + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return; } @@ -812,8 +807,10 @@ static void __init __efi_enter_virtual_mode(void) BUG_ON(!efi.systab); - if (efi_setup_page_tables(__pa(new_memmap), 1 << pg_shift)) + if (efi_setup_page_tables(__pa(new_memmap), 1 << pg_shift)) { + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return; + } efi_sync_low_kernel_mappings(); efi_dump_pagetable(); @@ -938,14 +935,11 @@ u64 efi_mem_attributes(unsigned long phys_addr) return 0; } -static int __init parse_efi_cmdline(char *str) +static int __init arch_parse_efi_cmdline(char *str) { - if (*str == '=') - str++; - - if (!strncmp(str, "old_map", 7)) + if (parse_option_str(str, "old_map")) set_bit(EFI_OLD_MEMMAP, &efi.flags); return 0; } -early_param("efi", parse_efi_cmdline); +early_param("efi", arch_parse_efi_cmdline); diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index 9ee3491e31f..40e7cda5293 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -33,7 +33,7 @@ /* * To make EFI call EFI runtime service in physical addressing mode we need - * prelog/epilog before/after the invocation to disable interrupt, to + * prolog/epilog before/after the invocation to disable interrupt, to * claim EFI runtime service handler exclusively and to duplicate a memory in * low memory space say 0 - 3G. */ @@ -41,11 +41,13 @@ static unsigned long efi_rt_eflags; void efi_sync_low_kernel_mappings(void) {} void __init efi_dump_pagetable(void) {} -int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) +int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) { return 0; } -void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) {} +void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) +{ +} void __init efi_map_region(efi_memory_desc_t *md) { @@ -55,7 +57,7 @@ void __init efi_map_region(efi_memory_desc_t *md) void __init efi_map_region_fixed(efi_memory_desc_t *md) {} void __init parse_efi_setup(u64 phys_addr, u32 data_len) {} -void efi_call_phys_prelog(void) +void __init efi_call_phys_prolog(void) { struct desc_ptr gdt_descr; @@ -69,7 +71,7 @@ void efi_call_phys_prelog(void) load_gdt(&gdt_descr); } -void efi_call_phys_epilog(void) +void __init efi_call_phys_epilog(void) { struct desc_ptr gdt_descr; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 290d397e1dd..17e80d829df 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -48,8 +48,7 @@ static unsigned long efi_flags __initdata; * We allocate runtime services regions bottom-up, starting from -4G, i.e. * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G. */ -static u64 efi_va = -4 * (1UL << 30); -#define EFI_VA_END (-68 * (1UL << 30)) +static u64 efi_va = EFI_VA_START; /* * Scratch space used for switching the pagetable in the EFI stub @@ -79,7 +78,7 @@ static void __init early_code_mapping_set_exec(int executable) } } -void __init efi_call_phys_prelog(void) +void __init efi_call_phys_prolog(void) { unsigned long vaddress; int pgd; @@ -139,7 +138,7 @@ void efi_sync_low_kernel_mappings(void) sizeof(pgd_t) * num_pgds); } -int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) +int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) { unsigned long text; struct page *page; @@ -192,7 +191,7 @@ int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) return 0; } -void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) +void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) { pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd); diff --git a/arch/x86/platform/efi/efi_stub_32.S b/arch/x86/platform/efi/efi_stub_32.S index fbe66e626c0..040192b50d0 100644 --- a/arch/x86/platform/efi/efi_stub_32.S +++ b/arch/x86/platform/efi/efi_stub_32.S @@ -27,13 +27,13 @@ ENTRY(efi_call_phys) * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found * the values of these registers are the same. And, the corresponding * GDT entries are identical. So I will do nothing about segment reg - * and GDT, but change GDT base register in prelog and epilog. + * and GDT, but change GDT base register in prolog and epilog. */ /* * 1. Now I am running with EIP = <physical address> + PAGE_OFFSET. * But to make it smoothly switch from virtual mode to flat mode. - * The mapping of lower virtual memory has been created in prelog and + * The mapping of lower virtual memory has been created in prolog and * epilog. */ movl $1f, %edx diff --git a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h index 46aa25c8ce0..3c1c3866d82 100644 --- a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h +++ b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h @@ -10,10 +10,9 @@ */ -/* __attribute__((weak)) makes these declarations overridable */ /* For every CPU addition a new get_<cpuname>_ops interface needs * to be added. */ -extern void *get_penwell_ops(void) __attribute__((weak)); -extern void *get_cloverview_ops(void) __attribute__((weak)); -extern void *get_tangier_ops(void) __attribute__((weak)); +extern void *get_penwell_ops(void); +extern void *get_cloverview_ops(void); +extern void *get_tangier_ops(void); diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index 3c53a90fdb1..c14ad34776c 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -106,6 +106,7 @@ int __init sfi_parse_mtmr(struct sfi_table_header *table) mp_irq.dstapic = MP_APIC_ALL; mp_irq.dstirq = pentry->irq; mp_save_irq(&mp_irq); + mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC); } return 0; @@ -176,6 +177,7 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) mp_irq.dstapic = MP_APIC_ALL; mp_irq.dstirq = pentry->irq; mp_save_irq(&mp_irq); + mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC); } return 0; } diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 3968d67d366..994798548b1 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1367,23 +1367,25 @@ static int ptc_seq_show(struct seq_file *file, void *data) cpu = *(loff_t *)data; if (!cpu) { - seq_printf(file, - "# cpu bauoff sent stime self locals remotes ncpus localhub "); - seq_printf(file, - "remotehub numuvhubs numuvhubs16 numuvhubs8 "); - seq_printf(file, - "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries "); - seq_printf(file, - "rok resetp resett giveup sto bz throt disable "); - seq_printf(file, - "enable wars warshw warwaits enters ipidis plugged "); - seq_printf(file, - "ipiover glim cong swack recv rtime all one mult "); - seq_printf(file, - "none retry canc nocan reset rcan\n"); + seq_puts(file, + "# cpu bauoff sent stime self locals remotes ncpus localhub "); + seq_puts(file, "remotehub numuvhubs numuvhubs16 numuvhubs8 "); + seq_puts(file, + "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries "); + seq_puts(file, + "rok resetp resett giveup sto bz throt disable "); + seq_puts(file, + "enable wars warshw warwaits enters ipidis plugged "); + seq_puts(file, + "ipiover glim cong swack recv rtime all one mult "); + seq_puts(file, "none retry canc nocan reset rcan\n"); } if (cpu < num_possible_cpus() && cpu_online(cpu)) { bcp = &per_cpu(bau_control, cpu); + if (bcp->nobau) { + seq_printf(file, "cpu %d bau disabled\n", cpu); + return 0; + } stat = bcp->statp; /* source side statistics */ seq_printf(file, diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index f52e033557c..2c835e35634 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -24,6 +24,7 @@ quiet_cmd_bin2c = BIN2C $@ $(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE $(call if_changed,bin2c) + @: obj-$(CONFIG_KEXEC_FILE) += kexec-purgatory.o diff --git a/arch/x86/tools/calc_run_size.pl b/arch/x86/tools/calc_run_size.pl new file mode 100644 index 00000000000..23210baade2 --- /dev/null +++ b/arch/x86/tools/calc_run_size.pl @@ -0,0 +1,39 @@ +#!/usr/bin/perl +# +# Calculate the amount of space needed to run the kernel, including room for +# the .bss and .brk sections. +# +# Usage: +# objdump -h a.out | perl calc_run_size.pl +use strict; + +my $mem_size = 0; +my $file_offset = 0; + +my $sections=" *[0-9]+ \.(?:bss|brk) +"; +while (<>) { + if (/^$sections([0-9a-f]+) +(?:[0-9a-f]+ +){2}([0-9a-f]+)/) { + my $size = hex($1); + my $offset = hex($2); + $mem_size += $size; + if ($file_offset == 0) { + $file_offset = $offset; + } elsif ($file_offset != $offset) { + # BFD linker shows the same file offset in ELF. + # Gold linker shows them as consecutive. + next if ($file_offset + $mem_size == $offset + $size); + + printf STDERR "file_offset: 0x%lx\n", $file_offset; + printf STDERR "mem_size: 0x%lx\n", $mem_size; + printf STDERR "offset: 0x%lx\n", $offset; + printf STDERR "size: 0x%lx\n", $size; + + die ".bss and .brk are non-contiguous\n"; + } + } +} + +if ($file_offset == 0) { + die "Never found .bss or .brk file offset\n"; +} +printf("%d\n", $mem_size + $file_offset); diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c index 872eb60e780..ba70ff23291 100644 --- a/arch/x86/tools/insn_sanity.c +++ b/arch/x86/tools/insn_sanity.c @@ -254,7 +254,7 @@ int main(int argc, char **argv) continue; /* Decode an instruction */ - insn_init(&insn, insn_buf, x86_64); + insn_init(&insn, insn_buf, sizeof(insn_buf), x86_64); insn_get_length(&insn); if (insn.next_byte <= insn.kaddr || diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index a5efb21d522..0c2fae8d929 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -20,7 +20,10 @@ struct relocs { static struct relocs relocs16; static struct relocs relocs32; +#if ELF_BITS == 64 +static struct relocs relocs32neg; static struct relocs relocs64; +#endif struct section { Elf_Shdr shdr; @@ -762,11 +765,16 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, switch (r_type) { case R_X86_64_NONE: + /* NONE can be ignored. */ + break; + case R_X86_64_PC32: /* - * NONE can be ignored and PC relative relocations don't - * need to be adjusted. + * PC relative relocations don't need to be adjusted unless + * referencing a percpu symbol. */ + if (is_percpu_sym(sym, symname)) + add_reloc(&relocs32neg, offset); break; case R_X86_64_32: @@ -986,7 +994,10 @@ static void emit_relocs(int as_text, int use_real_mode) /* Order the relocations for more efficient processing */ sort_relocs(&relocs16); sort_relocs(&relocs32); +#if ELF_BITS == 64 + sort_relocs(&relocs32neg); sort_relocs(&relocs64); +#endif /* Print the relocations */ if (as_text) { @@ -1007,14 +1018,21 @@ static void emit_relocs(int as_text, int use_real_mode) for (i = 0; i < relocs32.count; i++) write_reloc(relocs32.offset[i], stdout); } else { - if (ELF_BITS == 64) { - /* Print a stop */ - write_reloc(0, stdout); +#if ELF_BITS == 64 + /* Print a stop */ + write_reloc(0, stdout); - /* Now print each relocation */ - for (i = 0; i < relocs64.count; i++) - write_reloc(relocs64.offset[i], stdout); - } + /* Now print each relocation */ + for (i = 0; i < relocs64.count; i++) + write_reloc(relocs64.offset[i], stdout); + + /* Print a stop */ + write_reloc(0, stdout); + + /* Now print each inverse 32-bit relocation */ + for (i = 0; i < relocs32neg.count; i++) + write_reloc(relocs32neg.offset[i], stdout); +#endif /* Print a stop */ write_reloc(0, stdout); diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c index 13403fc95a9..56f04db0c9c 100644 --- a/arch/x86/tools/test_get_len.c +++ b/arch/x86/tools/test_get_len.c @@ -149,7 +149,7 @@ int main(int argc, char **argv) break; } /* Decode an instruction */ - insn_init(&insn, insn_buf, x86_64); + insn_init(&insn, insn_buf, sizeof(insn_buf), x86_64); insn_get_length(&insn); if (insn.length != nb) { warnings++; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 1a3f0445432..6bf3a13e3e0 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1100,12 +1100,6 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) /* Fast syscall setup is all done in hypercalls, so these are all ignored. Stub them out here to stop Xen console noise. */ - break; - - case MSR_IA32_CR_PAT: - if (smp_processor_id() == 0) - xen_set_pat(((u64)high << 32) | low); - break; default: ret = native_write_msr_safe(msr, low, high); @@ -1561,10 +1555,6 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Prevent unwanted bits from being set in PTEs. */ __supported_pte_mask &= ~_PAGE_GLOBAL; -#if 0 - if (!xen_initial_domain()) -#endif - __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); /* * Prevent page tables from being allocated in highmem, even @@ -1618,14 +1608,6 @@ asmlinkage __visible void __init xen_start_kernel(void) */ acpi_numa = -1; #endif -#ifdef CONFIG_X86_PAT - /* - * For right now disable the PAT. We should remove this once - * git commit 8eaffa67b43e99ae581622c5133e20b0f48bcef1 - * (xen/pat: Disable PAT support for now) is reverted. - */ - pat_enabled = 0; -#endif /* Don't do the full vcpu_info placement stuff until we have a possible map and a non-dummy shared_info. */ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; @@ -1636,8 +1618,12 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_raw_console_write("mapping kernel into physical memory\n"); xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages); - /* Allocate and initialize top and mid mfn levels for p2m structure */ - xen_build_mfn_list_list(); + /* + * Modify the cache mode translation tables to match Xen's PAT + * configuration. + */ + + pat_init_cache_modes(); /* keep using Xen gdt for now; no urgent need to change it */ diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index f62af7647ec..9855eb8ee4b 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -410,13 +410,7 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) __visible pteval_t xen_pte_val(pte_t pte) { pteval_t pteval = pte.pte; -#if 0 - /* If this is a WC pte, convert back from Xen WC to Linux WC */ - if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) { - WARN_ON(!pat_enabled); - pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT; - } -#endif + return pte_mfn_to_pfn(pteval); } PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); @@ -427,47 +421,8 @@ __visible pgdval_t xen_pgd_val(pgd_t pgd) } PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); -/* - * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7 - * are reserved for now, to correspond to the Intel-reserved PAT - * types. - * - * We expect Linux's PAT set as follows: - * - * Idx PTE flags Linux Xen Default - * 0 WB WB WB - * 1 PWT WC WT WT - * 2 PCD UC- UC- UC- - * 3 PCD PWT UC UC UC - * 4 PAT WB WC WB - * 5 PAT PWT WC WP WT - * 6 PAT PCD UC- rsv UC- - * 7 PAT PCD PWT UC rsv UC - */ - -void xen_set_pat(u64 pat) -{ - /* We expect Linux to use a PAT setting of - * UC UC- WC WB (ignoring the PAT flag) */ - WARN_ON(pat != 0x0007010600070106ull); -} - __visible pte_t xen_make_pte(pteval_t pte) { -#if 0 - /* If Linux is trying to set a WC pte, then map to the Xen WC. - * If _PAGE_PAT is set, then it probably means it is really - * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope - * things work out OK... - * - * (We should never see kernel mappings with _PAGE_PSE set, - * but we could see hugetlbfs mappings, I think.). - */ - if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) { - if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT) - pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT; - } -#endif pte = pte_pfn_to_mfn(pte); return native_make_pte(pte); @@ -1217,10 +1172,13 @@ static void __init xen_pagetable_p2m_copy(void) static void __init xen_pagetable_init(void) { paging_init(); - xen_setup_shared_info(); #ifdef CONFIG_X86_64 xen_pagetable_p2m_copy(); #endif + /* Allocate and initialize top and mid mfn levels for p2m structure */ + xen_build_mfn_list_list(); + + xen_setup_shared_info(); xen_post_allocator_init(); } static void xen_write_cr2(unsigned long cr2) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 9f5983b01ed..b456b048eca 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -163,6 +163,7 @@ #include <linux/hash.h> #include <linux/sched.h> #include <linux/seq_file.h> +#include <linux/bootmem.h> #include <asm/cache.h> #include <asm/setup.h> @@ -181,21 +182,20 @@ static void __init m2p_override_init(void); unsigned long xen_max_p2m_pfn __read_mostly; +static unsigned long *p2m_mid_missing_mfn; +static unsigned long *p2m_top_mfn; +static unsigned long **p2m_top_mfn_p; + /* Placeholders for holes in the address space */ static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_identity, P2M_MID_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_identity_mfn, P2M_MID_PER_PAGE); RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); -RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); /* For each I/O range remapped we may lose up to two leaf pages for the boundary * violations and three mid pages to cover up to 3GB. With @@ -272,11 +272,11 @@ static void p2m_init(unsigned long *p2m) * Build the parallel p2m_top_mfn and p2m_mid_mfn structures * * This is called both at boot time, and after resuming from suspend: - * - At boot time we're called very early, and must use extend_brk() + * - At boot time we're called rather early, and must use alloc_bootmem*() * to allocate memory. * * - After resume we're called from within stop_machine, but the mfn - * tree should alreay be completely allocated. + * tree should already be completely allocated. */ void __ref xen_build_mfn_list_list(void) { @@ -287,20 +287,17 @@ void __ref xen_build_mfn_list_list(void) /* Pre-initialize p2m_top_mfn to be completely missing */ if (p2m_top_mfn == NULL) { - p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_missing_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); - p2m_mid_identity_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity); - p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); p2m_top_mfn_p_init(p2m_top_mfn_p); - p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); p2m_top_mfn_init(p2m_top_mfn); } else { /* Reinitialise, mfn's all change after migration */ p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); - p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity); } for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { @@ -328,10 +325,9 @@ void __ref xen_build_mfn_list_list(void) /* * XXX boot-time only! We should never find * missing parts of the mfn tree after - * runtime. extend_brk() will BUG if we call - * it too late. + * runtime. */ - mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + mid_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); p2m_mid_mfn_init(mid_mfn_p, p2m_missing); p2m_top_mfn_p[topidx] = mid_mfn_p; @@ -415,7 +411,6 @@ void __init xen_build_dynamic_phys_to_machine(void) m2p_override_init(); } #ifdef CONFIG_X86_64 -#include <linux/bootmem.h> unsigned long __init xen_revector_p2m_tree(void) { unsigned long va_start; @@ -477,7 +472,6 @@ unsigned long __init xen_revector_p2m_tree(void) copy_page(new, mid_p); p2m_top[topidx][mididx] = &mfn_list[pfn_free]; - p2m_top_mfn_p[topidx][mididx] = virt_to_mfn(&mfn_list[pfn_free]); pfn_free += P2M_PER_PAGE; @@ -538,12 +532,13 @@ static bool alloc_p2m(unsigned long pfn) unsigned topidx, mididx; unsigned long ***top_p, **mid; unsigned long *top_mfn_p, *mid_mfn; + unsigned long *p2m_orig; topidx = p2m_top_index(pfn); mididx = p2m_mid_index(pfn); top_p = &p2m_top[topidx]; - mid = *top_p; + mid = ACCESS_ONCE(*top_p); if (mid == p2m_mid_missing) { /* Mid level is missing, allocate a new one */ @@ -558,7 +553,7 @@ static bool alloc_p2m(unsigned long pfn) } top_mfn_p = &p2m_top_mfn[topidx]; - mid_mfn = p2m_top_mfn_p[topidx]; + mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]); BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); @@ -566,6 +561,7 @@ static bool alloc_p2m(unsigned long pfn) /* Separately check the mid mfn level */ unsigned long missing_mfn; unsigned long mid_mfn_mfn; + unsigned long old_mfn; mid_mfn = alloc_p2m_page(); if (!mid_mfn) @@ -575,17 +571,19 @@ static bool alloc_p2m(unsigned long pfn) missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); mid_mfn_mfn = virt_to_mfn(mid_mfn); - if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) + old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn); + if (old_mfn != missing_mfn) { free_p2m_page(mid_mfn); - else + mid_mfn = mfn_to_virt(old_mfn); + } else { p2m_top_mfn_p[topidx] = mid_mfn; + } } - if (p2m_top[topidx][mididx] == p2m_identity || - p2m_top[topidx][mididx] == p2m_missing) { + p2m_orig = ACCESS_ONCE(p2m_top[topidx][mididx]); + if (p2m_orig == p2m_identity || p2m_orig == p2m_missing) { /* p2m leaf page is missing */ unsigned long *p2m; - unsigned long *p2m_orig = p2m_top[topidx][mididx]; p2m = alloc_p2m_page(); if (!p2m) @@ -606,7 +604,6 @@ static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary) { unsigned topidx, mididx, idx; unsigned long *p2m; - unsigned long *mid_mfn_p; topidx = p2m_top_index(pfn); mididx = p2m_mid_index(pfn); @@ -633,43 +630,21 @@ static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary) p2m_top[topidx][mididx] = p2m; - /* For save/restore we need to MFN of the P2M saved */ - - mid_mfn_p = p2m_top_mfn_p[topidx]; - WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), - "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n", - topidx, mididx); - mid_mfn_p[mididx] = virt_to_mfn(p2m); - return true; } static bool __init early_alloc_p2m_middle(unsigned long pfn) { unsigned topidx = p2m_top_index(pfn); - unsigned long *mid_mfn_p; unsigned long **mid; mid = p2m_top[topidx]; - mid_mfn_p = p2m_top_mfn_p[topidx]; if (mid == p2m_mid_missing) { mid = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_mid_init(mid, p2m_missing); p2m_top[topidx] = mid; - - BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); - } - /* And the save/restore P2M tables.. */ - if (mid_mfn_p == p2m_mid_missing_mfn) { - mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(mid_mfn_p, p2m_missing); - - p2m_top_mfn_p[topidx] = mid_mfn_p; - p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); - /* Note: we don't set mid_mfn_p[midix] here, - * look in early_alloc_p2m() */ } return true; } @@ -680,14 +655,13 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn) * replace the P2M leaf with a p2m_missing or p2m_identity. * Stick the old page in the new P2M tree location. */ -bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_mfn) +static bool __init early_can_reuse_p2m_middle(unsigned long set_pfn) { unsigned topidx; unsigned mididx; unsigned ident_pfns; unsigned inv_pfns; unsigned long *p2m; - unsigned long *mid_mfn_p; unsigned idx; unsigned long pfn; @@ -733,11 +707,6 @@ bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_ found: /* Found one, replace old with p2m_identity or p2m_missing */ p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing); - /* And the other for save/restore.. */ - mid_mfn_p = p2m_top_mfn_p[topidx]; - /* NOTE: Even if it is a p2m_identity it should still be point to - * a page filled with INVALID_P2M_ENTRY entries. */ - mid_mfn_p[mididx] = virt_to_mfn(p2m_missing); /* Reset where we want to stick the old page in. */ topidx = p2m_top_index(set_pfn); @@ -752,8 +721,6 @@ found: p2m_init(p2m); p2m_top[topidx][mididx] = p2m; - mid_mfn_p = p2m_top_mfn_p[topidx]; - mid_mfn_p[mididx] = virt_to_mfn(p2m); return true; } @@ -763,7 +730,7 @@ bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) if (!early_alloc_p2m_middle(pfn)) return false; - if (early_can_reuse_p2m_middle(pfn, mfn)) + if (early_can_reuse_p2m_middle(pfn)) return __set_phys_to_machine(pfn, mfn); if (!early_alloc_p2m(pfn, false /* boundary crossover OK!*/)) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index af7216128d9..29834b3fd87 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -595,6 +595,7 @@ char * __init xen_memory_setup(void) rc = 0; } BUG_ON(rc); + BUG_ON(memmap.nr_entries == 0); /* * Xen won't allow a 1:1 mapping to be created to UNUSABLE diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 8650cdb5320..4c071aeb841 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -510,6 +510,9 @@ static void xen_cpu_die(unsigned int cpu) current->state = TASK_UNINTERRUPTIBLE; schedule_timeout(HZ/10); } + + cpu_die_common(cpu); + xen_smp_intr_free(cpu); xen_uninit_lock_cpu(cpu); xen_teardown_timer(cpu); diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index a1d430b112b..f473d268d38 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -158,7 +158,7 @@ cycle_t xen_clocksource_read(void) cycle_t ret; preempt_disable_notrace(); - src = this_cpu_ptr(&xen_vcpu->time); + src = &__this_cpu_read(xen_vcpu)->time; ret = pvclock_clocksource_read(src); preempt_enable_notrace(); return ret; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 28c7e0be56e..4ab9298c5e1 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -33,7 +33,6 @@ extern unsigned long xen_max_p2m_pfn; void xen_mm_pin_all(void); void xen_mm_unpin_all(void); -void xen_set_pat(u64); char * __init xen_memory_setup(void); char * xen_auto_xlated_memory_setup(void); |