diff options
Diffstat (limited to 'arch/x86')
250 files changed, 7968 insertions, 4219 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b32ebf92b0c..725e1573ea8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -16,6 +16,7 @@ config X86_64 def_bool y depends on 64BIT select X86_DEV_DMA_OPS + select ARCH_USE_CMPXCHG_LOCKREF ### Arch settings config X86 @@ -81,8 +82,6 @@ config X86 select HAVE_USER_RETURN_NOTIFIER select ARCH_BINFMT_ELF_RANDOMIZE_PIE select HAVE_ARCH_JUMP_LABEL - select HAVE_TEXT_POKE_SMP - select HAVE_GENERIC_HARDIRQS select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select SPARSE_IRQ select GENERIC_FIND_FIRST_BIT @@ -124,6 +123,7 @@ config X86 select COMPAT_OLD_SIGACTION if IA32_EMULATION select RTC_LIB select HAVE_DEBUG_STACKOVERFLOW + select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64 config INSTRUCTION_DECODER def_bool y @@ -482,11 +482,12 @@ config X86_INTEL_LPSS bool "Intel Low Power Subsystem Support" depends on ACPI select COMMON_CLK + select PINCTRL ---help--- Select to build support for Intel Low Power Subsystem such as found on Intel Lynxpoint PCH. Selecting this option enables - things like clock tree (common clock framework) which are needed - by the LPSS peripheral drivers. + things like clock tree (common clock framework) and pincontrol + which are needed by the LPSS peripheral drivers. config X86_RDC321X bool "RDC R-321x SoC" @@ -632,6 +633,7 @@ config PARAVIRT_DEBUG config PARAVIRT_SPINLOCKS bool "Paravirtualization layer for spinlocks" depends on PARAVIRT && SMP + select UNINLINE_SPIN_UNLOCK ---help--- Paravirtualized spinlocks allow a pvops backend to replace the spinlock implementation with something virtualization-friendly @@ -656,6 +658,15 @@ config KVM_GUEST underlying device model, the host provides the guest with timing infrastructure such as time of day, and system time +config KVM_DEBUG_FS + bool "Enable debug information for KVM Guests in debugfs" + depends on KVM_GUEST && DEBUG_FS + default n + ---help--- + This option enables collection of various statistics for KVM guest. + Statistics are displayed in debugfs filesystem. Enabling this option + may incur significant overhead. + source "arch/x86/lguest/Kconfig" config PARAVIRT_TIME_ACCOUNTING @@ -746,20 +757,25 @@ config DMI BIOS code. config GART_IOMMU - bool "GART IOMMU support" if EXPERT - default y + bool "Old AMD GART IOMMU support" select SWIOTLB depends on X86_64 && PCI && AMD_NB ---help--- - Support for full DMA access of devices with 32bit memory access only - on systems with more than 3GB. This is usually needed for USB, - sound, many IDE/SATA chipsets and some other devices. - Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART - based hardware IOMMU and a software bounce buffer based IOMMU used - on Intel systems and as fallback. - The code is only active when needed (enough memory and limited - device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified - too. + Provides a driver for older AMD Athlon64/Opteron/Turion/Sempron + GART based hardware IOMMUs. + + The GART supports full DMA access for devices with 32-bit access + limitations, on systems with more than 3 GB. This is usually needed + for USB, sound, many IDE/SATA chipsets and some other devices. + + Newer systems typically have a modern AMD IOMMU, supported via + the CONFIG_AMD_IOMMU=y config option. + + In normal configurations this driver is only active when needed: + there's more than 3 GB of memory and the system contains a + 32-bit limited device. + + If unsure, say Y. config CALGARY_IOMMU bool "IBM Calgary IOMMU support" @@ -815,14 +831,16 @@ config MAXSMP config NR_CPUS int "Maximum number of CPUs" if SMP && !MAXSMP range 2 8 if SMP && X86_32 && !X86_BIGSMP - range 2 512 if SMP && !MAXSMP + range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK + range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64 default "1" if !SMP - default "4096" if MAXSMP + default "8192" if MAXSMP default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000) default "8" if SMP ---help--- This allows you to specify the maximum number of CPUs which this - kernel will support. The maximum supported value is 512 and the + kernel will support. If CPUMASK_OFFSTACK is enabled, the maximum + supported value is 4096, otherwise the maximum value is 512. The minimum value which makes sense is 2. This is purely to save memory - each supported CPU adds @@ -850,7 +868,7 @@ source "kernel/Kconfig.preempt" config X86_UP_APIC bool "Local APIC support on uniprocessors" - depends on X86_32 && !SMP && !X86_32_NON_STANDARD + depends on X86_32 && !SMP && !X86_32_NON_STANDARD && !PCI_MSI ---help--- A local APIC (Advanced Programmable Interrupt Controller) is an integrated interrupt controller in the CPU. If you have a single-CPU @@ -875,11 +893,11 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y - depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC + depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI config X86_IO_APIC def_bool y - depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC + depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC || PCI_MSI config X86_VISWS_APIC def_bool y @@ -1023,6 +1041,7 @@ config X86_REBOOTFIXUPS config MICROCODE tristate "CPU microcode loading support" + depends on CPU_SUP_AMD || CPU_SUP_INTEL select FW_LOADER ---help--- @@ -1344,8 +1363,12 @@ config ARCH_SELECT_MEMORY_MODEL depends on ARCH_SPARSEMEM_ENABLE config ARCH_MEMORY_PROBE - def_bool y + bool "Enable sysfs memory/probe interface" depends on X86_64 && MEMORY_HOTPLUG + help + This option enables a sysfs memory/probe interface for testing. + See Documentation/memory-hotplug.txt for more information. + If you are unsure how to answer this question, answer N. config ARCH_PROC_KCORE_TEXT def_bool y @@ -1579,7 +1602,7 @@ config EFI_STUB This kernel feature allows a bzImage to be loaded directly by EFI firmware without the use of a bootloader. - See Documentation/x86/efi-stub.txt for more information. + See Documentation/efi-stub.txt for more information. config SECCOMP def_bool y @@ -1627,9 +1650,9 @@ config KEXEC It is an ongoing process to be certain the hardware in a machine is properly shutdown, so do not be surprised if this code does not - initially work for you. It may help to enable device hotplugging - support. As of this writing the exact hardware interface is - strongly in flux, so no good recommendation can be made. + initially work for you. As of this writing the exact hardware + interface is strongly in flux, so no good recommendation can be + made. config CRASH_DUMP bool "kernel crash dumps" @@ -1716,9 +1739,10 @@ config X86_NEED_RELOCS depends on X86_32 && RELOCATABLE config PHYSICAL_ALIGN - hex "Alignment value to which kernel should be aligned" if X86_32 + hex "Alignment value to which kernel should be aligned" default "0x1000000" - range 0x2000 0x1000000 + range 0x2000 0x1000000 if X86_32 + range 0x200000 0x1000000 if X86_64 ---help--- This value puts the alignment restrictions on physical address where kernel is loaded and run from. Kernel is compiled for an @@ -1736,6 +1760,9 @@ config PHYSICAL_ALIGN end result is that kernel runs from a physical address meeting above alignment restrictions. + On 32-bit this value must be a multiple of 0x2000. On 64-bit + this value must be a multiple of 0x200000. + Don't change this unless you know what you are doing. config HOTPLUG_CPU @@ -2014,7 +2041,6 @@ menu "Bus options (PCI etc.)" config PCI bool "PCI support" default y - select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC) ---help--- Find out whether you have a PCI motherboard. PCI is the name of a bus system, i.e. the way the CPU talks to the other stuff inside @@ -2270,6 +2296,32 @@ config RAPIDIO source "drivers/rapidio/Kconfig" +config X86_SYSFB + bool "Mark VGA/VBE/EFI FB as generic system framebuffer" + help + Firmwares often provide initial graphics framebuffers so the BIOS, + bootloader or kernel can show basic video-output during boot for + user-guidance and debugging. Historically, x86 used the VESA BIOS + Extensions and EFI-framebuffers for this, which are mostly limited + to x86. + This option, if enabled, marks VGA/VBE/EFI framebuffers as generic + framebuffers so the new generic system-framebuffer drivers can be + used on x86. If the framebuffer is not compatible with the generic + modes, it is adverticed as fallback platform framebuffer so legacy + drivers like efifb, vesafb and uvesafb can pick it up. + If this option is not selected, all system framebuffers are always + marked as fallback platform framebuffers as usual. + + Note: Legacy fbdev drivers, including vesafb, efifb, uvesafb, will + not be able to pick up generic system framebuffers if this option + is selected. You are highly encouraged to enable simplefb as + replacement if you select this option. simplefb can correctly deal + with generic system framebuffers. But you should still keep vesafb + and others enabled as fallback if a system framebuffer is + incompatible with simplefb. + + If unsure, say Y. + endmenu @@ -2332,10 +2384,6 @@ config HAVE_ATOMIC_IOMAP def_bool y depends on X86_32 -config HAVE_TEXT_POKE_SMP - bool - select STOP_MACHINE if SMP - config X86_DEV_DMA_OPS bool depends on X86_64 || STA2X11 diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 78d91afb8e5..0f3621ed1db 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -59,6 +59,16 @@ config EARLY_PRINTK_DBGP with klogd/syslogd or the X server. You should normally N here, unless you want to debug such a crash. You need usb debug device. +config EARLY_PRINTK_EFI + bool "Early printk via the EFI framebuffer" + depends on EFI && EARLY_PRINTK + select FONT_SUPPORT + ---help--- + Write kernel log output directly into the EFI framebuffer. + + This is useful for kernel debugging when your machine crashes very + early before the console code is initialized. + config X86_PTDUMP bool "Export kernel pagetable layout to userspace via debugfs" depends on DEBUG_KERNEL diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 07639c656fc..41250fb3398 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -16,6 +16,10 @@ endif # e.g.: obj-y += foo_$(BITS).o export BITS +ifdef CONFIG_X86_NEED_RELOCS + LDFLAGS_vmlinux := --emit-relocs +endif + ifeq ($(CONFIG_X86_32),y) BITS := 32 UTS_MACHINE := i386 @@ -25,10 +29,6 @@ ifeq ($(CONFIG_X86_32),y) KBUILD_AFLAGS += $(biarch) KBUILD_CFLAGS += $(biarch) - ifdef CONFIG_RELOCATABLE - LDFLAGS_vmlinux := --emit-relocs - endif - KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return # Never want PIC in a 32-bit kernel, prevent breakage with GCC built diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 379814bc41e..dce69a25689 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -71,7 +71,8 @@ GCOV_PROFILE := n $(obj)/bzImage: asflags-y := $(SVGA_MODE) quiet_cmd_image = BUILD $@ -cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/zoffset.h > $@ +cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \ + $(obj)/zoffset.h $@ $(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE $(call if_changed,image) diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 5b7531966b8..ef72baeff48 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -355,6 +355,7 @@ int strncmp(const char *cs, const char *ct, size_t count); size_t strnlen(const char *s, size_t maxlen); unsigned int atou(const char *s); unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base); +size_t strlen(const char *s); /* tty.c */ void puts(const char *); diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index b7388a425f0..a7677babf94 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -19,214 +19,10 @@ static efi_system_table_t *sys_table; -static void efi_char16_printk(efi_char16_t *str) -{ - struct efi_simple_text_output_protocol *out; - - out = (struct efi_simple_text_output_protocol *)sys_table->con_out; - efi_call_phys2(out->output_string, out, str); -} - -static void efi_printk(char *str) -{ - char *s8; - - for (s8 = str; *s8; s8++) { - efi_char16_t ch[2] = { 0 }; - - ch[0] = *s8; - if (*s8 == '\n') { - efi_char16_t nl[2] = { '\r', 0 }; - efi_char16_printk(nl); - } - - efi_char16_printk(ch); - } -} - -static efi_status_t __get_map(efi_memory_desc_t **map, unsigned long *map_size, - unsigned long *desc_size) -{ - efi_memory_desc_t *m = NULL; - efi_status_t status; - unsigned long key; - u32 desc_version; - - *map_size = sizeof(*m) * 32; -again: - /* - * Add an additional efi_memory_desc_t because we're doing an - * allocation which may be in a new descriptor region. - */ - *map_size += sizeof(*m); - status = efi_call_phys3(sys_table->boottime->allocate_pool, - EFI_LOADER_DATA, *map_size, (void **)&m); - if (status != EFI_SUCCESS) - goto fail; - - status = efi_call_phys5(sys_table->boottime->get_memory_map, map_size, - m, &key, desc_size, &desc_version); - if (status == EFI_BUFFER_TOO_SMALL) { - efi_call_phys1(sys_table->boottime->free_pool, m); - goto again; - } - - if (status != EFI_SUCCESS) - efi_call_phys1(sys_table->boottime->free_pool, m); -fail: - *map = m; - return status; -} - -/* - * Allocate at the highest possible address that is not above 'max'. - */ -static efi_status_t high_alloc(unsigned long size, unsigned long align, - unsigned long *addr, unsigned long max) -{ - unsigned long map_size, desc_size; - efi_memory_desc_t *map; - efi_status_t status; - unsigned long nr_pages; - u64 max_addr = 0; - int i; - - status = __get_map(&map, &map_size, &desc_size); - if (status != EFI_SUCCESS) - goto fail; - - nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; -again: - for (i = 0; i < map_size / desc_size; i++) { - efi_memory_desc_t *desc; - unsigned long m = (unsigned long)map; - u64 start, end; - - desc = (efi_memory_desc_t *)(m + (i * desc_size)); - if (desc->type != EFI_CONVENTIONAL_MEMORY) - continue; - - if (desc->num_pages < nr_pages) - continue; +#include "../../../../drivers/firmware/efi/efi-stub-helper.c" - start = desc->phys_addr; - end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT); - if ((start + size) > end || (start + size) > max) - continue; - - if (end - size > max) - end = max; - - if (round_down(end - size, align) < start) - continue; - - start = round_down(end - size, align); - - /* - * Don't allocate at 0x0. It will confuse code that - * checks pointers against NULL. - */ - if (start == 0x0) - continue; - - if (start > max_addr) - max_addr = start; - } - - if (!max_addr) - status = EFI_NOT_FOUND; - else { - status = efi_call_phys4(sys_table->boottime->allocate_pages, - EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA, - nr_pages, &max_addr); - if (status != EFI_SUCCESS) { - max = max_addr; - max_addr = 0; - goto again; - } - - *addr = max_addr; - } - -free_pool: - efi_call_phys1(sys_table->boottime->free_pool, map); - -fail: - return status; -} - -/* - * Allocate at the lowest possible address. - */ -static efi_status_t low_alloc(unsigned long size, unsigned long align, - unsigned long *addr) -{ - unsigned long map_size, desc_size; - efi_memory_desc_t *map; - efi_status_t status; - unsigned long nr_pages; - int i; - - status = __get_map(&map, &map_size, &desc_size); - if (status != EFI_SUCCESS) - goto fail; - - nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; - for (i = 0; i < map_size / desc_size; i++) { - efi_memory_desc_t *desc; - unsigned long m = (unsigned long)map; - u64 start, end; - - desc = (efi_memory_desc_t *)(m + (i * desc_size)); - - if (desc->type != EFI_CONVENTIONAL_MEMORY) - continue; - - if (desc->num_pages < nr_pages) - continue; - - start = desc->phys_addr; - end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT); - - /* - * Don't allocate at 0x0. It will confuse code that - * checks pointers against NULL. Skip the first 8 - * bytes so we start at a nice even number. - */ - if (start == 0x0) - start += 8; - - start = round_up(start, align); - if ((start + size) > end) - continue; - - status = efi_call_phys4(sys_table->boottime->allocate_pages, - EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA, - nr_pages, &start); - if (status == EFI_SUCCESS) { - *addr = start; - break; - } - } - - if (i == map_size / desc_size) - status = EFI_NOT_FOUND; - -free_pool: - efi_call_phys1(sys_table->boottime->free_pool, map); -fail: - return status; -} - -static void low_free(unsigned long size, unsigned long addr) -{ - unsigned long nr_pages; - - nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; - efi_call_phys2(sys_table->boottime->free_pages, addr, nr_pages); -} static void find_bits(unsigned long mask, u8 *pos, u8 *size) { @@ -624,242 +420,6 @@ void setup_graphics(struct boot_params *boot_params) } } -struct initrd { - efi_file_handle_t *handle; - u64 size; -}; - -/* - * Check the cmdline for a LILO-style initrd= arguments. - * - * We only support loading an initrd from the same filesystem as the - * kernel image. - */ -static efi_status_t handle_ramdisks(efi_loaded_image_t *image, - struct setup_header *hdr) -{ - struct initrd *initrds; - unsigned long initrd_addr; - efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID; - u64 initrd_total; - efi_file_io_interface_t *io; - efi_file_handle_t *fh; - efi_status_t status; - int nr_initrds; - char *str; - int i, j, k; - - initrd_addr = 0; - initrd_total = 0; - - str = (char *)(unsigned long)hdr->cmd_line_ptr; - - j = 0; /* See close_handles */ - - if (!str || !*str) - return EFI_SUCCESS; - - for (nr_initrds = 0; *str; nr_initrds++) { - str = strstr(str, "initrd="); - if (!str) - break; - - str += 7; - - /* Skip any leading slashes */ - while (*str == '/' || *str == '\\') - str++; - - while (*str && *str != ' ' && *str != '\n') - str++; - } - - if (!nr_initrds) - return EFI_SUCCESS; - - status = efi_call_phys3(sys_table->boottime->allocate_pool, - EFI_LOADER_DATA, - nr_initrds * sizeof(*initrds), - &initrds); - if (status != EFI_SUCCESS) { - efi_printk("Failed to alloc mem for initrds\n"); - goto fail; - } - - str = (char *)(unsigned long)hdr->cmd_line_ptr; - for (i = 0; i < nr_initrds; i++) { - struct initrd *initrd; - efi_file_handle_t *h; - efi_file_info_t *info; - efi_char16_t filename_16[256]; - unsigned long info_sz; - efi_guid_t info_guid = EFI_FILE_INFO_ID; - efi_char16_t *p; - u64 file_sz; - - str = strstr(str, "initrd="); - if (!str) - break; - - str += 7; - - initrd = &initrds[i]; - p = filename_16; - - /* Skip any leading slashes */ - while (*str == '/' || *str == '\\') - str++; - - while (*str && *str != ' ' && *str != '\n') { - if ((u8 *)p >= (u8 *)filename_16 + sizeof(filename_16)) - break; - - if (*str == '/') { - *p++ = '\\'; - *str++; - } else { - *p++ = *str++; - } - } - - *p = '\0'; - - /* Only open the volume once. */ - if (!i) { - efi_boot_services_t *boottime; - - boottime = sys_table->boottime; - - status = efi_call_phys3(boottime->handle_protocol, - image->device_handle, &fs_proto, &io); - if (status != EFI_SUCCESS) { - efi_printk("Failed to handle fs_proto\n"); - goto free_initrds; - } - - status = efi_call_phys2(io->open_volume, io, &fh); - if (status != EFI_SUCCESS) { - efi_printk("Failed to open volume\n"); - goto free_initrds; - } - } - - status = efi_call_phys5(fh->open, fh, &h, filename_16, - EFI_FILE_MODE_READ, (u64)0); - if (status != EFI_SUCCESS) { - efi_printk("Failed to open initrd file: "); - efi_char16_printk(filename_16); - efi_printk("\n"); - goto close_handles; - } - - initrd->handle = h; - - info_sz = 0; - status = efi_call_phys4(h->get_info, h, &info_guid, - &info_sz, NULL); - if (status != EFI_BUFFER_TOO_SMALL) { - efi_printk("Failed to get initrd info size\n"); - goto close_handles; - } - -grow: - status = efi_call_phys3(sys_table->boottime->allocate_pool, - EFI_LOADER_DATA, info_sz, &info); - if (status != EFI_SUCCESS) { - efi_printk("Failed to alloc mem for initrd info\n"); - goto close_handles; - } - - status = efi_call_phys4(h->get_info, h, &info_guid, - &info_sz, info); - if (status == EFI_BUFFER_TOO_SMALL) { - efi_call_phys1(sys_table->boottime->free_pool, info); - goto grow; - } - - file_sz = info->file_size; - efi_call_phys1(sys_table->boottime->free_pool, info); - - if (status != EFI_SUCCESS) { - efi_printk("Failed to get initrd info\n"); - goto close_handles; - } - - initrd->size = file_sz; - initrd_total += file_sz; - } - - if (initrd_total) { - unsigned long addr; - - /* - * Multiple initrd's need to be at consecutive - * addresses in memory, so allocate enough memory for - * all the initrd's. - */ - status = high_alloc(initrd_total, 0x1000, - &initrd_addr, hdr->initrd_addr_max); - if (status != EFI_SUCCESS) { - efi_printk("Failed to alloc highmem for initrds\n"); - goto close_handles; - } - - /* We've run out of free low memory. */ - if (initrd_addr > hdr->initrd_addr_max) { - efi_printk("We've run out of free low memory\n"); - status = EFI_INVALID_PARAMETER; - goto free_initrd_total; - } - - addr = initrd_addr; - for (j = 0; j < nr_initrds; j++) { - u64 size; - - size = initrds[j].size; - while (size) { - u64 chunksize; - if (size > EFI_READ_CHUNK_SIZE) - chunksize = EFI_READ_CHUNK_SIZE; - else - chunksize = size; - status = efi_call_phys3(fh->read, - initrds[j].handle, - &chunksize, addr); - if (status != EFI_SUCCESS) { - efi_printk("Failed to read initrd\n"); - goto free_initrd_total; - } - addr += chunksize; - size -= chunksize; - } - - efi_call_phys1(fh->close, initrds[j].handle); - } - - } - - efi_call_phys1(sys_table->boottime->free_pool, initrds); - - hdr->ramdisk_image = initrd_addr; - hdr->ramdisk_size = initrd_total; - - return status; - -free_initrd_total: - low_free(initrd_total, initrd_addr); - -close_handles: - for (k = j; k < i; k++) - efi_call_phys1(fh->close, initrds[k].handle); -free_initrds: - efi_call_phys1(sys_table->boottime->free_pool, initrds); -fail: - hdr->ramdisk_image = 0; - hdr->ramdisk_size = 0; - - return status; -} /* * Because the x86 boot code expects to be passed a boot_params we @@ -875,14 +435,15 @@ struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table) struct efi_info *efi; efi_loaded_image_t *image; void *options; - u32 load_options_size; efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID; int options_size = 0; efi_status_t status; - unsigned long cmdline; + char *cmdline_ptr; u16 *s2; u8 *s1; int i; + unsigned long ramdisk_addr; + unsigned long ramdisk_size; sys_table = _table; @@ -893,13 +454,14 @@ struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table) status = efi_call_phys3(sys_table->boottime->handle_protocol, handle, &proto, (void *)&image); if (status != EFI_SUCCESS) { - efi_printk("Failed to get handle for LOADED_IMAGE_PROTOCOL\n"); + efi_printk(sys_table, "Failed to get handle for LOADED_IMAGE_PROTOCOL\n"); return NULL; } - status = low_alloc(0x4000, 1, (unsigned long *)&boot_params); + status = efi_low_alloc(sys_table, 0x4000, 1, + (unsigned long *)&boot_params); if (status != EFI_SUCCESS) { - efi_printk("Failed to alloc lowmem for boot params\n"); + efi_printk(sys_table, "Failed to alloc lowmem for boot params\n"); return NULL; } @@ -926,40 +488,11 @@ struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table) hdr->type_of_loader = 0x21; /* Convert unicode cmdline to ascii */ - options = image->load_options; - load_options_size = image->load_options_size / 2; /* ASCII */ - cmdline = 0; - s2 = (u16 *)options; - - if (s2) { - while (*s2 && *s2 != '\n' && options_size < load_options_size) { - s2++; - options_size++; - } - - if (options_size) { - if (options_size > hdr->cmdline_size) - options_size = hdr->cmdline_size; - - options_size++; /* NUL termination */ - - status = low_alloc(options_size, 1, &cmdline); - if (status != EFI_SUCCESS) { - efi_printk("Failed to alloc mem for cmdline\n"); - goto fail; - } - - s1 = (u8 *)(unsigned long)cmdline; - s2 = (u16 *)options; - - for (i = 0; i < options_size - 1; i++) - *s1++ = *s2++; - - *s1 = '\0'; - } - } - - hdr->cmd_line_ptr = cmdline; + cmdline_ptr = efi_convert_cmdline_to_ascii(sys_table, image, + &options_size); + if (!cmdline_ptr) + goto fail; + hdr->cmd_line_ptr = (unsigned long)cmdline_ptr; hdr->ramdisk_image = 0; hdr->ramdisk_size = 0; @@ -969,96 +502,64 @@ struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table) memset(sdt, 0, sizeof(*sdt)); - status = handle_ramdisks(image, hdr); + status = handle_cmdline_files(sys_table, image, + (char *)(unsigned long)hdr->cmd_line_ptr, + "initrd=", hdr->initrd_addr_max, + &ramdisk_addr, &ramdisk_size); if (status != EFI_SUCCESS) goto fail2; + hdr->ramdisk_image = ramdisk_addr; + hdr->ramdisk_size = ramdisk_size; return boot_params; fail2: - if (options_size) - low_free(options_size, hdr->cmd_line_ptr); + efi_free(sys_table, options_size, hdr->cmd_line_ptr); fail: - low_free(0x4000, (unsigned long)boot_params); + efi_free(sys_table, 0x4000, (unsigned long)boot_params); return NULL; } -static efi_status_t exit_boot(struct boot_params *boot_params, - void *handle) +static void add_e820ext(struct boot_params *params, + struct setup_data *e820ext, u32 nr_entries) { - struct efi_info *efi = &boot_params->efi_info; - struct e820entry *e820_map = &boot_params->e820_map[0]; - struct e820entry *prev = NULL; - unsigned long size, key, desc_size, _size; - efi_memory_desc_t *mem_map; + struct setup_data *data; efi_status_t status; - __u32 desc_version; - bool called_exit = false; - u8 nr_entries; - int i; - - size = sizeof(*mem_map) * 32; - -again: - size += sizeof(*mem_map) * 2; - _size = size; - status = low_alloc(size, 1, (unsigned long *)&mem_map); - if (status != EFI_SUCCESS) - return status; - -get_map: - status = efi_call_phys5(sys_table->boottime->get_memory_map, &size, - mem_map, &key, &desc_size, &desc_version); - if (status == EFI_BUFFER_TOO_SMALL) { - low_free(_size, (unsigned long)mem_map); - goto again; - } + unsigned long size; - if (status != EFI_SUCCESS) - goto free_mem_map; + e820ext->type = SETUP_E820_EXT; + e820ext->len = nr_entries * sizeof(struct e820entry); + e820ext->next = 0; - memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32)); - efi->efi_systab = (unsigned long)sys_table; - efi->efi_memdesc_size = desc_size; - efi->efi_memdesc_version = desc_version; - efi->efi_memmap = (unsigned long)mem_map; - efi->efi_memmap_size = size; - -#ifdef CONFIG_X86_64 - efi->efi_systab_hi = (unsigned long)sys_table >> 32; - efi->efi_memmap_hi = (unsigned long)mem_map >> 32; -#endif + data = (struct setup_data *)(unsigned long)params->hdr.setup_data; - /* Might as well exit boot services now */ - status = efi_call_phys2(sys_table->boottime->exit_boot_services, - handle, key); - if (status != EFI_SUCCESS) { - /* - * ExitBootServices() will fail if any of the event - * handlers change the memory map. In which case, we - * must be prepared to retry, but only once so that - * we're guaranteed to exit on repeated failures instead - * of spinning forever. - */ - if (called_exit) - goto free_mem_map; + while (data && data->next) + data = (struct setup_data *)(unsigned long)data->next; - called_exit = true; - goto get_map; - } + if (data) + data->next = (unsigned long)e820ext; + else + params->hdr.setup_data = (unsigned long)e820ext; +} - /* Historic? */ - boot_params->alt_mem_k = 32 * 1024; +static efi_status_t setup_e820(struct boot_params *params, + struct setup_data *e820ext, u32 e820ext_size) +{ + struct e820entry *e820_map = ¶ms->e820_map[0]; + struct efi_info *efi = ¶ms->efi_info; + struct e820entry *prev = NULL; + u32 nr_entries; + u32 nr_desc; + int i; - /* - * Convert the EFI memory map to E820. - */ nr_entries = 0; - for (i = 0; i < size / desc_size; i++) { + nr_desc = efi->efi_memmap_size / efi->efi_memdesc_size; + + for (i = 0; i < nr_desc; i++) { efi_memory_desc_t *d; unsigned int e820_type = 0; - unsigned long m = (unsigned long)mem_map; + unsigned long m = efi->efi_memmap; - d = (efi_memory_desc_t *)(m + (i * desc_size)); + d = (efi_memory_desc_t *)(m + (i * efi->efi_memdesc_size)); switch (d->type) { case EFI_RESERVED_TYPE: case EFI_RUNTIME_SERVICES_CODE: @@ -1095,61 +596,151 @@ get_map: /* Merge adjacent mappings */ if (prev && prev->type == e820_type && - (prev->addr + prev->size) == d->phys_addr) + (prev->addr + prev->size) == d->phys_addr) { prev->size += d->num_pages << 12; - else { - e820_map->addr = d->phys_addr; - e820_map->size = d->num_pages << 12; - e820_map->type = e820_type; - prev = e820_map++; - nr_entries++; + continue; + } + + if (nr_entries == ARRAY_SIZE(params->e820_map)) { + u32 need = (nr_desc - i) * sizeof(struct e820entry) + + sizeof(struct setup_data); + + if (!e820ext || e820ext_size < need) + return EFI_BUFFER_TOO_SMALL; + + /* boot_params map full, switch to e820 extended */ + e820_map = (struct e820entry *)e820ext->data; } + + e820_map->addr = d->phys_addr; + e820_map->size = d->num_pages << PAGE_SHIFT; + e820_map->type = e820_type; + prev = e820_map++; + nr_entries++; } - boot_params->e820_entries = nr_entries; + if (nr_entries > ARRAY_SIZE(params->e820_map)) { + u32 nr_e820ext = nr_entries - ARRAY_SIZE(params->e820_map); + + add_e820ext(params, e820ext, nr_e820ext); + nr_entries -= nr_e820ext; + } + + params->e820_entries = (u8)nr_entries; return EFI_SUCCESS; +} + +static efi_status_t alloc_e820ext(u32 nr_desc, struct setup_data **e820ext, + u32 *e820ext_size) +{ + efi_status_t status; + unsigned long size; + + size = sizeof(struct setup_data) + + sizeof(struct e820entry) * nr_desc; + + if (*e820ext) { + efi_call_phys1(sys_table->boottime->free_pool, *e820ext); + *e820ext = NULL; + *e820ext_size = 0; + } + + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, size, e820ext); + + if (status == EFI_SUCCESS) + *e820ext_size = size; -free_mem_map: - low_free(_size, (unsigned long)mem_map); return status; } -static efi_status_t relocate_kernel(struct setup_header *hdr) +static efi_status_t exit_boot(struct boot_params *boot_params, + void *handle) { - unsigned long start, nr_pages; + struct efi_info *efi = &boot_params->efi_info; + unsigned long map_sz, key, desc_size; + efi_memory_desc_t *mem_map; + struct setup_data *e820ext; + __u32 e820ext_size; + __u32 nr_desc, prev_nr_desc; efi_status_t status; + __u32 desc_version; + bool called_exit = false; + u8 nr_entries; + int i; - /* - * The EFI firmware loader could have placed the kernel image - * anywhere in memory, but the kernel has various restrictions - * on the max physical address it can run at. Attempt to move - * the kernel to boot_params.pref_address, or as low as - * possible. - */ - start = hdr->pref_address; - nr_pages = round_up(hdr->init_size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; + nr_desc = 0; + e820ext = NULL; + e820ext_size = 0; - status = efi_call_phys4(sys_table->boottime->allocate_pages, - EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA, - nr_pages, &start); - if (status != EFI_SUCCESS) { - status = low_alloc(hdr->init_size, hdr->kernel_alignment, - &start); +get_map: + status = efi_get_memory_map(sys_table, &mem_map, &map_sz, &desc_size, + &desc_version, &key); + + if (status != EFI_SUCCESS) + return status; + + prev_nr_desc = nr_desc; + nr_desc = map_sz / desc_size; + if (nr_desc > prev_nr_desc && + nr_desc > ARRAY_SIZE(boot_params->e820_map)) { + u32 nr_e820ext = nr_desc - ARRAY_SIZE(boot_params->e820_map); + + status = alloc_e820ext(nr_e820ext, &e820ext, &e820ext_size); if (status != EFI_SUCCESS) - efi_printk("Failed to alloc mem for kernel\n"); + goto free_mem_map; + + efi_call_phys1(sys_table->boottime->free_pool, mem_map); + goto get_map; /* Allocated memory, get map again */ } - if (status == EFI_SUCCESS) - memcpy((void *)start, (void *)(unsigned long)hdr->code32_start, - hdr->init_size); + memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32)); + efi->efi_systab = (unsigned long)sys_table; + efi->efi_memdesc_size = desc_size; + efi->efi_memdesc_version = desc_version; + efi->efi_memmap = (unsigned long)mem_map; + efi->efi_memmap_size = map_sz; + +#ifdef CONFIG_X86_64 + efi->efi_systab_hi = (unsigned long)sys_table >> 32; + efi->efi_memmap_hi = (unsigned long)mem_map >> 32; +#endif - hdr->pref_address = hdr->code32_start; - hdr->code32_start = (__u32)start; + /* Might as well exit boot services now */ + status = efi_call_phys2(sys_table->boottime->exit_boot_services, + handle, key); + if (status != EFI_SUCCESS) { + /* + * ExitBootServices() will fail if any of the event + * handlers change the memory map. In which case, we + * must be prepared to retry, but only once so that + * we're guaranteed to exit on repeated failures instead + * of spinning forever. + */ + if (called_exit) + goto free_mem_map; + called_exit = true; + efi_call_phys1(sys_table->boottime->free_pool, mem_map); + goto get_map; + } + + /* Historic? */ + boot_params->alt_mem_k = 32 * 1024; + + status = setup_e820(boot_params, e820ext, e820ext_size); + if (status != EFI_SUCCESS) + return status; + + return EFI_SUCCESS; + +free_mem_map: + efi_call_phys1(sys_table->boottime->free_pool, mem_map); return status; } + /* * On success we return a pointer to a boot_params structure, and NULL * on failure. @@ -1157,7 +748,7 @@ static efi_status_t relocate_kernel(struct setup_header *hdr) struct boot_params *efi_main(void *handle, efi_system_table_t *_table, struct boot_params *boot_params) { - struct desc_ptr *gdt, *idt; + struct desc_ptr *gdt; efi_loaded_image_t *image; struct setup_header *hdr = &boot_params->hdr; efi_status_t status; @@ -1177,37 +768,33 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table, EFI_LOADER_DATA, sizeof(*gdt), (void **)&gdt); if (status != EFI_SUCCESS) { - efi_printk("Failed to alloc mem for gdt structure\n"); + efi_printk(sys_table, "Failed to alloc mem for gdt structure\n"); goto fail; } gdt->size = 0x800; - status = low_alloc(gdt->size, 8, (unsigned long *)&gdt->address); - if (status != EFI_SUCCESS) { - efi_printk("Failed to alloc mem for gdt\n"); - goto fail; - } - - status = efi_call_phys3(sys_table->boottime->allocate_pool, - EFI_LOADER_DATA, sizeof(*idt), - (void **)&idt); + status = efi_low_alloc(sys_table, gdt->size, 8, + (unsigned long *)&gdt->address); if (status != EFI_SUCCESS) { - efi_printk("Failed to alloc mem for idt structure\n"); + efi_printk(sys_table, "Failed to alloc mem for gdt\n"); goto fail; } - idt->size = 0; - idt->address = 0; - /* * If the kernel isn't already loaded at the preferred load * address, relocate it. */ if (hdr->pref_address != hdr->code32_start) { - status = relocate_kernel(hdr); - + unsigned long bzimage_addr = hdr->code32_start; + status = efi_relocate_kernel(sys_table, &bzimage_addr, + hdr->init_size, hdr->init_size, + hdr->pref_address, + hdr->kernel_alignment); if (status != EFI_SUCCESS) goto fail; + + hdr->pref_address = hdr->code32_start; + hdr->code32_start = bzimage_addr; } status = exit_boot(boot_params, handle); @@ -1267,10 +854,8 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table, desc->base2 = 0x00; #endif /* CONFIG_X86_64 */ - asm volatile ("lidt %0" : : "m" (*idt)); - asm volatile ("lgdt %0" : : "m" (*gdt)); - asm volatile("cli"); + asm volatile ("lgdt %0" : : "m" (*gdt)); return boot_params; fail: diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h index e5b0a8f91c5..81b6b652b46 100644 --- a/arch/x86/boot/compressed/eboot.h +++ b/arch/x86/boot/compressed/eboot.h @@ -11,9 +11,6 @@ #define DESC_TYPE_CODE_DATA (1 << 0) -#define EFI_PAGE_SIZE (1UL << EFI_PAGE_SHIFT) -#define EFI_READ_CHUNK_SIZE (1024 * 1024) - #define EFI_CONSOLE_OUT_DEVICE_GUID \ EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4, 0x9a, 0x46, 0x0, 0x90, 0x27, \ 0x3f, 0xc1, 0x4d) @@ -62,10 +59,4 @@ struct efi_uga_draw_protocol { void *blt; }; -struct efi_simple_text_output_protocol { - void *reset; - void *output_string; - void *test_string; -}; - #endif /* BOOT_COMPRESSED_EBOOT_H */ diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 1e3184f6072..5d6f6891b18 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -181,8 +181,9 @@ relocated: /* * Do the decompression, and jump to the new kernel.. */ - leal z_extract_offset_negative(%ebx), %ebp /* push arguments for decompress_kernel: */ + pushl $z_output_len /* decompressed length */ + leal z_extract_offset_negative(%ebx), %ebp pushl %ebp /* output address */ pushl $z_input_len /* input_len */ leal input_data(%ebx), %eax @@ -191,33 +192,7 @@ relocated: pushl %eax /* heap area */ pushl %esi /* real mode pointer */ call decompress_kernel - addl $20, %esp - -#if CONFIG_RELOCATABLE -/* - * Find the address of the relocations. - */ - leal z_output_len(%ebp), %edi - -/* - * Calculate the delta between where vmlinux was compiled to run - * and where it was actually loaded. - */ - movl %ebp, %ebx - subl $LOAD_PHYSICAL_ADDR, %ebx - jz 2f /* Nothing to be done if loaded at compiled addr. */ -/* - * Process relocations. - */ - -1: subl $4, %edi - movl (%edi), %ecx - testl %ecx, %ecx - jz 2f - addl %ebx, -__PAGE_OFFSET(%ebx, %ecx) - jmp 1b -2: -#endif + addl $24, %esp /* * Jump to the decompressed kernel. diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 06e71c2c16b..c337422b575 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -338,6 +338,7 @@ relocated: leaq input_data(%rip), %rdx /* input_data */ movl $z_input_len, %ecx /* input_len */ movq %rbp, %r8 /* output target address */ + movq $z_output_len, %r9 /* decompressed length */ call decompress_kernel popq %rsi diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 0319c88290a..434f077d2c4 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -271,6 +271,79 @@ static void error(char *x) asm("hlt"); } +#if CONFIG_X86_NEED_RELOCS +static void handle_relocations(void *output, unsigned long output_len) +{ + int *reloc; + unsigned long delta, map, ptr; + unsigned long min_addr = (unsigned long)output; + unsigned long max_addr = min_addr + output_len; + + /* + * Calculate the delta between where vmlinux was linked to load + * and where it was actually loaded. + */ + delta = min_addr - LOAD_PHYSICAL_ADDR; + if (!delta) { + debug_putstr("No relocation needed... "); + return; + } + debug_putstr("Performing relocations... "); + + /* + * The kernel contains a table of relocation addresses. Those + * addresses have the final load address of the kernel in virtual + * memory. We are currently working in the self map. So we need to + * create an adjustment for kernel memory addresses to the self map. + * This will involve subtracting out the base address of the kernel. + */ + map = delta - __START_KERNEL_map; + + /* + * Process relocations: 32 bit relocations first then 64 bit after. + * Two sets of binary relocations are added to the end of the kernel + * before compression. Each relocation table entry is the kernel + * address of the location which needs to be updated stored as a + * 32-bit value which is sign extended to 64 bits. + * + * Format is: + * + * kernel bits... + * 0 - zero terminator for 64 bit relocations + * 64 bit relocation repeated + * 0 - zero terminator for 32 bit relocations + * 32 bit relocation repeated + * + * So we work backwards from the end of the decompressed image. + */ + for (reloc = output + output_len - sizeof(*reloc); *reloc; reloc--) { + int extended = *reloc; + extended += map; + + ptr = (unsigned long)extended; + if (ptr < min_addr || ptr > max_addr) + error("32-bit relocation outside of kernel!\n"); + + *(uint32_t *)ptr += delta; + } +#ifdef CONFIG_X86_64 + for (reloc--; *reloc; reloc--) { + long extended = *reloc; + extended += map; + + ptr = (unsigned long)extended; + if (ptr < min_addr || ptr > max_addr) + error("64-bit relocation outside of kernel!\n"); + + *(uint64_t *)ptr += delta; + } +#endif +} +#else +static inline void handle_relocations(void *output, unsigned long output_len) +{ } +#endif + static void parse_elf(void *output) { #ifdef CONFIG_X86_64 @@ -325,7 +398,8 @@ static void parse_elf(void *output) asmlinkage void decompress_kernel(void *rmode, memptr heap, unsigned char *input_data, unsigned long input_len, - unsigned char *output) + unsigned char *output, + unsigned long output_len) { real_mode = rmode; @@ -365,6 +439,7 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, debug_putstr("\nDecompressing Linux... "); decompress(input_data, input_len, NULL, NULL, output, NULL, error); parse_elf(output); + handle_relocations(output, output_len); debug_putstr("done.\nBooting the kernel.\n"); return; } diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c index 958a641483d..b669ab65bf6 100644 --- a/arch/x86/boot/compressed/mkpiggy.c +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -36,11 +36,12 @@ int main(int argc, char *argv[]) uint32_t olen; long ilen; unsigned long offs; - FILE *f; + FILE *f = NULL; + int retval = 1; if (argc < 2) { fprintf(stderr, "Usage: %s compressed_file\n", argv[0]); - return 1; + goto bail; } /* Get the information for the compressed kernel image first */ @@ -48,7 +49,7 @@ int main(int argc, char *argv[]) f = fopen(argv[1], "r"); if (!f) { perror(argv[1]); - return 1; + goto bail; } @@ -58,12 +59,11 @@ int main(int argc, char *argv[]) if (fread(&olen, sizeof(olen), 1, f) != 1) { perror(argv[1]); - return 1; + goto bail; } ilen = ftell(f); olen = get_unaligned_le32(&olen); - fclose(f); /* * Now we have the input (compressed) and output (uncompressed) @@ -91,5 +91,9 @@ int main(int argc, char *argv[]) printf(".incbin \"%s\"\n", argv[1]); printf("input_data_end:\n"); - return 0; + retval = 0; +bail: + if (f) + fclose(f); + return retval; } diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c index cdac91ca55d..565083c16e5 100644 --- a/arch/x86/boot/printf.c +++ b/arch/x86/boot/printf.c @@ -55,7 +55,7 @@ static char *number(char *str, long num, int base, int size, int precision, locase = (type & SMALL); if (type & LEFT) type &= ~ZEROPAD; - if (base < 2 || base > 36) + if (base < 2 || base > 16) return NULL; c = (type & ZEROPAD) ? '0' : ' '; sign = 0; diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index c941d6a8887..8e15b22391f 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -5,14 +5,15 @@ */ /* - * This file builds a disk-image from two different files: + * This file builds a disk-image from three different files: * * - setup: 8086 machine code, sets up system parm * - system: 80386 code for actual system + * - zoffset.h: header with ZO_* defines * - * It does some checking that all files are of the correct type, and - * just writes the result to stdout, removing headers and padding to - * the right amount. It also writes some system data to stderr. + * It does some checking that all files are of the correct type, and writes + * the result to the specified destination, removing headers and padding to + * the right amount. It also writes some system data to stdout. */ /* @@ -136,7 +137,7 @@ static void die(const char * str, ...) static void usage(void) { - die("Usage: build setup system [zoffset.h] [> image]"); + die("Usage: build setup system zoffset.h image"); } #ifdef CONFIG_EFI_STUB @@ -265,7 +266,7 @@ int main(int argc, char ** argv) int c; u32 sys_size; struct stat sb; - FILE *file; + FILE *file, *dest; int fd; void *kernel; u32 crc = 0xffffffffUL; @@ -280,10 +281,13 @@ int main(int argc, char ** argv) startup_64 = 0x200; #endif - if (argc == 4) - parse_zoffset(argv[3]); - else if (argc != 3) + if (argc != 5) usage(); + parse_zoffset(argv[3]); + + dest = fopen(argv[4], "w"); + if (!dest) + die("Unable to write `%s': %m", argv[4]); /* Copy the setup code */ file = fopen(argv[1], "r"); @@ -318,7 +322,7 @@ int main(int argc, char ** argv) /* Set the default root device */ put_unaligned_le16(DEFAULT_ROOT_DEV, &buf[508]); - fprintf(stderr, "Setup is %d bytes (padded to %d bytes).\n", c, i); + printf("Setup is %d bytes (padded to %d bytes).\n", c, i); /* Open and stat the kernel file */ fd = open(argv[2], O_RDONLY); @@ -327,7 +331,7 @@ int main(int argc, char ** argv) if (fstat(fd, &sb)) die("Unable to stat `%s': %m", argv[2]); sz = sb.st_size; - fprintf (stderr, "System is %d kB\n", (sz+1023)/1024); + printf("System is %d kB\n", (sz+1023)/1024); kernel = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0); if (kernel == MAP_FAILED) die("Unable to mmap '%s': %m", argv[2]); @@ -348,27 +352,31 @@ int main(int argc, char ** argv) #endif crc = partial_crc32(buf, i, crc); - if (fwrite(buf, 1, i, stdout) != i) + if (fwrite(buf, 1, i, dest) != i) die("Writing setup failed"); /* Copy the kernel code */ crc = partial_crc32(kernel, sz, crc); - if (fwrite(kernel, 1, sz, stdout) != sz) + if (fwrite(kernel, 1, sz, dest) != sz) die("Writing kernel failed"); /* Add padding leaving 4 bytes for the checksum */ while (sz++ < (sys_size*16) - 4) { crc = partial_crc32_one('\0', crc); - if (fwrite("\0", 1, 1, stdout) != 1) + if (fwrite("\0", 1, 1, dest) != 1) die("Writing padding failed"); } /* Write the CRC */ - fprintf(stderr, "CRC %x\n", crc); + printf("CRC %x\n", crc); put_unaligned_le32(crc, buf); - if (fwrite(buf, 1, 4, stdout) != 4) + if (fwrite(buf, 1, 4, dest) != 4) die("Writing CRC failed"); + /* Catch any delayed write failures */ + if (fclose(dest)) + die("Writing image failed"); + close(fd); /* Everything is OK */ diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 94447086e55..a7fef2621cc 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -142,6 +142,8 @@ CONFIG_MAC80211=y CONFIG_MAC80211_LEDS=y CONFIG_RFKILL=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y CONFIG_DEBUG_DEVRES=y CONFIG_CONNECTOR=y CONFIG_BLK_DEV_LOOP=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 671524d0f6c..c1119d4c128 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -141,6 +141,8 @@ CONFIG_MAC80211=y CONFIG_MAC80211_LEDS=y CONFIG_RFKILL=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y CONFIG_DEBUG_DEVRES=y CONFIG_CONNECTOR=y CONFIG_BLK_DEV_LOOP=y diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 6c63c358a7e..7d6ba9db1be 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o +obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o # These modules require assembler to support AVX. ifeq ($(avx_supported),yes) @@ -81,3 +82,4 @@ crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o +crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index 5cb86ccd4ac..c171dcbf192 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -62,7 +62,7 @@ static void camellia_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) } /* camellia sboxes */ -const u64 camellia_sp10011110[256] = { +__visible const u64 camellia_sp10011110[256] = { 0x7000007070707000ULL, 0x8200008282828200ULL, 0x2c00002c2c2c2c00ULL, 0xec0000ecececec00ULL, 0xb30000b3b3b3b300ULL, 0x2700002727272700ULL, 0xc00000c0c0c0c000ULL, 0xe50000e5e5e5e500ULL, 0xe40000e4e4e4e400ULL, @@ -151,7 +151,7 @@ const u64 camellia_sp10011110[256] = { 0x9e00009e9e9e9e00ULL, }; -const u64 camellia_sp22000222[256] = { +__visible const u64 camellia_sp22000222[256] = { 0xe0e0000000e0e0e0ULL, 0x0505000000050505ULL, 0x5858000000585858ULL, 0xd9d9000000d9d9d9ULL, 0x6767000000676767ULL, 0x4e4e0000004e4e4eULL, 0x8181000000818181ULL, 0xcbcb000000cbcbcbULL, 0xc9c9000000c9c9c9ULL, @@ -240,7 +240,7 @@ const u64 camellia_sp22000222[256] = { 0x3d3d0000003d3d3dULL, }; -const u64 camellia_sp03303033[256] = { +__visible const u64 camellia_sp03303033[256] = { 0x0038380038003838ULL, 0x0041410041004141ULL, 0x0016160016001616ULL, 0x0076760076007676ULL, 0x00d9d900d900d9d9ULL, 0x0093930093009393ULL, 0x0060600060006060ULL, 0x00f2f200f200f2f2ULL, 0x0072720072007272ULL, @@ -329,7 +329,7 @@ const u64 camellia_sp03303033[256] = { 0x004f4f004f004f4fULL, }; -const u64 camellia_sp00444404[256] = { +__visible const u64 camellia_sp00444404[256] = { 0x0000707070700070ULL, 0x00002c2c2c2c002cULL, 0x0000b3b3b3b300b3ULL, 0x0000c0c0c0c000c0ULL, 0x0000e4e4e4e400e4ULL, 0x0000575757570057ULL, 0x0000eaeaeaea00eaULL, 0x0000aeaeaeae00aeULL, 0x0000232323230023ULL, @@ -418,7 +418,7 @@ const u64 camellia_sp00444404[256] = { 0x00009e9e9e9e009eULL, }; -const u64 camellia_sp02220222[256] = { +__visible const u64 camellia_sp02220222[256] = { 0x00e0e0e000e0e0e0ULL, 0x0005050500050505ULL, 0x0058585800585858ULL, 0x00d9d9d900d9d9d9ULL, 0x0067676700676767ULL, 0x004e4e4e004e4e4eULL, 0x0081818100818181ULL, 0x00cbcbcb00cbcbcbULL, 0x00c9c9c900c9c9c9ULL, @@ -507,7 +507,7 @@ const u64 camellia_sp02220222[256] = { 0x003d3d3d003d3d3dULL, }; -const u64 camellia_sp30333033[256] = { +__visible const u64 camellia_sp30333033[256] = { 0x3800383838003838ULL, 0x4100414141004141ULL, 0x1600161616001616ULL, 0x7600767676007676ULL, 0xd900d9d9d900d9d9ULL, 0x9300939393009393ULL, 0x6000606060006060ULL, 0xf200f2f2f200f2f2ULL, 0x7200727272007272ULL, @@ -596,7 +596,7 @@ const u64 camellia_sp30333033[256] = { 0x4f004f4f4f004f4fULL, }; -const u64 camellia_sp44044404[256] = { +__visible const u64 camellia_sp44044404[256] = { 0x7070007070700070ULL, 0x2c2c002c2c2c002cULL, 0xb3b300b3b3b300b3ULL, 0xc0c000c0c0c000c0ULL, 0xe4e400e4e4e400e4ULL, 0x5757005757570057ULL, 0xeaea00eaeaea00eaULL, 0xaeae00aeaeae00aeULL, 0x2323002323230023ULL, @@ -685,7 +685,7 @@ const u64 camellia_sp44044404[256] = { 0x9e9e009e9e9e009eULL, }; -const u64 camellia_sp11101110[256] = { +__visible const u64 camellia_sp11101110[256] = { 0x7070700070707000ULL, 0x8282820082828200ULL, 0x2c2c2c002c2c2c00ULL, 0xececec00ececec00ULL, 0xb3b3b300b3b3b300ULL, 0x2727270027272700ULL, 0xc0c0c000c0c0c000ULL, 0xe5e5e500e5e5e500ULL, 0xe4e4e400e4e4e400ULL, @@ -828,8 +828,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) subRL[1] ^= (subRL[1] & ~subRL[9]) << 32; /* modified for FLinv(kl2) */ - dw = (subRL[1] & subRL[9]) >> 32, - subRL[1] ^= rol32(dw, 1); + dw = (subRL[1] & subRL[9]) >> 32; + subRL[1] ^= rol32(dw, 1); /* round 8 */ subRL[11] ^= subRL[1]; @@ -840,8 +840,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) subRL[1] ^= (subRL[1] & ~subRL[17]) << 32; /* modified for FLinv(kl4) */ - dw = (subRL[1] & subRL[17]) >> 32, - subRL[1] ^= rol32(dw, 1); + dw = (subRL[1] & subRL[17]) >> 32; + subRL[1] ^= rol32(dw, 1); /* round 14 */ subRL[19] ^= subRL[1]; @@ -859,8 +859,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) } else { subRL[1] ^= (subRL[1] & ~subRL[25]) << 32; /* modified for FLinv(kl6) */ - dw = (subRL[1] & subRL[25]) >> 32, - subRL[1] ^= rol32(dw, 1); + dw = (subRL[1] & subRL[25]) >> 32; + subRL[1] ^= rol32(dw, 1); /* round 20 */ subRL[27] ^= subRL[1]; @@ -882,8 +882,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) kw4 ^= (kw4 & ~subRL[24]) << 32; /* modified for FL(kl5) */ - dw = (kw4 & subRL[24]) >> 32, - kw4 ^= rol32(dw, 1); + dw = (kw4 & subRL[24]) >> 32; + kw4 ^= rol32(dw, 1); } /* round 17 */ @@ -895,8 +895,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) kw4 ^= (kw4 & ~subRL[16]) << 32; /* modified for FL(kl3) */ - dw = (kw4 & subRL[16]) >> 32, - kw4 ^= rol32(dw, 1); + dw = (kw4 & subRL[16]) >> 32; + kw4 ^= rol32(dw, 1); /* round 11 */ subRL[14] ^= kw4; @@ -907,8 +907,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) kw4 ^= (kw4 & ~subRL[8]) << 32; /* modified for FL(kl1) */ - dw = (kw4 & subRL[8]) >> 32, - kw4 ^= rol32(dw, 1); + dw = (kw4 & subRL[8]) >> 32; + kw4 ^= rol32(dw, 1); /* round 5 */ subRL[6] ^= kw4; @@ -928,8 +928,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) SET_SUBKEY_LR(6, subRL[5] ^ subRL[7]); /* round 5 */ tl = (subRL[10] >> 32) ^ (subRL[10] & ~subRL[8]); - dw = tl & (subRL[8] >> 32), /* FL(kl1) */ - tr = subRL[10] ^ rol32(dw, 1); + dw = tl & (subRL[8] >> 32); /* FL(kl1) */ + tr = subRL[10] ^ rol32(dw, 1); tt = (tr | ((u64)tl << 32)); SET_SUBKEY_LR(7, subRL[6] ^ tt); /* round 6 */ @@ -937,8 +937,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) SET_SUBKEY_LR(9, subRL[9]); /* FLinv(kl2) */ tl = (subRL[7] >> 32) ^ (subRL[7] & ~subRL[9]); - dw = tl & (subRL[9] >> 32), /* FLinv(kl2) */ - tr = subRL[7] ^ rol32(dw, 1); + dw = tl & (subRL[9] >> 32); /* FLinv(kl2) */ + tr = subRL[7] ^ rol32(dw, 1); tt = (tr | ((u64)tl << 32)); SET_SUBKEY_LR(10, subRL[11] ^ tt); /* round 7 */ @@ -948,8 +948,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) SET_SUBKEY_LR(14, subRL[13] ^ subRL[15]); /* round 11 */ tl = (subRL[18] >> 32) ^ (subRL[18] & ~subRL[16]); - dw = tl & (subRL[16] >> 32), /* FL(kl3) */ - tr = subRL[18] ^ rol32(dw, 1); + dw = tl & (subRL[16] >> 32); /* FL(kl3) */ + tr = subRL[18] ^ rol32(dw, 1); tt = (tr | ((u64)tl << 32)); SET_SUBKEY_LR(15, subRL[14] ^ tt); /* round 12 */ @@ -957,8 +957,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) SET_SUBKEY_LR(17, subRL[17]); /* FLinv(kl4) */ tl = (subRL[15] >> 32) ^ (subRL[15] & ~subRL[17]); - dw = tl & (subRL[17] >> 32), /* FLinv(kl4) */ - tr = subRL[15] ^ rol32(dw, 1); + dw = tl & (subRL[17] >> 32); /* FLinv(kl4) */ + tr = subRL[15] ^ rol32(dw, 1); tt = (tr | ((u64)tl << 32)); SET_SUBKEY_LR(18, subRL[19] ^ tt); /* round 13 */ @@ -972,8 +972,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) SET_SUBKEY_LR(24, subRL[24] ^ subRL[23]); /* kw3 */ } else { tl = (subRL[26] >> 32) ^ (subRL[26] & ~subRL[24]); - dw = tl & (subRL[24] >> 32), /* FL(kl5) */ - tr = subRL[26] ^ rol32(dw, 1); + dw = tl & (subRL[24] >> 32); /* FL(kl5) */ + tr = subRL[26] ^ rol32(dw, 1); tt = (tr | ((u64)tl << 32)); SET_SUBKEY_LR(23, subRL[22] ^ tt); /* round 18 */ @@ -981,8 +981,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) SET_SUBKEY_LR(25, subRL[25]); /* FLinv(kl6) */ tl = (subRL[23] >> 32) ^ (subRL[23] & ~subRL[25]); - dw = tl & (subRL[25] >> 32), /* FLinv(kl6) */ - tr = subRL[23] ^ rol32(dw, 1); + dw = tl & (subRL[25] >> 32); /* FLinv(kl6) */ + tr = subRL[23] ^ rol32(dw, 1); tt = (tr | ((u64)tl << 32)); SET_SUBKEY_LR(26, subRL[27] ^ tt); /* round 19 */ diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S new file mode 100644 index 00000000000..35e97569d05 --- /dev/null +++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S @@ -0,0 +1,643 @@ +######################################################################## +# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions +# +# Copyright (c) 2013, Intel Corporation +# +# Authors: +# Erdinc Ozturk <erdinc.ozturk@intel.com> +# Vinodh Gopal <vinodh.gopal@intel.com> +# James Guilford <james.guilford@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the +# distribution. +# +# * Neither the name of the Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################## +# Function API: +# UINT16 crc_t10dif_pcl( +# UINT16 init_crc, //initial CRC value, 16 bits +# const unsigned char *buf, //buffer pointer to calculate CRC on +# UINT64 len //buffer length in bytes (64-bit data) +# ); +# +# Reference paper titled "Fast CRC Computation for Generic +# Polynomials Using PCLMULQDQ Instruction" +# URL: http://www.intel.com/content/dam/www/public/us/en/documents +# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf +# +# + +#include <linux/linkage.h> + +.text + +#define arg1 %rdi +#define arg2 %rsi +#define arg3 %rdx + +#define arg1_low32 %edi + +ENTRY(crc_t10dif_pcl) +.align 16 + + # adjust the 16-bit initial_crc value, scale it to 32 bits + shl $16, arg1_low32 + + # Allocate Stack Space + mov %rsp, %rcx + sub $16*2, %rsp + # align stack to 16 byte boundary + and $~(0x10 - 1), %rsp + + # check if smaller than 256 + cmp $256, arg3 + + # for sizes less than 128, we can't fold 64B at a time... + jl _less_than_128 + + + # load the initial crc value + movd arg1_low32, %xmm10 # initial crc + + # crc value does not need to be byte-reflected, but it needs + # to be moved to the high part of the register. + # because data will be byte-reflected and will align with + # initial crc at correct place. + pslldq $12, %xmm10 + + movdqa SHUF_MASK(%rip), %xmm11 + # receive the initial 64B data, xor the initial crc value + movdqu 16*0(arg2), %xmm0 + movdqu 16*1(arg2), %xmm1 + movdqu 16*2(arg2), %xmm2 + movdqu 16*3(arg2), %xmm3 + movdqu 16*4(arg2), %xmm4 + movdqu 16*5(arg2), %xmm5 + movdqu 16*6(arg2), %xmm6 + movdqu 16*7(arg2), %xmm7 + + pshufb %xmm11, %xmm0 + # XOR the initial_crc value + pxor %xmm10, %xmm0 + pshufb %xmm11, %xmm1 + pshufb %xmm11, %xmm2 + pshufb %xmm11, %xmm3 + pshufb %xmm11, %xmm4 + pshufb %xmm11, %xmm5 + pshufb %xmm11, %xmm6 + pshufb %xmm11, %xmm7 + + movdqa rk3(%rip), %xmm10 #xmm10 has rk3 and rk4 + #imm value of pclmulqdq instruction + #will determine which constant to use + + ################################################################# + # we subtract 256 instead of 128 to save one instruction from the loop + sub $256, arg3 + + # at this section of the code, there is 64*x+y (0<=y<64) bytes of + # buffer. The _fold_64_B_loop will fold 64B at a time + # until we have 64+y Bytes of buffer + + + # fold 64B at a time. This section of the code folds 4 xmm + # registers in parallel +_fold_64_B_loop: + + # update the buffer pointer + add $128, arg2 # buf += 64# + + movdqu 16*0(arg2), %xmm9 + movdqu 16*1(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm0, %xmm8 + movdqa %xmm1, %xmm13 + pclmulqdq $0x0 , %xmm10, %xmm0 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0 , %xmm10, %xmm1 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm0 + xorps %xmm8 , %xmm0 + pxor %xmm12, %xmm1 + xorps %xmm13, %xmm1 + + movdqu 16*2(arg2), %xmm9 + movdqu 16*3(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm2, %xmm8 + movdqa %xmm3, %xmm13 + pclmulqdq $0x0, %xmm10, %xmm2 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0, %xmm10, %xmm3 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm2 + xorps %xmm8 , %xmm2 + pxor %xmm12, %xmm3 + xorps %xmm13, %xmm3 + + movdqu 16*4(arg2), %xmm9 + movdqu 16*5(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm4, %xmm8 + movdqa %xmm5, %xmm13 + pclmulqdq $0x0, %xmm10, %xmm4 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0, %xmm10, %xmm5 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm4 + xorps %xmm8 , %xmm4 + pxor %xmm12, %xmm5 + xorps %xmm13, %xmm5 + + movdqu 16*6(arg2), %xmm9 + movdqu 16*7(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm6 , %xmm8 + movdqa %xmm7 , %xmm13 + pclmulqdq $0x0 , %xmm10, %xmm6 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0 , %xmm10, %xmm7 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm6 + xorps %xmm8 , %xmm6 + pxor %xmm12, %xmm7 + xorps %xmm13, %xmm7 + + sub $128, arg3 + + # check if there is another 64B in the buffer to be able to fold + jge _fold_64_B_loop + ################################################################## + + + add $128, arg2 + # at this point, the buffer pointer is pointing at the last y Bytes + # of the buffer the 64B of folded data is in 4 of the xmm + # registers: xmm0, xmm1, xmm2, xmm3 + + + # fold the 8 xmm registers to 1 xmm register with different constants + + movdqa rk9(%rip), %xmm10 + movdqa %xmm0, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm0 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm0, %xmm7 + + movdqa rk11(%rip), %xmm10 + movdqa %xmm1, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm1 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm1, %xmm7 + + movdqa rk13(%rip), %xmm10 + movdqa %xmm2, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm2 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm2, %xmm7 + + movdqa rk15(%rip), %xmm10 + movdqa %xmm3, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm3 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm3, %xmm7 + + movdqa rk17(%rip), %xmm10 + movdqa %xmm4, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm4 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm4, %xmm7 + + movdqa rk19(%rip), %xmm10 + movdqa %xmm5, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm5 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm5, %xmm7 + + movdqa rk1(%rip), %xmm10 #xmm10 has rk1 and rk2 + #imm value of pclmulqdq instruction + #will determine which constant to use + movdqa %xmm6, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm6 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm6, %xmm7 + + + # instead of 64, we add 48 to the loop counter to save 1 instruction + # from the loop instead of a cmp instruction, we use the negative + # flag with the jl instruction + add $128-16, arg3 + jl _final_reduction_for_128 + + # now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 + # and the rest is in memory. We can fold 16 bytes at a time if y>=16 + # continue folding 16B at a time + +_16B_reduction_loop: + movdqa %xmm7, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm7 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + movdqu (arg2), %xmm0 + pshufb %xmm11, %xmm0 + pxor %xmm0 , %xmm7 + add $16, arg2 + sub $16, arg3 + # instead of a cmp instruction, we utilize the flags with the + # jge instruction equivalent of: cmp arg3, 16-16 + # check if there is any more 16B in the buffer to be able to fold + jge _16B_reduction_loop + + #now we have 16+z bytes left to reduce, where 0<= z < 16. + #first, we reduce the data in the xmm7 register + + +_final_reduction_for_128: + # check if any more data to fold. If not, compute the CRC of + # the final 128 bits + add $16, arg3 + je _128_done + + # here we are getting data that is less than 16 bytes. + # since we know that there was data before the pointer, we can + # offset the input pointer before the actual point, to receive + # exactly 16 bytes. after that the registers need to be adjusted. +_get_last_two_xmms: + movdqa %xmm7, %xmm2 + + movdqu -16(arg2, arg3), %xmm1 + pshufb %xmm11, %xmm1 + + # get rid of the extra data that was loaded before + # load the shift constant + lea pshufb_shf_table+16(%rip), %rax + sub arg3, %rax + movdqu (%rax), %xmm0 + + # shift xmm2 to the left by arg3 bytes + pshufb %xmm0, %xmm2 + + # shift xmm7 to the right by 16-arg3 bytes + pxor mask1(%rip), %xmm0 + pshufb %xmm0, %xmm7 + pblendvb %xmm2, %xmm1 #xmm0 is implicit + + # fold 16 Bytes + movdqa %xmm1, %xmm2 + movdqa %xmm7, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm7 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm2, %xmm7 + +_128_done: + # compute crc of a 128-bit value + movdqa rk5(%rip), %xmm10 # rk5 and rk6 in xmm10 + movdqa %xmm7, %xmm0 + + #64b fold + pclmulqdq $0x1, %xmm10, %xmm7 + pslldq $8 , %xmm0 + pxor %xmm0, %xmm7 + + #32b fold + movdqa %xmm7, %xmm0 + + pand mask2(%rip), %xmm0 + + psrldq $12, %xmm7 + pclmulqdq $0x10, %xmm10, %xmm7 + pxor %xmm0, %xmm7 + + #barrett reduction +_barrett: + movdqa rk7(%rip), %xmm10 # rk7 and rk8 in xmm10 + movdqa %xmm7, %xmm0 + pclmulqdq $0x01, %xmm10, %xmm7 + pslldq $4, %xmm7 + pclmulqdq $0x11, %xmm10, %xmm7 + + pslldq $4, %xmm7 + pxor %xmm0, %xmm7 + pextrd $1, %xmm7, %eax + +_cleanup: + # scale the result back to 16 bits + shr $16, %eax + mov %rcx, %rsp + ret + +######################################################################## + +.align 16 +_less_than_128: + + # check if there is enough buffer to be able to fold 16B at a time + cmp $32, arg3 + jl _less_than_32 + movdqa SHUF_MASK(%rip), %xmm11 + + # now if there is, load the constants + movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10 + + movd arg1_low32, %xmm0 # get the initial crc value + pslldq $12, %xmm0 # align it to its correct place + movdqu (arg2), %xmm7 # load the plaintext + pshufb %xmm11, %xmm7 # byte-reflect the plaintext + pxor %xmm0, %xmm7 + + + # update the buffer pointer + add $16, arg2 + + # update the counter. subtract 32 instead of 16 to save one + # instruction from the loop + sub $32, arg3 + + jmp _16B_reduction_loop + + +.align 16 +_less_than_32: + # mov initial crc to the return value. this is necessary for + # zero-length buffers. + mov arg1_low32, %eax + test arg3, arg3 + je _cleanup + + movdqa SHUF_MASK(%rip), %xmm11 + + movd arg1_low32, %xmm0 # get the initial crc value + pslldq $12, %xmm0 # align it to its correct place + + cmp $16, arg3 + je _exact_16_left + jl _less_than_16_left + + movdqu (arg2), %xmm7 # load the plaintext + pshufb %xmm11, %xmm7 # byte-reflect the plaintext + pxor %xmm0 , %xmm7 # xor the initial crc value + add $16, arg2 + sub $16, arg3 + movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10 + jmp _get_last_two_xmms + + +.align 16 +_less_than_16_left: + # use stack space to load data less than 16 bytes, zero-out + # the 16B in memory first. + + pxor %xmm1, %xmm1 + mov %rsp, %r11 + movdqa %xmm1, (%r11) + + cmp $4, arg3 + jl _only_less_than_4 + + # backup the counter value + mov arg3, %r9 + cmp $8, arg3 + jl _less_than_8_left + + # load 8 Bytes + mov (arg2), %rax + mov %rax, (%r11) + add $8, %r11 + sub $8, arg3 + add $8, arg2 +_less_than_8_left: + + cmp $4, arg3 + jl _less_than_4_left + + # load 4 Bytes + mov (arg2), %eax + mov %eax, (%r11) + add $4, %r11 + sub $4, arg3 + add $4, arg2 +_less_than_4_left: + + cmp $2, arg3 + jl _less_than_2_left + + # load 2 Bytes + mov (arg2), %ax + mov %ax, (%r11) + add $2, %r11 + sub $2, arg3 + add $2, arg2 +_less_than_2_left: + cmp $1, arg3 + jl _zero_left + + # load 1 Byte + mov (arg2), %al + mov %al, (%r11) +_zero_left: + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + # shl r9, 4 + lea pshufb_shf_table+16(%rip), %rax + sub %r9, %rax + movdqu (%rax), %xmm0 + pxor mask1(%rip), %xmm0 + + pshufb %xmm0, %xmm7 + jmp _128_done + +.align 16 +_exact_16_left: + movdqu (arg2), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + jmp _128_done + +_only_less_than_4: + cmp $3, arg3 + jl _only_less_than_3 + + # load 3 Bytes + mov (arg2), %al + mov %al, (%r11) + + mov 1(arg2), %al + mov %al, 1(%r11) + + mov 2(arg2), %al + mov %al, 2(%r11) + + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + psrldq $5, %xmm7 + + jmp _barrett +_only_less_than_3: + cmp $2, arg3 + jl _only_less_than_2 + + # load 2 Bytes + mov (arg2), %al + mov %al, (%r11) + + mov 1(arg2), %al + mov %al, 1(%r11) + + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + psrldq $6, %xmm7 + + jmp _barrett +_only_less_than_2: + + # load 1 Byte + mov (arg2), %al + mov %al, (%r11) + + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + psrldq $7, %xmm7 + + jmp _barrett + +ENDPROC(crc_t10dif_pcl) + +.data + +# precomputed constants +# these constants are precomputed from the poly: +# 0x8bb70000 (0x8bb7 scaled to 32 bits) +.align 16 +# Q = 0x18BB70000 +# rk1 = 2^(32*3) mod Q << 32 +# rk2 = 2^(32*5) mod Q << 32 +# rk3 = 2^(32*15) mod Q << 32 +# rk4 = 2^(32*17) mod Q << 32 +# rk5 = 2^(32*3) mod Q << 32 +# rk6 = 2^(32*2) mod Q << 32 +# rk7 = floor(2^64/Q) +# rk8 = Q +rk1: +.quad 0x2d56000000000000 +rk2: +.quad 0x06df000000000000 +rk3: +.quad 0x9d9d000000000000 +rk4: +.quad 0x7cf5000000000000 +rk5: +.quad 0x2d56000000000000 +rk6: +.quad 0x1368000000000000 +rk7: +.quad 0x00000001f65a57f8 +rk8: +.quad 0x000000018bb70000 + +rk9: +.quad 0xceae000000000000 +rk10: +.quad 0xbfd6000000000000 +rk11: +.quad 0x1e16000000000000 +rk12: +.quad 0x713c000000000000 +rk13: +.quad 0xf7f9000000000000 +rk14: +.quad 0x80a6000000000000 +rk15: +.quad 0x044c000000000000 +rk16: +.quad 0xe658000000000000 +rk17: +.quad 0xad18000000000000 +rk18: +.quad 0xa497000000000000 +rk19: +.quad 0x6ee3000000000000 +rk20: +.quad 0xe7b5000000000000 + + + +mask1: +.octa 0x80808080808080808080808080808080 +mask2: +.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF + +SHUF_MASK: +.octa 0x000102030405060708090A0B0C0D0E0F + +pshufb_shf_table: +# use these values for shift constants for the pshufb instruction +# different alignments result in values as shown: +# DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1 +# DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2 +# DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3 +# DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4 +# DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5 +# DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6 +# DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7 +# DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8 +# DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9 +# DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10 +# DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11 +# DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12 +# DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13 +# DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14 +# DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15 +.octa 0x8f8e8d8c8b8a89888786858483828100 +.octa 0x000e0d0c0b0a09080706050403020100 diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c new file mode 100644 index 00000000000..7845d7fd54c --- /dev/null +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -0,0 +1,151 @@ +/* + * Cryptographic API. + * + * T10 Data Integrity Field CRC16 Crypto Transform using PCLMULQDQ Instructions + * + * Copyright (C) 2013 Intel Corporation + * Author: Tim Chen <tim.c.chen@linux.intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <linux/types.h> +#include <linux/module.h> +#include <linux/crc-t10dif.h> +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <asm/i387.h> +#include <asm/cpufeature.h> +#include <asm/cpu_device_id.h> + +asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf, + size_t len); + +struct chksum_desc_ctx { + __u16 crc; +}; + +/* + * Steps through buffer one byte at at time, calculates reflected + * crc using table. + */ + +static int chksum_init(struct shash_desc *desc) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->crc = 0; + + return 0; +} + +static int chksum_update(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + if (irq_fpu_usable()) { + kernel_fpu_begin(); + ctx->crc = crc_t10dif_pcl(ctx->crc, data, length); + kernel_fpu_end(); + } else + ctx->crc = crc_t10dif_generic(ctx->crc, data, length); + return 0; +} + +static int chksum_final(struct shash_desc *desc, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + *(__u16 *)out = ctx->crc; + return 0; +} + +static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + if (irq_fpu_usable()) { + kernel_fpu_begin(); + *(__u16 *)out = crc_t10dif_pcl(*crcp, data, len); + kernel_fpu_end(); + } else + *(__u16 *)out = crc_t10dif_generic(*crcp, data, len); + return 0; +} + +static int chksum_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup(&ctx->crc, data, len, out); +} + +static int chksum_digest(struct shash_desc *desc, const u8 *data, + unsigned int length, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup(&ctx->crc, data, length, out); +} + +static struct shash_alg alg = { + .digestsize = CRC_T10DIF_DIGEST_SIZE, + .init = chksum_init, + .update = chksum_update, + .final = chksum_final, + .finup = chksum_finup, + .digest = chksum_digest, + .descsize = sizeof(struct chksum_desc_ctx), + .base = { + .cra_name = "crct10dif", + .cra_driver_name = "crct10dif-pclmul", + .cra_priority = 200, + .cra_blocksize = CRC_T10DIF_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static const struct x86_cpu_id crct10dif_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id); + +static int __init crct10dif_intel_mod_init(void) +{ + if (!x86_match_cpu(crct10dif_cpu_id)) + return -ENODEV; + + return crypto_register_shash(&alg); +} + +static void __exit crct10dif_intel_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(crct10dif_intel_mod_init); +module_exit(crct10dif_intel_mod_fini); + +MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>"); +MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ."); +MODULE_LICENSE("GPL"); + +MODULE_ALIAS("crct10dif"); +MODULE_ALIAS("crct10dif-pclmul"); diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index bccfca68430..665a730307f 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -457,7 +457,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, else put_user_ex(0, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); - err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp); + compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 474dc1b59f7..4299eb05023 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -452,7 +452,7 @@ ia32_badsys: CFI_ENDPROC - .macro PTREGSCALL label, func, arg + .macro PTREGSCALL label, func ALIGN GLOBAL(\label) leaq \func(%rip),%rax diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 2dfac58f3b1..b1977bad543 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -86,6 +86,7 @@ extern int acpi_pci_disabled; extern int acpi_skip_timer_override; extern int acpi_use_timer_override; extern int acpi_fix_pin2_polarity; +extern int acpi_disable_cmcff; extern u8 acpi_sci_flags; extern int acpi_sci_override_gsi; @@ -168,6 +169,7 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf) #define acpi_lapic 0 #define acpi_ioapic 0 +#define acpi_disable_cmcff 0 static inline void acpi_noirq_set(void) { } static inline void acpi_disable_pci(void) { } static inline void disable_acpi(void) { } diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 58ed6d96a6a..0a3f9c9f98d 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -5,6 +5,7 @@ #include <linux/stddef.h> #include <linux/stringify.h> #include <asm/asm.h> +#include <asm/ptrace.h> /* * Alternative inline assembly for SMP. @@ -220,20 +221,11 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len); * no thread can be preempted in the instructions being modified (no iret to an * invalid instruction possible) or if the instructions are changed from a * consistent state to another consistent state atomically. - * More care must be taken when modifying code in the SMP case because of - * Intel's errata. text_poke_smp() takes care that errata, but still - * doesn't support NMI/MCE handler code modifying. * On the local CPU you need to be protected again NMI or MCE handlers seeing an * inconsistent instruction while you patch. */ -struct text_poke_param { - void *addr; - const void *opcode; - size_t len; -}; - extern void *text_poke(void *addr, const void *opcode, size_t len); -extern void *text_poke_smp(void *addr, const void *opcode, size_t len); -extern void text_poke_smp_batch(struct text_poke_param *params, int n); +extern int poke_int3_handler(struct pt_regs *regs); +extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index f8119b582c3..1d2091a226b 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -715,4 +715,6 @@ static inline void exiting_ack_irq(void) ack_APIC_irq(); } +extern void ioapic_zap_locks(void); + #endif /* _ASM_X86_APIC_H */ diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 1c2d247f65c..4582e8e1cd1 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -3,21 +3,25 @@ #ifdef __ASSEMBLY__ # define __ASM_FORM(x) x +# define __ASM_FORM_RAW(x) x # define __ASM_FORM_COMMA(x) x, #else # define __ASM_FORM(x) " " #x " " +# define __ASM_FORM_RAW(x) #x # define __ASM_FORM_COMMA(x) " " #x "," #endif #ifdef CONFIG_X86_32 # define __ASM_SEL(a,b) __ASM_FORM(a) +# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a) #else # define __ASM_SEL(a,b) __ASM_FORM(b) +# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b) #endif #define __ASM_SIZE(inst, ...) __ASM_SEL(inst##l##__VA_ARGS__, \ inst##q##__VA_ARGS__) -#define __ASM_REG(reg) __ASM_SEL(e##reg, r##reg) +#define __ASM_REG(reg) __ASM_SEL_RAW(e##reg, r##reg) #define _ASM_PTR __ASM_SEL(.long, .quad) #define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8) diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 722aa3b0462..da31c8b8a92 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -6,6 +6,7 @@ #include <asm/processor.h> #include <asm/alternative.h> #include <asm/cmpxchg.h> +#include <asm/rmwcc.h> /* * Atomic operations that C can't guarantee us. Useful for @@ -76,12 +77,7 @@ static inline void atomic_sub(int i, atomic_t *v) */ static inline int atomic_sub_and_test(int i, atomic_t *v) { - unsigned char c; - - asm volatile(LOCK_PREFIX "subl %2,%0; sete %1" - : "+m" (v->counter), "=qm" (c) - : "ir" (i) : "memory"); - return c; + GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, i, "%0", "e"); } /** @@ -118,12 +114,7 @@ static inline void atomic_dec(atomic_t *v) */ static inline int atomic_dec_and_test(atomic_t *v) { - unsigned char c; - - asm volatile(LOCK_PREFIX "decl %0; sete %1" - : "+m" (v->counter), "=qm" (c) - : : "memory"); - return c != 0; + GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e"); } /** @@ -136,12 +127,7 @@ static inline int atomic_dec_and_test(atomic_t *v) */ static inline int atomic_inc_and_test(atomic_t *v) { - unsigned char c; - - asm volatile(LOCK_PREFIX "incl %0; sete %1" - : "+m" (v->counter), "=qm" (c) - : : "memory"); - return c != 0; + GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", "e"); } /** @@ -155,12 +141,7 @@ static inline int atomic_inc_and_test(atomic_t *v) */ static inline int atomic_add_negative(int i, atomic_t *v) { - unsigned char c; - - asm volatile(LOCK_PREFIX "addl %2,%0; sets %1" - : "+m" (v->counter), "=qm" (c) - : "ir" (i) : "memory"); - return c; + GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, i, "%0", "s"); } /** diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 0e1cbfc8ee0..3f065c985ae 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -72,12 +72,7 @@ static inline void atomic64_sub(long i, atomic64_t *v) */ static inline int atomic64_sub_and_test(long i, atomic64_t *v) { - unsigned char c; - - asm volatile(LOCK_PREFIX "subq %2,%0; sete %1" - : "=m" (v->counter), "=qm" (c) - : "er" (i), "m" (v->counter) : "memory"); - return c; + GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, i, "%0", "e"); } /** @@ -116,12 +111,7 @@ static inline void atomic64_dec(atomic64_t *v) */ static inline int atomic64_dec_and_test(atomic64_t *v) { - unsigned char c; - - asm volatile(LOCK_PREFIX "decq %0; sete %1" - : "=m" (v->counter), "=qm" (c) - : "m" (v->counter) : "memory"); - return c != 0; + GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", "e"); } /** @@ -134,12 +124,7 @@ static inline int atomic64_dec_and_test(atomic64_t *v) */ static inline int atomic64_inc_and_test(atomic64_t *v) { - unsigned char c; - - asm volatile(LOCK_PREFIX "incq %0; sete %1" - : "=m" (v->counter), "=qm" (c) - : "m" (v->counter) : "memory"); - return c != 0; + GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", "e"); } /** @@ -153,12 +138,7 @@ static inline int atomic64_inc_and_test(atomic64_t *v) */ static inline int atomic64_add_negative(long i, atomic64_t *v) { - unsigned char c; - - asm volatile(LOCK_PREFIX "addq %2,%0; sets %1" - : "=m" (v->counter), "=qm" (c) - : "er" (i), "m" (v->counter) : "memory"); - return c; + GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, i, "%0", "s"); } /** diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 6dfd0195bb5..6d76d093598 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -14,6 +14,15 @@ #include <linux/compiler.h> #include <asm/alternative.h> +#include <asm/rmwcc.h> + +#if BITS_PER_LONG == 32 +# define _BITOPS_LONG_SHIFT 5 +#elif BITS_PER_LONG == 64 +# define _BITOPS_LONG_SHIFT 6 +#else +# error "Unexpected BITS_PER_LONG" +#endif #define BIT_64(n) (U64_C(1) << (n)) @@ -59,7 +68,7 @@ * restricted to acting on a single-word quantity. */ static __always_inline void -set_bit(unsigned int nr, volatile unsigned long *addr) +set_bit(long nr, volatile unsigned long *addr) { if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "orb %1,%0" @@ -81,7 +90,7 @@ set_bit(unsigned int nr, volatile unsigned long *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __set_bit(int nr, volatile unsigned long *addr) +static inline void __set_bit(long nr, volatile unsigned long *addr) { asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory"); } @@ -97,7 +106,7 @@ static inline void __set_bit(int nr, volatile unsigned long *addr) * in order to ensure changes are visible on other processors. */ static __always_inline void -clear_bit(int nr, volatile unsigned long *addr) +clear_bit(long nr, volatile unsigned long *addr) { if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "andb %1,%0" @@ -118,13 +127,13 @@ clear_bit(int nr, volatile unsigned long *addr) * clear_bit() is atomic and implies release semantics before the memory * operation. It can be used for an unlock. */ -static inline void clear_bit_unlock(unsigned nr, volatile unsigned long *addr) +static inline void clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); clear_bit(nr, addr); } -static inline void __clear_bit(int nr, volatile unsigned long *addr) +static inline void __clear_bit(long nr, volatile unsigned long *addr) { asm volatile("btr %1,%0" : ADDR : "Ir" (nr)); } @@ -141,7 +150,7 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr) * No memory barrier is required here, because x86 cannot reorder stores past * older loads. Same principle as spin_unlock. */ -static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr) +static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); __clear_bit(nr, addr); @@ -159,7 +168,7 @@ static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __change_bit(int nr, volatile unsigned long *addr) +static inline void __change_bit(long nr, volatile unsigned long *addr) { asm volatile("btc %1,%0" : ADDR : "Ir" (nr)); } @@ -173,7 +182,7 @@ static inline void __change_bit(int nr, volatile unsigned long *addr) * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void change_bit(int nr, volatile unsigned long *addr) +static inline void change_bit(long nr, volatile unsigned long *addr) { if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "xorb %1,%0" @@ -194,14 +203,9 @@ static inline void change_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_set_bit(int nr, volatile unsigned long *addr) +static inline int test_and_set_bit(long nr, volatile unsigned long *addr) { - int oldbit; - - asm volatile(LOCK_PREFIX "bts %2,%1\n\t" - "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory"); - - return oldbit; + GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, nr, "%0", "c"); } /** @@ -212,7 +216,7 @@ static inline int test_and_set_bit(int nr, volatile unsigned long *addr) * This is the same as test_and_set_bit on x86. */ static __always_inline int -test_and_set_bit_lock(int nr, volatile unsigned long *addr) +test_and_set_bit_lock(long nr, volatile unsigned long *addr) { return test_and_set_bit(nr, addr); } @@ -226,7 +230,7 @@ test_and_set_bit_lock(int nr, volatile unsigned long *addr) * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ -static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) +static inline int __test_and_set_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -245,15 +249,9 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) +static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) { - int oldbit; - - asm volatile(LOCK_PREFIX "btr %2,%1\n\t" - "sbb %0,%0" - : "=r" (oldbit), ADDR : "Ir" (nr) : "memory"); - - return oldbit; + GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, nr, "%0", "c"); } /** @@ -272,7 +270,7 @@ static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) * accessed from a hypervisor on the same CPU if running in a VM: don't change * this without also updating arch/x86/kernel/kvm.c */ -static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) +static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -284,7 +282,7 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) } /* WARNING: non atomic and it can be reordered! */ -static inline int __test_and_change_bit(int nr, volatile unsigned long *addr) +static inline int __test_and_change_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -304,24 +302,18 @@ static inline int __test_and_change_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_change_bit(int nr, volatile unsigned long *addr) +static inline int test_and_change_bit(long nr, volatile unsigned long *addr) { - int oldbit; - - asm volatile(LOCK_PREFIX "btc %2,%1\n\t" - "sbb %0,%0" - : "=r" (oldbit), ADDR : "Ir" (nr) : "memory"); - - return oldbit; + GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, nr, "%0", "c"); } -static __always_inline int constant_test_bit(unsigned int nr, const volatile unsigned long *addr) +static __always_inline int constant_test_bit(long nr, const volatile unsigned long *addr) { - return ((1UL << (nr % BITS_PER_LONG)) & - (addr[nr / BITS_PER_LONG])) != 0; + return ((1UL << (nr & (BITS_PER_LONG-1))) & + (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; } -static inline int variable_test_bit(int nr, volatile const unsigned long *addr) +static inline int variable_test_bit(long nr, volatile const unsigned long *addr) { int oldbit; diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 0fa67503391..cb4c73bfeb4 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -48,6 +48,8 @@ For 32-bit we have the following conventions - kernel is built with #include <asm/dwarf2.h> +#ifdef CONFIG_X86_64 + /* * 64-bit system call stack frame layout defines and helpers, * for assembly code: @@ -192,3 +194,51 @@ For 32-bit we have the following conventions - kernel is built with .macro icebp .byte 0xf1 .endm + +#else /* CONFIG_X86_64 */ + +/* + * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These + * are different from the entry_32.S versions in not changing the segment + * registers. So only suitable for in kernel use, not when transitioning + * from or to user space. The resulting stack frame is not a standard + * pt_regs frame. The main use case is calling C code from assembler + * when all the registers need to be preserved. + */ + + .macro SAVE_ALL + pushl_cfi %eax + CFI_REL_OFFSET eax, 0 + pushl_cfi %ebp + CFI_REL_OFFSET ebp, 0 + pushl_cfi %edi + CFI_REL_OFFSET edi, 0 + pushl_cfi %esi + CFI_REL_OFFSET esi, 0 + pushl_cfi %edx + CFI_REL_OFFSET edx, 0 + pushl_cfi %ecx + CFI_REL_OFFSET ecx, 0 + pushl_cfi %ebx + CFI_REL_OFFSET ebx, 0 + .endm + + .macro RESTORE_ALL + popl_cfi %ebx + CFI_RESTORE ebx + popl_cfi %ecx + CFI_RESTORE ecx + popl_cfi %edx + CFI_RESTORE edx + popl_cfi %esi + CFI_RESTORE esi + popl_cfi %edi + CFI_RESTORE edi + popl_cfi %ebp + CFI_RESTORE ebp + popl_cfi %eax + CFI_RESTORE eax + .endm + +#endif /* CONFIG_X86_64 */ + diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h index 46fc474fd81..f50de695173 100644 --- a/arch/x86/include/asm/checksum_32.h +++ b/arch/x86/include/asm/checksum_32.h @@ -49,9 +49,15 @@ static inline __wsum csum_partial_copy_from_user(const void __user *src, int len, __wsum sum, int *err_ptr) { + __wsum ret; + might_sleep(); - return csum_partial_copy_generic((__force void *)src, dst, - len, sum, err_ptr, NULL); + stac(); + ret = csum_partial_copy_generic((__force void *)src, dst, + len, sum, err_ptr, NULL); + clac(); + + return ret; } /* @@ -176,10 +182,16 @@ static inline __wsum csum_and_copy_to_user(const void *src, int len, __wsum sum, int *err_ptr) { + __wsum ret; + might_sleep(); - if (access_ok(VERIFY_WRITE, dst, len)) - return csum_partial_copy_generic(src, (__force void *)dst, - len, sum, NULL, err_ptr); + if (access_ok(VERIFY_WRITE, dst, len)) { + stac(); + ret = csum_partial_copy_generic(src, (__force void *)dst, + len, sum, NULL, err_ptr); + clac(); + return ret; + } if (len) *err_ptr = -EFAULT; diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h index 9bfdc41629e..e6fd8a026c7 100644 --- a/arch/x86/include/asm/checksum_64.h +++ b/arch/x86/include/asm/checksum_64.h @@ -133,7 +133,7 @@ extern __wsum csum_partial(const void *buff, int len, __wsum sum); /* Do not call this directly. Use the wrappers below */ -extern __wsum csum_partial_copy_generic(const void *src, const void *dst, +extern __visible __wsum csum_partial_copy_generic(const void *src, const void *dst, int len, __wsum sum, int *src_err_ptr, int *dst_err_ptr); diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 47538a61c91..89270b4318d 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -366,14 +366,15 @@ extern bool __static_cpu_has_safe(u16 bit); */ static __always_inline __pure bool __static_cpu_has(u16 bit) { -#if __GNUC__ > 4 || __GNUC_MINOR__ >= 5 +#ifdef CC_HAVE_ASM_GOTO #ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS + /* * Catch too early usage of this before alternatives * have run. */ - asm goto("1: jmp %l[t_warn]\n" + asm_volatile_goto("1: jmp %l[t_warn]\n" "2:\n" ".section .altinstructions,\"a\"\n" " .long 1b - .\n" @@ -384,9 +385,10 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) ".previous\n" /* skipping size check since replacement size = 0 */ : : "i" (X86_FEATURE_ALWAYS) : : t_warn); + #endif - asm goto("1: jmp %l[t_no]\n" + asm_volatile_goto("1: jmp %l[t_no]\n" "2:\n" ".section .altinstructions,\"a\"\n" " .long 1b - .\n" @@ -406,7 +408,9 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) warn_pre_alternatives(); return false; #endif -#else /* GCC_VERSION >= 40500 */ + +#else /* CC_HAVE_ASM_GOTO */ + u8 flag; /* Open-coded due to __stringify() in ALTERNATIVE() */ asm volatile("1: movb $0,%0\n" @@ -427,7 +431,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) ".previous\n" : "=qm" (flag) : "i" (bit)); return flag; -#endif + +#endif /* CC_HAVE_ASM_GOTO */ } #define static_cpu_has(bit) \ @@ -441,14 +446,14 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) static __always_inline __pure bool _static_cpu_has_safe(u16 bit) { -#if __GNUC__ > 4 || __GNUC_MINOR__ >= 5 +#ifdef CC_HAVE_ASM_GOTO /* * We need to spell the jumps to the compiler because, depending on the offset, * the replacement jump can be bigger than the original jump, and this we cannot * have. Thus, we force the jump to the widest, 4-byte, signed relative * offset even though the last would often fit in less bytes. */ - asm goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n" + asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n" "2:\n" ".section .altinstructions,\"a\"\n" " .long 1b - .\n" /* src offset */ @@ -475,7 +480,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) return false; t_dynamic: return __static_cpu_has_safe(bit); -#else /* GCC_VERSION >= 40500 */ +#else u8 flag; /* Open-coded due to __stringify() in ALTERNATIVE() */ asm volatile("1: movb $2,%0\n" @@ -511,7 +516,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) : "=qm" (flag) : "i" (bit), "i" (X86_FEATURE_ALWAYS)); return (flag == 2 ? __static_cpu_has_safe(bit) : flag); -#endif +#endif /* CC_HAVE_ASM_GOTO */ } #define static_cpu_has_safe(bit) \ diff --git a/arch/x86/include/asm/dma-contiguous.h b/arch/x86/include/asm/dma-contiguous.h index c0924165997..b4b38bacb40 100644 --- a/arch/x86/include/asm/dma-contiguous.h +++ b/arch/x86/include/asm/dma-contiguous.h @@ -4,7 +4,6 @@ #ifdef __KERNEL__ #include <linux/types.h> -#include <asm-generic/dma-contiguous.h> static inline void dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) { } diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index cccd07fa5e3..779c2efe2e9 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -29,7 +29,7 @@ extern void e820_setup_gap(void); extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, unsigned long start_addr, unsigned long long end_addr); struct setup_data; -extern void parse_e820_ext(struct setup_data *data); +extern void parse_e820_ext(u64 phys_addr, u32 data_len); #if defined(CONFIG_X86_64) || \ (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 0062a012504..65c6e6e3a55 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -109,6 +109,8 @@ static inline bool efi_is_native(void) return IS_ENABLED(CONFIG_X86_64) == efi_enabled(EFI_64BIT); } +extern struct console early_efi_console; + #else /* * IF EFI is not configured, have the EFI calls return -ENOSYS. diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index e4ac559c4a2..92b3bae08b7 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -26,56 +26,56 @@ #include <asm/sections.h> /* Interrupt handlers registered during init_IRQ */ -extern void apic_timer_interrupt(void); -extern void x86_platform_ipi(void); -extern void kvm_posted_intr_ipi(void); -extern void error_interrupt(void); -extern void irq_work_interrupt(void); - -extern void spurious_interrupt(void); -extern void thermal_interrupt(void); -extern void reschedule_interrupt(void); - -extern void invalidate_interrupt(void); -extern void invalidate_interrupt0(void); -extern void invalidate_interrupt1(void); -extern void invalidate_interrupt2(void); -extern void invalidate_interrupt3(void); -extern void invalidate_interrupt4(void); -extern void invalidate_interrupt5(void); -extern void invalidate_interrupt6(void); -extern void invalidate_interrupt7(void); -extern void invalidate_interrupt8(void); -extern void invalidate_interrupt9(void); -extern void invalidate_interrupt10(void); -extern void invalidate_interrupt11(void); -extern void invalidate_interrupt12(void); -extern void invalidate_interrupt13(void); -extern void invalidate_interrupt14(void); -extern void invalidate_interrupt15(void); -extern void invalidate_interrupt16(void); -extern void invalidate_interrupt17(void); -extern void invalidate_interrupt18(void); -extern void invalidate_interrupt19(void); -extern void invalidate_interrupt20(void); -extern void invalidate_interrupt21(void); -extern void invalidate_interrupt22(void); -extern void invalidate_interrupt23(void); -extern void invalidate_interrupt24(void); -extern void invalidate_interrupt25(void); -extern void invalidate_interrupt26(void); -extern void invalidate_interrupt27(void); -extern void invalidate_interrupt28(void); -extern void invalidate_interrupt29(void); -extern void invalidate_interrupt30(void); -extern void invalidate_interrupt31(void); - -extern void irq_move_cleanup_interrupt(void); -extern void reboot_interrupt(void); -extern void threshold_interrupt(void); - -extern void call_function_interrupt(void); -extern void call_function_single_interrupt(void); +extern asmlinkage void apic_timer_interrupt(void); +extern asmlinkage void x86_platform_ipi(void); +extern asmlinkage void kvm_posted_intr_ipi(void); +extern asmlinkage void error_interrupt(void); +extern asmlinkage void irq_work_interrupt(void); + +extern asmlinkage void spurious_interrupt(void); +extern asmlinkage void thermal_interrupt(void); +extern asmlinkage void reschedule_interrupt(void); + +extern asmlinkage void invalidate_interrupt(void); +extern asmlinkage void invalidate_interrupt0(void); +extern asmlinkage void invalidate_interrupt1(void); +extern asmlinkage void invalidate_interrupt2(void); +extern asmlinkage void invalidate_interrupt3(void); +extern asmlinkage void invalidate_interrupt4(void); +extern asmlinkage void invalidate_interrupt5(void); +extern asmlinkage void invalidate_interrupt6(void); +extern asmlinkage void invalidate_interrupt7(void); +extern asmlinkage void invalidate_interrupt8(void); +extern asmlinkage void invalidate_interrupt9(void); +extern asmlinkage void invalidate_interrupt10(void); +extern asmlinkage void invalidate_interrupt11(void); +extern asmlinkage void invalidate_interrupt12(void); +extern asmlinkage void invalidate_interrupt13(void); +extern asmlinkage void invalidate_interrupt14(void); +extern asmlinkage void invalidate_interrupt15(void); +extern asmlinkage void invalidate_interrupt16(void); +extern asmlinkage void invalidate_interrupt17(void); +extern asmlinkage void invalidate_interrupt18(void); +extern asmlinkage void invalidate_interrupt19(void); +extern asmlinkage void invalidate_interrupt20(void); +extern asmlinkage void invalidate_interrupt21(void); +extern asmlinkage void invalidate_interrupt22(void); +extern asmlinkage void invalidate_interrupt23(void); +extern asmlinkage void invalidate_interrupt24(void); +extern asmlinkage void invalidate_interrupt25(void); +extern asmlinkage void invalidate_interrupt26(void); +extern asmlinkage void invalidate_interrupt27(void); +extern asmlinkage void invalidate_interrupt28(void); +extern asmlinkage void invalidate_interrupt29(void); +extern asmlinkage void invalidate_interrupt30(void); +extern asmlinkage void invalidate_interrupt31(void); + +extern asmlinkage void irq_move_cleanup_interrupt(void); +extern asmlinkage void reboot_interrupt(void); +extern asmlinkage void threshold_interrupt(void); + +extern asmlinkage void call_function_interrupt(void); +extern asmlinkage void call_function_single_interrupt(void); #ifdef CONFIG_TRACING /* Interrupt handlers registered during init_IRQ */ @@ -172,22 +172,18 @@ extern atomic_t irq_mis_count; extern void eisa_set_level_irq(unsigned int irq); /* SMP */ -extern void smp_apic_timer_interrupt(struct pt_regs *); -extern void smp_spurious_interrupt(struct pt_regs *); -extern void smp_x86_platform_ipi(struct pt_regs *); -extern void smp_error_interrupt(struct pt_regs *); +extern __visible void smp_apic_timer_interrupt(struct pt_regs *); +extern __visible void smp_spurious_interrupt(struct pt_regs *); +extern __visible void smp_x86_platform_ipi(struct pt_regs *); +extern __visible void smp_error_interrupt(struct pt_regs *); #ifdef CONFIG_X86_IO_APIC extern asmlinkage void smp_irq_move_cleanup_interrupt(void); #endif #ifdef CONFIG_SMP -extern void smp_reschedule_interrupt(struct pt_regs *); -extern void smp_call_function_interrupt(struct pt_regs *); -extern void smp_call_function_single_interrupt(struct pt_regs *); -#ifdef CONFIG_X86_32 -extern void smp_invalidate_interrupt(struct pt_regs *); -#else -extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *); -#endif +extern __visible void smp_reschedule_interrupt(struct pt_regs *); +extern __visible void smp_call_function_interrupt(struct pt_regs *); +extern __visible void smp_call_function_single_interrupt(struct pt_regs *); +extern __visible void smp_invalidate_interrupt(struct pt_regs *); #endif extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 2d4b5e6107c..e42f758a0fb 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -33,7 +33,7 @@ struct hypervisor_x86 { const char *name; /* Detection routine */ - bool (*detect)(void); + uint32_t (*detect)(void); /* Adjust CPU feature bits (run once per CPU) */ void (*set_cpu_features)(struct cpuinfo_x86 *); diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h new file mode 100644 index 00000000000..459769d3926 --- /dev/null +++ b/arch/x86/include/asm/intel-mid.h @@ -0,0 +1,113 @@ +/* + * intel-mid.h: Intel MID specific setup code + * + * (C) Copyright 2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#ifndef _ASM_X86_INTEL_MID_H +#define _ASM_X86_INTEL_MID_H + +#include <linux/sfi.h> +#include <linux/platform_device.h> + +extern int intel_mid_pci_init(void); +extern int get_gpio_by_name(const char *name); +extern void intel_scu_device_register(struct platform_device *pdev); +extern int __init sfi_parse_mrtc(struct sfi_table_header *table); +extern int __init sfi_parse_mtmr(struct sfi_table_header *table); +extern int sfi_mrtc_num; +extern struct sfi_rtc_table_entry sfi_mrtc_array[]; + +/* + * Here defines the array of devices platform data that IAFW would export + * through SFI "DEVS" table, we use name and type to match the device and + * its platform data. + */ +struct devs_id { + char name[SFI_NAME_LEN + 1]; + u8 type; + u8 delay; + void *(*get_platform_data)(void *info); + /* Custom handler for devices */ + void (*device_handler)(struct sfi_device_table_entry *pentry, + struct devs_id *dev); +}; + +#define sfi_device(i) \ + static const struct devs_id *const __intel_mid_sfi_##i##_dev __used \ + __attribute__((__section__(".x86_intel_mid_dev.init"))) = &i + +/* + * Medfield is the follow-up of Moorestown, it combines two chip solution into + * one. Other than that it also added always-on and constant tsc and lapic + * timers. Medfield is the platform name, and the chip name is called Penwell + * we treat Medfield/Penwell as a variant of Moorestown. Penwell can be + * identified via MSRs. + */ +enum intel_mid_cpu_type { + /* 1 was Moorestown */ + INTEL_MID_CPU_CHIP_PENWELL = 2, +}; + +extern enum intel_mid_cpu_type __intel_mid_cpu_chip; + +#ifdef CONFIG_X86_INTEL_MID + +static inline enum intel_mid_cpu_type intel_mid_identify_cpu(void) +{ + return __intel_mid_cpu_chip; +} + +static inline bool intel_mid_has_msic(void) +{ + return (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_PENWELL); +} + +#else /* !CONFIG_X86_INTEL_MID */ + +#define intel_mid_identify_cpu() (0) +#define intel_mid_has_msic() (0) + +#endif /* !CONFIG_X86_INTEL_MID */ + +enum intel_mid_timer_options { + INTEL_MID_TIMER_DEFAULT, + INTEL_MID_TIMER_APBT_ONLY, + INTEL_MID_TIMER_LAPIC_APBT, +}; + +extern enum intel_mid_timer_options intel_mid_timer_options; + +/* + * Penwell uses spread spectrum clock, so the freq number is not exactly + * the same as reported by MSR based on SDM. + */ +#define PENWELL_FSB_FREQ_83SKU 83200 +#define PENWELL_FSB_FREQ_100SKU 99840 + +#define SFI_MTMR_MAX_NUM 8 +#define SFI_MRTC_MAX 8 + +extern struct console early_mrst_console; +extern void mrst_early_console_init(void); + +extern struct console early_hsu_console; +extern void hsu_early_console_init(const char *); + +extern void intel_scu_devices_create(void); +extern void intel_scu_devices_destroy(void); + +/* VRTC timer */ +#define MRST_VRTC_MAP_SZ (1024) +/*#define MRST_VRTC_PGOFFSET (0xc00) */ + +extern void intel_mid_rtc_init(void); + +/* the offset for the mapping of global gpio pin to irq */ +#define INTEL_MID_IRQ_OFFSET 0x100 + +#endif /* _ASM_X86_INTEL_MID_H */ diff --git a/arch/x86/include/asm/mrst-vrtc.h b/arch/x86/include/asm/intel_mid_vrtc.h index 1e69a75412a..86ff4685c40 100644 --- a/arch/x86/include/asm/mrst-vrtc.h +++ b/arch/x86/include/asm/intel_mid_vrtc.h @@ -1,5 +1,5 @@ -#ifndef _MRST_VRTC_H -#define _MRST_VRTC_H +#ifndef _INTEL_MID_VRTC_H +#define _INTEL_MID_VRTC_H extern unsigned char vrtc_cmos_read(unsigned char reg); extern void vrtc_cmos_write(unsigned char val, unsigned char reg); diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 57873beb329..0ea10f27d61 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -33,7 +33,7 @@ extern void (*x86_platform_ipi_callback)(void); extern void native_init_IRQ(void); extern bool handle_irq(unsigned irq, struct pt_regs *regs); -extern unsigned int do_IRQ(struct pt_regs *regs); +extern __visible unsigned int do_IRQ(struct pt_regs *regs); /* Interrupt vector management */ extern DECLARE_BITMAP(used_vectors, NR_VECTORS); diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 3a16c1483b4..6a2cefb4395 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -3,18 +3,23 @@ #ifdef __KERNEL__ +#include <linux/stringify.h> #include <linux/types.h> #include <asm/nops.h> #include <asm/asm.h> #define JUMP_LABEL_NOP_SIZE 5 -#define STATIC_KEY_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" +#ifdef CONFIG_X86_64 +# define STATIC_KEY_INIT_NOP P6_NOP5_ATOMIC +#else +# define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC +#endif static __always_inline bool arch_static_branch(struct static_key *key) { - asm goto("1:" - STATIC_KEY_INITIAL_NOP + asm_volatile_goto("1:" + ".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t" ".pushsection __jump_table, \"aw\" \n\t" _ASM_ALIGN "\n\t" _ASM_PTR "1b, %l[l_yes], %c0 \n\t" diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 5a6d2873f80..9454c167629 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -49,10 +49,10 @@ typedef u8 kprobe_opcode_t; #define flush_insn_slot(p) do { } while (0) /* optinsn template addresses */ -extern kprobe_opcode_t optprobe_template_entry; -extern kprobe_opcode_t optprobe_template_val; -extern kprobe_opcode_t optprobe_template_call; -extern kprobe_opcode_t optprobe_template_end; +extern __visible kprobe_opcode_t optprobe_template_entry; +extern __visible kprobe_opcode_t optprobe_template_val; +extern __visible kprobe_opcode_t optprobe_template_call; +extern __visible kprobe_opcode_t optprobe_template_end; #define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE) #define MAX_OPTINSN_SIZE \ (((unsigned long)&optprobe_template_end - \ @@ -62,7 +62,7 @@ extern kprobe_opcode_t optprobe_template_end; extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); -void kretprobe_trampoline(void); +asmlinkage void kretprobe_trampoline(void); /* Architecture specific copy of original instruction*/ struct arch_specific_insn { diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f87f7fcefa0..c76ff74a98f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -286,6 +286,7 @@ struct kvm_mmu { u64 *pae_root; u64 *lm_root; u64 rsvd_bits_mask[2][4]; + u64 bad_mt_xwr; /* * Bitmap: bit set = last pte in walk @@ -323,6 +324,7 @@ struct kvm_pmu { u64 global_ovf_ctrl; u64 counter_bitmask[2]; u64 global_ctrl_mask; + u64 reserved_bits; u8 version; struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED]; @@ -511,6 +513,14 @@ struct kvm_vcpu_arch { * instruction. */ bool write_fault_to_shadow_pgtable; + + /* set at EPT violation at this point */ + unsigned long exit_qualification; + + /* pv related host specific info */ + struct { + bool pv_unhalted; + } pv; }; struct kvm_lpage_info { @@ -802,8 +812,8 @@ extern u32 kvm_min_guest_tsc_khz; extern u32 kvm_max_guest_tsc_khz; enum emulation_result { - EMULATE_DONE, /* no further processing */ - EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ + EMULATE_DONE, /* no further processing */ + EMULATE_USER_EXIT, /* kvm_run ready for userspace exit */ EMULATE_FAIL, /* can't emulate this instruction */ }; diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 695399f2d5e..1df11590975 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -85,26 +85,20 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, return ret; } -static inline bool kvm_para_available(void) +static inline uint32_t kvm_cpuid_base(void) { - unsigned int eax, ebx, ecx, edx; - char signature[13]; - if (boot_cpu_data.cpuid_level < 0) - return false; /* So we don't blow up on old processors */ + return 0; /* So we don't blow up on old processors */ - if (cpu_has_hypervisor) { - cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); - memcpy(signature + 0, &ebx, 4); - memcpy(signature + 4, &ecx, 4); - memcpy(signature + 8, &edx, 4); - signature[12] = 0; + if (cpu_has_hypervisor) + return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0); - if (strcmp(signature, "KVMKVMKVM") == 0) - return true; - } + return 0; +} - return false; +static inline bool kvm_para_available(void) +{ + return kvm_cpuid_base() != 0; } static inline unsigned int kvm_arch_para_features(void) @@ -118,10 +112,20 @@ void kvm_async_pf_task_wait(u32 token); void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_pf_reason(void); extern void kvm_disable_steal_time(void); -#else -#define kvm_guest_init() do { } while (0) + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +void __init kvm_spinlock_init(void); +#else /* !CONFIG_PARAVIRT_SPINLOCKS */ +static inline void kvm_spinlock_init(void) +{ +} +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +#else /* CONFIG_KVM_GUEST */ +#define kvm_guest_init() do {} while (0) #define kvm_async_pf_task_wait(T) do {} while(0) #define kvm_async_pf_task_wake(T) do {} while(0) + static inline u32 kvm_read_and_reset_pf_reason(void) { return 0; diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 2d89e3980cb..5b23e605e70 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -52,12 +52,7 @@ static inline void local_sub(long i, local_t *l) */ static inline int local_sub_and_test(long i, local_t *l) { - unsigned char c; - - asm volatile(_ASM_SUB "%2,%0; sete %1" - : "+m" (l->a.counter), "=qm" (c) - : "ir" (i) : "memory"); - return c; + GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, i, "%0", "e"); } /** @@ -70,12 +65,7 @@ static inline int local_sub_and_test(long i, local_t *l) */ static inline int local_dec_and_test(local_t *l) { - unsigned char c; - - asm volatile(_ASM_DEC "%0; sete %1" - : "+m" (l->a.counter), "=qm" (c) - : : "memory"); - return c != 0; + GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, "%0", "e"); } /** @@ -88,12 +78,7 @@ static inline int local_dec_and_test(local_t *l) */ static inline int local_inc_and_test(local_t *l) { - unsigned char c; - - asm volatile(_ASM_INC "%0; sete %1" - : "+m" (l->a.counter), "=qm" (c) - : : "memory"); - return c != 0; + GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, "%0", "e"); } /** @@ -107,12 +92,7 @@ static inline int local_inc_and_test(local_t *l) */ static inline int local_add_negative(long i, local_t *l) { - unsigned char c; - - asm volatile(_ASM_ADD "%2,%0; sets %1" - : "+m" (l->a.counter), "=qm" (c) - : "ir" (i) : "memory"); - return c; + GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, i, "%0", "s"); } /** diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 29e3093bbd2..c696a868756 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -16,6 +16,7 @@ #define MCG_EXT_CNT_SHIFT 16 #define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT) #define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */ +#define MCG_ELOG_P (1ULL<<26) /* Extended error log supported */ /* MCG_STATUS register defines */ #define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ @@ -32,11 +33,20 @@ #define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */ #define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ #define MCI_STATUS_AR (1ULL<<55) /* Action required */ -#define MCACOD 0xffff /* MCA Error Code */ + +/* + * Note that the full MCACOD field of IA32_MCi_STATUS MSR is + * bits 15:0. But bit 12 is the 'F' bit, defined for corrected + * errors to indicate that errors are being filtered by hardware. + * We should mask out bit 12 when looking for specific signatures + * of uncorrected errors - so the F bit is deliberately skipped + * in this #define. + */ +#define MCACOD 0xefff /* MCA Error Code */ /* Architecturally defined codes from SDM Vol. 3B Chapter 15 */ #define MCACOD_SCRUB 0x00C0 /* 0xC0-0xCF Memory Scrubbing */ -#define MCACOD_SCRUBMSK 0xfff0 +#define MCACOD_SCRUBMSK 0xeff0 /* Skip bit 12 ('F' bit) */ #define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */ #define MCACOD_DATA 0x0134 /* Data Load */ #define MCACOD_INSTR 0x0150 /* Instruction Fetch */ @@ -188,6 +198,9 @@ extern void register_mce_write_callback(ssize_t (*)(struct file *filp, const char __user *ubuf, size_t usize, loff_t *off)); +/* Disable CMCI/polling for MCA bank claimed by firmware */ +extern void mce_disable_bank(int bank); + /* * Exception handler */ diff --git a/arch/x86/include/asm/misc.h b/arch/x86/include/asm/misc.h new file mode 100644 index 00000000000..475f5bbc7f5 --- /dev/null +++ b/arch/x86/include/asm/misc.h @@ -0,0 +1,6 @@ +#ifndef _ASM_X86_MISC_H +#define _ASM_X86_MISC_H + +int num_digits(int val); + +#endif /* _ASM_X86_MISC_H */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index cdbf3677610..be12c534fd5 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -45,22 +45,28 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* Re-load page tables */ load_cr3(next->pgd); - /* stop flush ipis for the previous mm */ + /* Stop flush ipis for the previous mm */ cpumask_clear_cpu(cpu, mm_cpumask(prev)); - /* - * load the LDT, if the LDT is different: - */ + /* Load the LDT, if the LDT is different: */ if (unlikely(prev->context.ldt != next->context.ldt)) load_LDT_nolock(&next->context); } #ifdef CONFIG_SMP - else { + else { this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); - if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) { - /* We were in lazy tlb mode and leave_mm disabled + if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { + /* + * On established mms, the mm_cpumask is only changed + * from irq context, from ptep_clear_flush() while in + * lazy tlb mode, and here. Irqs are blocked during + * schedule, protecting us from simultaneous changes. + */ + cpumask_set_cpu(cpu, mm_cpumask(next)); + /* + * We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. We must reload CR3 * to make sure to use no freed page tables. */ diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h deleted file mode 100644 index fc18bf3ce7c..00000000000 --- a/arch/x86/include/asm/mrst.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * mrst.h: Intel Moorestown platform specific setup code - * - * (C) Copyright 2009 Intel Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ -#ifndef _ASM_X86_MRST_H -#define _ASM_X86_MRST_H - -#include <linux/sfi.h> - -extern int pci_mrst_init(void); -extern int __init sfi_parse_mrtc(struct sfi_table_header *table); -extern int sfi_mrtc_num; -extern struct sfi_rtc_table_entry sfi_mrtc_array[]; - -/* - * Medfield is the follow-up of Moorestown, it combines two chip solution into - * one. Other than that it also added always-on and constant tsc and lapic - * timers. Medfield is the platform name, and the chip name is called Penwell - * we treat Medfield/Penwell as a variant of Moorestown. Penwell can be - * identified via MSRs. - */ -enum mrst_cpu_type { - /* 1 was Moorestown */ - MRST_CPU_CHIP_PENWELL = 2, -}; - -extern enum mrst_cpu_type __mrst_cpu_chip; - -#ifdef CONFIG_X86_INTEL_MID - -static inline enum mrst_cpu_type mrst_identify_cpu(void) -{ - return __mrst_cpu_chip; -} - -#else /* !CONFIG_X86_INTEL_MID */ - -#define mrst_identify_cpu() (0) - -#endif /* !CONFIG_X86_INTEL_MID */ - -enum mrst_timer_options { - MRST_TIMER_DEFAULT, - MRST_TIMER_APBT_ONLY, - MRST_TIMER_LAPIC_APBT, -}; - -extern enum mrst_timer_options mrst_timer_options; - -/* - * Penwell uses spread spectrum clock, so the freq number is not exactly - * the same as reported by MSR based on SDM. - */ -#define PENWELL_FSB_FREQ_83SKU 83200 -#define PENWELL_FSB_FREQ_100SKU 99840 - -#define SFI_MTMR_MAX_NUM 8 -#define SFI_MRTC_MAX 8 - -extern struct console early_mrst_console; -extern void mrst_early_console_init(void); - -extern struct console early_hsu_console; -extern void hsu_early_console_init(const char *); - -extern void intel_scu_devices_create(void); -extern void intel_scu_devices_destroy(void); - -/* VRTC timer */ -#define MRST_VRTC_MAP_SZ (1024) -/*#define MRST_VRTC_PGOFFSET (0xc00) */ - -extern void mrst_rtc_init(void); - -#endif /* _ASM_X86_MRST_H */ diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h index 2c543fff241..07537a44216 100644 --- a/arch/x86/include/asm/mutex_64.h +++ b/arch/x86/include/asm/mutex_64.h @@ -16,6 +16,20 @@ * * Atomically decrements @v and calls <fail_fn> if the result is negative. */ +#ifdef CC_HAVE_ASM_GOTO +static inline void __mutex_fastpath_lock(atomic_t *v, + void (*fail_fn)(atomic_t *)) +{ + asm_volatile_goto(LOCK_PREFIX " decl %0\n" + " jns %l[exit]\n" + : : "m" (v->counter) + : "memory", "cc" + : exit); + fail_fn(v); +exit: + return; +} +#else #define __mutex_fastpath_lock(v, fail_fn) \ do { \ unsigned long dummy; \ @@ -32,6 +46,7 @@ do { \ : "rax", "rsi", "rdx", "rcx", \ "r8", "r9", "r10", "r11", "memory"); \ } while (0) +#endif /** * __mutex_fastpath_lock_retval - try to take the lock by moving the count @@ -56,6 +71,20 @@ static inline int __mutex_fastpath_lock_retval(atomic_t *count) * * Atomically increments @v and calls <fail_fn> if the result is nonpositive. */ +#ifdef CC_HAVE_ASM_GOTO +static inline void __mutex_fastpath_unlock(atomic_t *v, + void (*fail_fn)(atomic_t *)) +{ + asm_volatile_goto(LOCK_PREFIX " incl %0\n" + " jg %l[exit]\n" + : : "m" (v->counter) + : "memory", "cc" + : exit); + fail_fn(v); +exit: + return; +} +#else #define __mutex_fastpath_unlock(v, fail_fn) \ do { \ unsigned long dummy; \ @@ -72,6 +101,7 @@ do { \ : "rax", "rsi", "rdx", "rcx", \ "r8", "r9", "r10", "r11", "memory"); \ } while (0) +#endif #define __mutex_slowpath_needs_to_unlock() 1 diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index ef17af01347..f48b17df422 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -15,6 +15,8 @@ */ #define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) +#define __START_KERNEL_map __PAGE_OFFSET + #define THREAD_SIZE_ORDER 1 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 6c896fbe21d..43dcd804ebd 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -32,11 +32,6 @@ */ #define __PAGE_OFFSET _AC(0xffff880000000000, UL) -#define __PHYSICAL_START ((CONFIG_PHYSICAL_START + \ - (CONFIG_PHYSICAL_ALIGN - 1)) & \ - ~(CONFIG_PHYSICAL_ALIGN - 1)) - -#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) #define __START_KERNEL_map _AC(0xffffffff80000000, UL) /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 54c97879195..f97fbe3abb6 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -33,6 +33,11 @@ (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#define __PHYSICAL_START ALIGN(CONFIG_PHYSICAL_START, \ + CONFIG_PHYSICAL_ALIGN) + +#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) + #ifdef CONFIG_X86_64 #include <asm/page_64_types.h> #else diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index cfdc9ee4c90..401f350ef71 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -712,36 +712,16 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) -static inline int arch_spin_is_locked(struct arch_spinlock *lock) +static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, + __ticket_t ticket) { - return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock); + PVOP_VCALLEE2(pv_lock_ops.lock_spinning, lock, ticket); } -static inline int arch_spin_is_contended(struct arch_spinlock *lock) +static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock, + __ticket_t ticket) { - return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock); -} -#define arch_spin_is_contended arch_spin_is_contended - -static __always_inline void arch_spin_lock(struct arch_spinlock *lock) -{ - PVOP_VCALL1(pv_lock_ops.spin_lock, lock); -} - -static __always_inline void arch_spin_lock_flags(struct arch_spinlock *lock, - unsigned long flags) -{ - PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags); -} - -static __always_inline int arch_spin_trylock(struct arch_spinlock *lock) -{ - return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock); -} - -static __always_inline void arch_spin_unlock(struct arch_spinlock *lock) -{ - PVOP_VCALL1(pv_lock_ops.spin_unlock, lock); + PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket); } #endif diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 0db1fcac668..aab8f671b52 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -327,13 +327,15 @@ struct pv_mmu_ops { }; struct arch_spinlock; +#ifdef CONFIG_SMP +#include <asm/spinlock_types.h> +#else +typedef u16 __ticket_t; +#endif + struct pv_lock_ops { - int (*spin_is_locked)(struct arch_spinlock *lock); - int (*spin_is_contended)(struct arch_spinlock *lock); - void (*spin_lock)(struct arch_spinlock *lock); - void (*spin_lock_flags)(struct arch_spinlock *lock, unsigned long flags); - int (*spin_trylock)(struct arch_spinlock *lock); - void (*spin_unlock)(struct arch_spinlock *lock); + struct paravirt_callee_save lock_spinning; + void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket); }; /* This contains all the paravirt structures: we get a convenient @@ -387,7 +389,8 @@ extern struct pv_lock_ops pv_lock_ops; /* Simple instruction patching code. */ #define DEF_NATIVE(ops, name, code) \ - extern const char start_##ops##_##name[], end_##ops##_##name[]; \ + extern const char start_##ops##_##name[] __visible, \ + end_##ops##_##name[] __visible; \ asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") unsigned paravirt_patch_nop(void); diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index d9e9e6c7ed3..7d7443283a9 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -100,29 +100,6 @@ static inline void early_quirks(void) { } extern void pci_iommu_alloc(void); #ifdef CONFIG_PCI_MSI -/* MSI arch specific hooks */ -static inline int x86_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) -{ - return x86_msi.setup_msi_irqs(dev, nvec, type); -} - -static inline void x86_teardown_msi_irqs(struct pci_dev *dev) -{ - x86_msi.teardown_msi_irqs(dev); -} - -static inline void x86_teardown_msi_irq(unsigned int irq) -{ - x86_msi.teardown_msi_irq(irq); -} -static inline void x86_restore_msi_irqs(struct pci_dev *dev, int irq) -{ - x86_msi.restore_msi_irqs(dev, irq); -} -#define arch_setup_msi_irqs x86_setup_msi_irqs -#define arch_teardown_msi_irqs x86_teardown_msi_irqs -#define arch_teardown_msi_irq x86_teardown_msi_irq -#define arch_restore_msi_irqs x86_restore_msi_irqs /* implemented in arch/x86/kernel/apic/io_apic. */ struct msi_desc; int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); @@ -130,16 +107,9 @@ void native_teardown_msi_irq(unsigned int irq); void native_restore_msi_irqs(struct pci_dev *dev, int irq); int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, unsigned int irq_base, unsigned int irq_offset); -/* default to the implementation in drivers/lib/msi.c */ -#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS -#define HAVE_DEFAULT_MSI_RESTORE_IRQS -void default_teardown_msi_irqs(struct pci_dev *dev); -void default_restore_msi_irqs(struct pci_dev *dev, int irq); #else #define native_setup_msi_irqs NULL #define native_teardown_msi_irq NULL -#define default_teardown_msi_irqs NULL -#define default_restore_msi_irqs NULL #endif #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 0da5200ee79..b3e18f80030 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -128,7 +128,8 @@ do { \ do { \ typedef typeof(var) pao_T__; \ const int pao_ID__ = (__builtin_constant_p(val) && \ - ((val) == 1 || (val) == -1)) ? (val) : 0; \ + ((val) == 1 || (val) == -1)) ? \ + (int)(val) : 0; \ if (0) { \ pao_T__ pao_tmp__; \ pao_tmp__ = (val); \ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1c00631164c..3d199945870 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -22,7 +22,8 @@ * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. */ -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] + __visible; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) extern spinlock_t pgd_lock; @@ -314,21 +315,6 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } -static inline pte_t pte_swp_mksoft_dirty(pte_t pte) -{ - return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); -} - -static inline int pte_swp_soft_dirty(pte_t pte) -{ - return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; -} - -static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) -{ - return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); -} - static inline pte_t pte_file_clear_soft_dirty(pte_t pte) { return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); @@ -445,6 +431,7 @@ pte_t *populate_extra_pte(unsigned long vaddr); #ifndef __ASSEMBLY__ #include <linux/mm_types.h> +#include <linux/mmdebug.h> #include <linux/log2.h> static inline int pte_none(pte_t pte) @@ -863,6 +850,24 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, { } +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) +{ + VM_BUG_ON(pte_present(pte)); + return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + +static inline int pte_swp_soft_dirty(pte_t pte) +{ + VM_BUG_ON(pte_present(pte)); + return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; +} + +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) +{ + VM_BUG_ON(pte_present(pte)); + return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + #include <asm-generic/pgtable.h> #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index f4843e03113..0ecac257fb2 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -75,6 +75,9 @@ * with swap entry format. On x86 bits 6 and 7 are *not* involved * into swap entry computation, but bit 6 is used for nonlinear * file mapping, so we borrow bit 7 for soft dirty tracking. + * + * Please note that this bit must be treated as swap dirty page + * mark if and only if the PTE has present bit clear! */ #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h new file mode 100644 index 00000000000..8729723636f --- /dev/null +++ b/arch/x86/include/asm/preempt.h @@ -0,0 +1,100 @@ +#ifndef __ASM_PREEMPT_H +#define __ASM_PREEMPT_H + +#include <asm/rmwcc.h> +#include <asm/percpu.h> +#include <linux/thread_info.h> + +DECLARE_PER_CPU(int, __preempt_count); + +/* + * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users + * that think a non-zero value indicates we cannot preempt. + */ +static __always_inline int preempt_count(void) +{ + return __this_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED; +} + +static __always_inline void preempt_count_set(int pc) +{ + __this_cpu_write_4(__preempt_count, pc); +} + +/* + * must be macros to avoid header recursion hell + */ +#define task_preempt_count(p) \ + (task_thread_info(p)->saved_preempt_count & ~PREEMPT_NEED_RESCHED) + +#define init_task_preempt_count(p) do { \ + task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \ +} while (0) + +#define init_idle_preempt_count(p, cpu) do { \ + task_thread_info(p)->saved_preempt_count = PREEMPT_ENABLED; \ + per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \ +} while (0) + +/* + * We fold the NEED_RESCHED bit into the preempt count such that + * preempt_enable() can decrement and test for needing to reschedule with a + * single instruction. + * + * We invert the actual bit, so that when the decrement hits 0 we know we both + * need to resched (the bit is cleared) and can resched (no preempt count). + */ + +static __always_inline void set_preempt_need_resched(void) +{ + __this_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED); +} + +static __always_inline void clear_preempt_need_resched(void) +{ + __this_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED); +} + +static __always_inline bool test_preempt_need_resched(void) +{ + return !(__this_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED); +} + +/* + * The various preempt_count add/sub methods + */ + +static __always_inline void __preempt_count_add(int val) +{ + __this_cpu_add_4(__preempt_count, val); +} + +static __always_inline void __preempt_count_sub(int val) +{ + __this_cpu_add_4(__preempt_count, -val); +} + +static __always_inline bool __preempt_count_dec_and_test(void) +{ + GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e"); +} + +/* + * Returns true when we need to resched and can (barring IRQ state). + */ +static __always_inline bool should_resched(void) +{ + return unlikely(!__this_cpu_read_4(__preempt_count)); +} + +#ifdef CONFIG_PREEMPT + extern asmlinkage void ___preempt_schedule(void); +# define __preempt_schedule() asm ("call ___preempt_schedule") + extern asmlinkage void preempt_schedule(void); +# ifdef CONFIG_CONTEXT_TRACKING + extern asmlinkage void ___preempt_schedule_context(void); +# define __preempt_schedule_context() asm ("call ___preempt_schedule_context") +# endif +#endif + +#endif /* __ASM_PREEMPT_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 24cf5aefb70..987c75ecc33 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -412,7 +412,7 @@ union irq_stack_union { }; }; -DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union); +DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; DECLARE_INIT_PER_CPU(irq_stack_union); DECLARE_PER_CPU(char *, irq_stack_ptr); @@ -942,33 +942,19 @@ extern int set_tsc_mode(unsigned int val); extern u16 amd_get_nb_id(int cpu); -struct aperfmperf { - u64 aperf, mperf; -}; - -static inline void get_aperfmperf(struct aperfmperf *am) +static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) { - WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF)); - - rdmsrl(MSR_IA32_APERF, am->aperf); - rdmsrl(MSR_IA32_MPERF, am->mperf); -} + uint32_t base, eax, signature[3]; -#define APERFMPERF_SHIFT 10 + for (base = 0x40000000; base < 0x40010000; base += 0x100) { + cpuid(base, &eax, &signature[0], &signature[1], &signature[2]); -static inline -unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, - struct aperfmperf *new) -{ - u64 aperf = new->aperf - old->aperf; - u64 mperf = new->mperf - old->mperf; - unsigned long ratio = aperf; - - mperf >>= APERFMPERF_SHIFT; - if (mperf) - ratio = div64_u64(aperf, mperf); + if (!memcmp(sig, signature, 12) && + (leaves == 0 || ((eax - base) >= leaves))) + return base; + } - return ratio; + return 0; } extern unsigned long arch_align_stack(unsigned long sp); diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 109a9dd5d45..be8269b00e2 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -93,7 +93,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, struct pvclock_vsyscall_time_info { struct pvclock_vcpu_time_info pvti; - u32 migrate_count; } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h new file mode 100644 index 00000000000..1ff990f1de8 --- /dev/null +++ b/arch/x86/include/asm/rmwcc.h @@ -0,0 +1,41 @@ +#ifndef _ASM_X86_RMWcc +#define _ASM_X86_RMWcc + +#ifdef CC_HAVE_ASM_GOTO + +#define __GEN_RMWcc(fullop, var, cc, ...) \ +do { \ + asm_volatile_goto (fullop "; j" cc " %l[cc_label]" \ + : : "m" (var), ## __VA_ARGS__ \ + : "memory" : cc_label); \ + return 0; \ +cc_label: \ + return 1; \ +} while (0) + +#define GEN_UNARY_RMWcc(op, var, arg0, cc) \ + __GEN_RMWcc(op " " arg0, var, cc) + +#define GEN_BINARY_RMWcc(op, var, val, arg0, cc) \ + __GEN_RMWcc(op " %1, " arg0, var, cc, "er" (val)) + +#else /* !CC_HAVE_ASM_GOTO */ + +#define __GEN_RMWcc(fullop, var, cc, ...) \ +do { \ + char c; \ + asm volatile (fullop "; set" cc " %1" \ + : "+m" (var), "=qm" (c) \ + : __VA_ARGS__ : "memory"); \ + return c != 0; \ +} while (0) + +#define GEN_UNARY_RMWcc(op, var, arg0, cc) \ + __GEN_RMWcc(op " " arg0, var, cc) + +#define GEN_BINARY_RMWcc(op, var, val, arg0, cc) \ + __GEN_RMWcc(op " %2, " arg0, var, cc, "er" (val)) + +#endif /* CC_HAVE_ASM_GOTO */ + +#endif /* _ASM_X86_RMWcc */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index b7bf3505e1e..59bcf4e2241 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -6,6 +6,8 @@ #define COMMAND_LINE_SIZE 2048 +#include <linux/linkage.h> + #ifdef __i386__ #include <linux/pfn.h> @@ -49,9 +51,9 @@ extern void i386_reserve_resources(void); extern void setup_default_timer_irq(void); #ifdef CONFIG_X86_INTEL_MID -extern void x86_mrst_early_setup(void); +extern void x86_intel_mid_early_setup(void); #else -static inline void x86_mrst_early_setup(void) { } +static inline void x86_intel_mid_early_setup(void) { } #endif #ifdef CONFIG_X86_INTEL_CE @@ -108,11 +110,11 @@ void *extend_brk(size_t size, size_t align); extern void probe_roms(void); #ifdef __i386__ -void __init i386_start_kernel(void); +asmlinkage void __init i386_start_kernel(void); #else -void __init x86_64_start_kernel(char *real_mode); -void __init x86_64_start_reservations(char *real_mode_data); +asmlinkage void __init x86_64_start_kernel(char *real_mode); +asmlinkage void __init x86_64_start_reservations(char *real_mode_data); #endif /* __i386__ */ #endif /* _SETUP */ diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 2f4d924fe6c..645cad2c95f 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -101,7 +101,7 @@ static inline void native_wbinvd(void) asm volatile("wbinvd": : :"memory"); } -extern void native_load_gs_index(unsigned); +extern asmlinkage void native_load_gs_index(unsigned); #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index e3ddd7db723..bf156ded74b 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -1,11 +1,14 @@ #ifndef _ASM_X86_SPINLOCK_H #define _ASM_X86_SPINLOCK_H +#include <linux/jump_label.h> #include <linux/atomic.h> #include <asm/page.h> #include <asm/processor.h> #include <linux/compiler.h> #include <asm/paravirt.h> +#include <asm/bitops.h> + /* * Your basic SMP spinlocks, allowing only a single CPU anywhere * @@ -34,6 +37,36 @@ # define UNLOCK_LOCK_PREFIX #endif +/* How long a lock should spin before we consider blocking */ +#define SPIN_THRESHOLD (1 << 15) + +extern struct static_key paravirt_ticketlocks_enabled; +static __always_inline bool static_key_false(struct static_key *key); + +#ifdef CONFIG_PARAVIRT_SPINLOCKS + +static inline void __ticket_enter_slowpath(arch_spinlock_t *lock) +{ + set_bit(0, (volatile unsigned long *)&lock->tickets.tail); +} + +#else /* !CONFIG_PARAVIRT_SPINLOCKS */ +static __always_inline void __ticket_lock_spinning(arch_spinlock_t *lock, + __ticket_t ticket) +{ +} +static inline void __ticket_unlock_kick(arch_spinlock_t *lock, + __ticket_t ticket) +{ +} + +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) +{ + return lock.tickets.head == lock.tickets.tail; +} + /* * Ticket locks are conceptually two parts, one indicating the current head of * the queue, and the other indicating the current tail. The lock is acquired @@ -47,81 +80,101 @@ * in the high part, because a wide xadd increment of the low part would carry * up and contaminate the high part. */ -static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) +static __always_inline void arch_spin_lock(arch_spinlock_t *lock) { - register struct __raw_tickets inc = { .tail = 1 }; + register struct __raw_tickets inc = { .tail = TICKET_LOCK_INC }; inc = xadd(&lock->tickets, inc); + if (likely(inc.head == inc.tail)) + goto out; + inc.tail &= ~TICKET_SLOWPATH_FLAG; for (;;) { - if (inc.head == inc.tail) - break; - cpu_relax(); - inc.head = ACCESS_ONCE(lock->tickets.head); + unsigned count = SPIN_THRESHOLD; + + do { + if (ACCESS_ONCE(lock->tickets.head) == inc.tail) + goto out; + cpu_relax(); + } while (--count); + __ticket_lock_spinning(lock, inc.tail); } - barrier(); /* make sure nothing creeps before the lock is taken */ +out: barrier(); /* make sure nothing creeps before the lock is taken */ } -static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) +static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) { arch_spinlock_t old, new; old.tickets = ACCESS_ONCE(lock->tickets); - if (old.tickets.head != old.tickets.tail) + if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG)) return 0; - new.head_tail = old.head_tail + (1 << TICKET_SHIFT); + new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT); /* cmpxchg is a full barrier, so nothing can move before it */ return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail; } -static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) +static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock, + arch_spinlock_t old) { - __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX); + arch_spinlock_t new; + + BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); + + /* Perform the unlock on the "before" copy */ + old.tickets.head += TICKET_LOCK_INC; + + /* Clear the slowpath flag */ + new.head_tail = old.head_tail & ~(TICKET_SLOWPATH_FLAG << TICKET_SHIFT); + + /* + * If the lock is uncontended, clear the flag - use cmpxchg in + * case it changes behind our back though. + */ + if (new.tickets.head != new.tickets.tail || + cmpxchg(&lock->head_tail, old.head_tail, + new.head_tail) != old.head_tail) { + /* + * Lock still has someone queued for it, so wake up an + * appropriate waiter. + */ + __ticket_unlock_kick(lock, old.tickets.head); + } } -static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) +static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) { - struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + if (TICKET_SLOWPATH_FLAG && + static_key_false(¶virt_ticketlocks_enabled)) { + arch_spinlock_t prev; - return tmp.tail != tmp.head; -} + prev = *lock; + add_smp(&lock->tickets.head, TICKET_LOCK_INC); -static inline int __ticket_spin_is_contended(arch_spinlock_t *lock) -{ - struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + /* add_smp() is a full mb() */ - return (__ticket_t)(tmp.tail - tmp.head) > 1; + if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) + __ticket_unlock_slowpath(lock, prev); + } else + __add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX); } -#ifndef CONFIG_PARAVIRT_SPINLOCKS - static inline int arch_spin_is_locked(arch_spinlock_t *lock) { - return __ticket_spin_is_locked(lock); -} - -static inline int arch_spin_is_contended(arch_spinlock_t *lock) -{ - return __ticket_spin_is_contended(lock); -} -#define arch_spin_is_contended arch_spin_is_contended + struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); -static __always_inline void arch_spin_lock(arch_spinlock_t *lock) -{ - __ticket_spin_lock(lock); + return tmp.tail != tmp.head; } -static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) +static inline int arch_spin_is_contended(arch_spinlock_t *lock) { - return __ticket_spin_trylock(lock); -} + struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); -static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) -{ - __ticket_spin_unlock(lock); + return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC; } +#define arch_spin_is_contended arch_spin_is_contended static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) @@ -129,8 +182,6 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, arch_spin_lock(lock); } -#endif /* CONFIG_PARAVIRT_SPINLOCKS */ - static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) { while (arch_spin_is_locked(lock)) diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index ad0ad07fc00..4f1bea19945 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -1,13 +1,17 @@ #ifndef _ASM_X86_SPINLOCK_TYPES_H #define _ASM_X86_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H -# error "please don't include this file directly" -#endif - #include <linux/types.h> -#if (CONFIG_NR_CPUS < 256) +#ifdef CONFIG_PARAVIRT_SPINLOCKS +#define __TICKET_LOCK_INC 2 +#define TICKET_SLOWPATH_FLAG ((__ticket_t)1) +#else +#define __TICKET_LOCK_INC 1 +#define TICKET_SLOWPATH_FLAG ((__ticket_t)0) +#endif + +#if (CONFIG_NR_CPUS < (256 / __TICKET_LOCK_INC)) typedef u8 __ticket_t; typedef u16 __ticketpair_t; #else @@ -15,6 +19,8 @@ typedef u16 __ticket_t; typedef u32 __ticketpair_t; #endif +#define TICKET_LOCK_INC ((__ticket_t)__TICKET_LOCK_INC) + #define TICKET_SHIFT (sizeof(__ticket_t) * 8) typedef struct arch_spinlock { diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 4ec45b3abba..d7f3b3b78ac 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -2,8 +2,8 @@ #define _ASM_X86_SWITCH_TO_H struct task_struct; /* one of the stranger aspects of C forward declarations */ -struct task_struct *__switch_to(struct task_struct *prev, - struct task_struct *next); +__visible struct task_struct *__switch_to(struct task_struct *prev, + struct task_struct *next); struct tss_struct; void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss); diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h index 9d09b4073b6..05af3b31d52 100644 --- a/arch/x86/include/asm/sync_bitops.h +++ b/arch/x86/include/asm/sync_bitops.h @@ -26,9 +26,9 @@ * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void sync_set_bit(int nr, volatile unsigned long *addr) +static inline void sync_set_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; btsl %1,%0" + asm volatile("lock; bts %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -44,9 +44,9 @@ static inline void sync_set_bit(int nr, volatile unsigned long *addr) * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() * in order to ensure changes are visible on other processors. */ -static inline void sync_clear_bit(int nr, volatile unsigned long *addr) +static inline void sync_clear_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; btrl %1,%0" + asm volatile("lock; btr %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -61,9 +61,9 @@ static inline void sync_clear_bit(int nr, volatile unsigned long *addr) * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void sync_change_bit(int nr, volatile unsigned long *addr) +static inline void sync_change_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; btcl %1,%0" + asm volatile("lock; btc %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -77,11 +77,11 @@ static inline void sync_change_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int sync_test_and_set_bit(int nr, volatile unsigned long *addr) +static inline int sync_test_and_set_bit(long nr, volatile unsigned long *addr) { int oldbit; - asm volatile("lock; btsl %2,%1\n\tsbbl %0,%0" + asm volatile("lock; bts %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); return oldbit; @@ -95,11 +95,11 @@ static inline int sync_test_and_set_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int sync_test_and_clear_bit(int nr, volatile unsigned long *addr) +static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) { int oldbit; - asm volatile("lock; btrl %2,%1\n\tsbbl %0,%0" + asm volatile("lock; btr %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); return oldbit; @@ -113,11 +113,11 @@ static inline int sync_test_and_clear_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int sync_test_and_change_bit(int nr, volatile unsigned long *addr) +static inline int sync_test_and_change_bit(long nr, volatile unsigned long *addr) { int oldbit; - asm volatile("lock; btcl %2,%1\n\tsbbl %0,%0" + asm volatile("lock; btc %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); return oldbit; diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 2e188d68397..aea284b4131 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -20,7 +20,8 @@ #include <asm/thread_info.h> /* for TS_COMPAT */ #include <asm/unistd.h> -extern const unsigned long sys_call_table[]; +typedef void (*sys_call_ptr_t)(void); +extern const sys_call_ptr_t sys_call_table[]; /* * Only the low 32 bits of orig_ax are meaningful, so we return int. diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 2917a6452c4..592a6a672e0 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -24,7 +24,7 @@ asmlinkage long sys_iopl(unsigned int); asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); /* kernel/signal.c */ -long sys_rt_sigreturn(void); +asmlinkage long sys_rt_sigreturn(void); /* kernel/tls.c */ asmlinkage long sys_set_thread_area(struct user_desc __user *); @@ -34,7 +34,7 @@ asmlinkage long sys_get_thread_area(struct user_desc __user *); #ifdef CONFIG_X86_32 /* kernel/signal.c */ -unsigned long sys_sigreturn(void); +asmlinkage unsigned long sys_sigreturn(void); /* kernel/vm86_32.c */ asmlinkage long sys_vm86old(struct vm86_struct __user *); @@ -44,7 +44,7 @@ asmlinkage long sys_vm86(unsigned long, unsigned long); /* X86_64 only */ /* kernel/process_64.c */ -long sys_arch_prctl(int, unsigned long); +asmlinkage long sys_arch_prctl(int, unsigned long); /* kernel/sys_x86_64.c */ asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, diff --git a/arch/x86/include/asm/sysfb.h b/arch/x86/include/asm/sysfb.h new file mode 100644 index 00000000000..2aeb3e25579 --- /dev/null +++ b/arch/x86/include/asm/sysfb.h @@ -0,0 +1,98 @@ +#ifndef _ARCH_X86_KERNEL_SYSFB_H +#define _ARCH_X86_KERNEL_SYSFB_H + +/* + * Generic System Framebuffers on x86 + * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include <linux/kernel.h> +#include <linux/platform_data/simplefb.h> +#include <linux/screen_info.h> + +enum { + M_I17, /* 17-Inch iMac */ + M_I20, /* 20-Inch iMac */ + M_I20_SR, /* 20-Inch iMac (Santa Rosa) */ + M_I24, /* 24-Inch iMac */ + M_I24_8_1, /* 24-Inch iMac, 8,1th gen */ + M_I24_10_1, /* 24-Inch iMac, 10,1th gen */ + M_I27_11_1, /* 27-Inch iMac, 11,1th gen */ + M_MINI, /* Mac Mini */ + M_MINI_3_1, /* Mac Mini, 3,1th gen */ + M_MINI_4_1, /* Mac Mini, 4,1th gen */ + M_MB, /* MacBook */ + M_MB_2, /* MacBook, 2nd rev. */ + M_MB_3, /* MacBook, 3rd rev. */ + M_MB_5_1, /* MacBook, 5th rev. */ + M_MB_6_1, /* MacBook, 6th rev. */ + M_MB_7_1, /* MacBook, 7th rev. */ + M_MB_SR, /* MacBook, 2nd gen, (Santa Rosa) */ + M_MBA, /* MacBook Air */ + M_MBA_3, /* Macbook Air, 3rd rev */ + M_MBP, /* MacBook Pro */ + M_MBP_2, /* MacBook Pro 2nd gen */ + M_MBP_2_2, /* MacBook Pro 2,2nd gen */ + M_MBP_SR, /* MacBook Pro (Santa Rosa) */ + M_MBP_4, /* MacBook Pro, 4th gen */ + M_MBP_5_1, /* MacBook Pro, 5,1th gen */ + M_MBP_5_2, /* MacBook Pro, 5,2th gen */ + M_MBP_5_3, /* MacBook Pro, 5,3rd gen */ + M_MBP_6_1, /* MacBook Pro, 6,1th gen */ + M_MBP_6_2, /* MacBook Pro, 6,2th gen */ + M_MBP_7_1, /* MacBook Pro, 7,1th gen */ + M_MBP_8_2, /* MacBook Pro, 8,2nd gen */ + M_UNKNOWN /* placeholder */ +}; + +struct efifb_dmi_info { + char *optname; + unsigned long base; + int stride; + int width; + int height; + int flags; +}; + +#ifdef CONFIG_EFI + +extern struct efifb_dmi_info efifb_dmi_list[]; +void sysfb_apply_efi_quirks(void); + +#else /* CONFIG_EFI */ + +static inline void sysfb_apply_efi_quirks(void) +{ +} + +#endif /* CONFIG_EFI */ + +#ifdef CONFIG_X86_SYSFB + +bool parse_mode(const struct screen_info *si, + struct simplefb_platform_data *mode); +int create_simplefb(const struct screen_info *si, + const struct simplefb_platform_data *mode); + +#else /* CONFIG_X86_SYSFB */ + +static inline bool parse_mode(const struct screen_info *si, + struct simplefb_platform_data *mode) +{ + return false; +} + +static inline int create_simplefb(const struct screen_info *si, + const struct simplefb_platform_data *mode) +{ + return -EINVAL; +} + +#endif /* CONFIG_X86_SYSFB */ + +#endif /* _ARCH_X86_KERNEL_SYSFB_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 27811190cbd..c46a46be1ec 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -28,8 +28,7 @@ struct thread_info { __u32 flags; /* low level flags */ __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ - int preempt_count; /* 0 => preemptable, - <0 => BUG */ + int saved_preempt_count; mm_segment_t addr_limit; struct restart_block restart_block; void __user *sysenter_return; @@ -49,7 +48,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = INIT_PREEMPT_COUNT, \ + .saved_preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index cf512003e66..e6d90babc24 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -62,6 +62,7 @@ static inline void __flush_tlb_all(void) static inline void __flush_tlb_one(unsigned long addr) { + count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); } @@ -84,14 +85,38 @@ static inline void __flush_tlb_one(unsigned long addr) #ifndef CONFIG_SMP -#define flush_tlb() __flush_tlb() -#define flush_tlb_all() __flush_tlb_all() -#define local_flush_tlb() __flush_tlb() +/* "_up" is for UniProcessor. + * + * This is a helper for other header functions. *Not* intended to be called + * directly. All global TLB flushes need to either call this, or to bump the + * vm statistics themselves. + */ +static inline void __flush_tlb_up(void) +{ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + __flush_tlb(); +} + +static inline void flush_tlb_all(void) +{ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + __flush_tlb_all(); +} + +static inline void flush_tlb(void) +{ + __flush_tlb_up(); +} + +static inline void local_flush_tlb(void) +{ + __flush_tlb_up(); +} static inline void flush_tlb_mm(struct mm_struct *mm) { if (mm == current->active_mm) - __flush_tlb(); + __flush_tlb_up(); } static inline void flush_tlb_page(struct vm_area_struct *vma, @@ -105,14 +130,14 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { if (vma->vm_mm == current->active_mm) - __flush_tlb(); + __flush_tlb_up(); } static inline void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag) { if (mm == current->active_mm) - __flush_tlb(); + __flush_tlb_up(); } static inline void native_flush_tlb_others(const struct cpumask *cpumask, diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 095b21507b6..d35f24e231c 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -124,9 +124,6 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu); #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) - -/* indicates that pointers to the topology cpumask_t maps are valid */ -#define arch_provides_topology_pointers yes #endif static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 88eae2aec61..7036cb60cd8 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -6,11 +6,7 @@ #include <asm/debugreg.h> #include <asm/siginfo.h> /* TRAP_TRACE, ... */ -#ifdef CONFIG_X86_32 -#define dotraplinkage -#else -#define dotraplinkage asmlinkage -#endif +#define dotraplinkage __visible asmlinkage void divide_error(void); asmlinkage void debug(void); diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index c91e8b9d588..235be70d5bb 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -49,6 +49,7 @@ extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); +extern int check_tsc_disabled(void); extern unsigned long native_calibrate_tsc(void); extern int tsc_clocksource_reliable; diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 5ee26875bae..8ec57c07b12 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -153,16 +153,19 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) * Careful: we have to cast the result to the type of the pointer * for sign reasons. * - * The use of %edx as the register specifier is a bit of a + * The use of _ASM_DX as the register specifier is a bit of a * simplification, as gcc only cares about it as the starting point * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits * (%ecx being the next register in gcc's x86 register sequence), and * %rdx on 64 bits. + * + * Clang/LLVM cares about the size of the register, but still wants + * the base register for something that ends up being a pair. */ #define get_user(x, ptr) \ ({ \ int __ret_gu; \ - register __inttype(*(ptr)) __val_gu asm("%edx"); \ + register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \ __chk_user_ptr(ptr); \ might_fault(); \ asm volatile("call __get_user_%P3" \ @@ -539,5 +542,103 @@ extern struct movsl_mask { # include <asm/uaccess_64.h> #endif +unsigned long __must_check _copy_from_user(void *to, const void __user *from, + unsigned n); +unsigned long __must_check _copy_to_user(void __user *to, const void *from, + unsigned n); + +#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS +# define copy_user_diag __compiletime_error +#else +# define copy_user_diag __compiletime_warning +#endif + +extern void copy_user_diag("copy_from_user() buffer size is too small") +copy_from_user_overflow(void); +extern void copy_user_diag("copy_to_user() buffer size is too small") +copy_to_user_overflow(void) __asm__("copy_from_user_overflow"); + +#undef copy_user_diag + +#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS + +extern void +__compiletime_warning("copy_from_user() buffer size is not provably correct") +__copy_from_user_overflow(void) __asm__("copy_from_user_overflow"); +#define __copy_from_user_overflow(size, count) __copy_from_user_overflow() + +extern void +__compiletime_warning("copy_to_user() buffer size is not provably correct") +__copy_to_user_overflow(void) __asm__("copy_from_user_overflow"); +#define __copy_to_user_overflow(size, count) __copy_to_user_overflow() + +#else + +static inline void +__copy_from_user_overflow(int size, unsigned long count) +{ + WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); +} + +#define __copy_to_user_overflow __copy_from_user_overflow + +#endif + +static inline unsigned long __must_check +copy_from_user(void *to, const void __user *from, unsigned long n) +{ + int sz = __compiletime_object_size(to); + + might_fault(); + + /* + * While we would like to have the compiler do the checking for us + * even in the non-constant size case, any false positives there are + * a problem (especially when DEBUG_STRICT_USER_COPY_CHECKS, but even + * without - the [hopefully] dangerous looking nature of the warning + * would make people go look at the respecitive call sites over and + * over again just to find that there's no problem). + * + * And there are cases where it's just not realistic for the compiler + * to prove the count to be in range. For example when multiple call + * sites of a helper function - perhaps in different source files - + * all doing proper range checking, yet the helper function not doing + * so again. + * + * Therefore limit the compile time checking to the constant size + * case, and do only runtime checking for non-constant sizes. + */ + + if (likely(sz < 0 || sz >= n)) + n = _copy_from_user(to, from, n); + else if(__builtin_constant_p(n)) + copy_from_user_overflow(); + else + __copy_from_user_overflow(sz, n); + + return n; +} + +static inline unsigned long __must_check +copy_to_user(void __user *to, const void *from, unsigned long n) +{ + int sz = __compiletime_object_size(from); + + might_fault(); + + /* See the comment in copy_from_user() above. */ + if (likely(sz < 0 || sz >= n)) + n = _copy_to_user(to, from, n); + else if(__builtin_constant_p(n)) + copy_to_user_overflow(); + else + __copy_to_user_overflow(sz, n); + + return n; +} + +#undef __copy_from_user_overflow +#undef __copy_to_user_overflow + #endif /* _ASM_X86_UACCESS_H */ diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 7f760a9f1f6..3c03a5de64d 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -184,33 +184,4 @@ __copy_from_user_inatomic_nocache(void *to, const void __user *from, return __copy_from_user_ll_nocache_nozero(to, from, n); } -unsigned long __must_check copy_to_user(void __user *to, - const void *from, unsigned long n); -unsigned long __must_check _copy_from_user(void *to, - const void __user *from, - unsigned long n); - - -extern void copy_from_user_overflow(void) -#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS - __compiletime_error("copy_from_user() buffer size is not provably correct") -#else - __compiletime_warning("copy_from_user() buffer size is not provably correct") -#endif -; - -static inline unsigned long __must_check copy_from_user(void *to, - const void __user *from, - unsigned long n) -{ - int sz = __compiletime_object_size(to); - - if (likely(sz == -1 || sz >= n)) - n = _copy_from_user(to, from, n); - else - copy_from_user_overflow(); - - return n; -} - #endif /* _ASM_X86_UACCESS_32_H */ diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 64476bb2a14..190413d0de5 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -46,36 +46,8 @@ copy_user_generic(void *to, const void *from, unsigned len) } __must_check unsigned long -_copy_to_user(void __user *to, const void *from, unsigned len); -__must_check unsigned long -_copy_from_user(void *to, const void __user *from, unsigned len); -__must_check unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len); -static inline unsigned long __must_check copy_from_user(void *to, - const void __user *from, - unsigned long n) -{ - int sz = __compiletime_object_size(to); - - might_fault(); - if (likely(sz == -1 || sz >= n)) - n = _copy_from_user(to, from, n); -#ifdef CONFIG_DEBUG_VM - else - WARN(1, "Buffer overflow detected!\n"); -#endif - return n; -} - -static __always_inline __must_check -int copy_to_user(void __user *dst, const void *src, unsigned size) -{ - might_fault(); - - return _copy_to_user(dst, src, size); -} - static __always_inline __must_check int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size) { diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 6e5197910fd..3087ea9c5f2 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -35,7 +35,10 @@ typedef u8 uprobe_opcode_t; struct arch_uprobe { u16 fixups; - u8 insn[MAX_UINSN_BYTES]; + union { + u8 insn[MAX_UINSN_BYTES]; + u8 ixol[MAX_UINSN_BYTES]; + }; #ifdef CONFIG_X86_64 unsigned long rip_rela_target_address; #endif @@ -49,11 +52,4 @@ struct arch_uprobe_task { unsigned int saved_tf; }; -extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr); -extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs); -extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs); -extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); -extern int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data); -extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs); -extern unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs); #endif /* _ASM_UPROBES_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index f3e01a2cbaa..966502d4682 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -387,6 +387,7 @@ enum vmcs_field { #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 #define VMX_EPT_EXTENT_CONTEXT 1 #define VMX_EPT_EXTENT_GLOBAL 2 +#define VMX_EPT_EXTENT_SHIFT 24 #define VMX_EPT_EXECUTE_ONLY_BIT (1ull) #define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6) @@ -394,6 +395,7 @@ enum vmcs_field { #define VMX_EPTP_WB_BIT (1ull << 14) #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) #define VMX_EPT_1GB_PAGE_BIT (1ull << 17) +#define VMX_EPT_INVEPT_BIT (1ull << 20) #define VMX_EPT_AD_BIT (1ull << 21) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index de656ac2af4..d76ac40da20 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -35,7 +35,7 @@ #define DEFINE_VVAR(type, name) \ type name \ - __attribute__((section(".vvar_" #name), aligned(16))) + __attribute__((section(".vvar_" #name), aligned(16))) __visible #define VVAR(name) (*vvaraddr_ ## name) diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h index ca842f2769e..608a79d5a46 100644 --- a/arch/x86/include/asm/xen/events.h +++ b/arch/x86/include/asm/xen/events.h @@ -7,6 +7,7 @@ enum ipi_vector { XEN_CALL_FUNCTION_SINGLE_VECTOR, XEN_SPIN_UNLOCK_VECTOR, XEN_IRQ_WORK_VECTOR, + XEN_NMI_VECTOR, XEN_NR_IPIS, }; diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 125f344f06a..d866959e568 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -40,21 +40,7 @@ extern struct start_info *xen_start_info; static inline uint32_t xen_cpuid_base(void) { - uint32_t base, eax, ebx, ecx, edx; - char signature[13]; - - for (base = 0x40000000; base < 0x40010000; base += 0x100) { - cpuid(base, &eax, &ebx, &ecx, &edx); - *(uint32_t *)(signature + 0) = ebx; - *(uint32_t *)(signature + 4) = ecx; - *(uint32_t *)(signature + 8) = edx; - signature[12] = 0; - - if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2)) - return base; - } - - return 0; + return hypervisor_cpuid_base("XenVMMXenVMM", 2); } #ifdef CONFIG_XEN diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 6aef9fbc09b..b913915e8e6 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -79,30 +79,38 @@ static inline int phys_to_machine_mapping_valid(unsigned long pfn) return get_phys_to_machine(pfn) != INVALID_P2M_ENTRY; } -static inline unsigned long mfn_to_pfn(unsigned long mfn) +static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn) { unsigned long pfn; - int ret = 0; + int ret; if (xen_feature(XENFEAT_auto_translated_physmap)) return mfn; - if (unlikely(mfn >= machine_to_phys_nr)) { - pfn = ~0; - goto try_override; - } - pfn = 0; + if (unlikely(mfn >= machine_to_phys_nr)) + return ~0; + /* * The array access can fail (e.g., device space beyond end of RAM). * In such cases it doesn't matter what we return (we return garbage), * but we must handle the fault without crashing! */ ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); -try_override: - /* ret might be < 0 if there are no entries in the m2p for mfn */ if (ret < 0) - pfn = ~0; - else if (get_phys_to_machine(pfn) != mfn) + return ~0; + + return pfn; +} + +static inline unsigned long mfn_to_pfn(unsigned long mfn) +{ + unsigned long pfn; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return mfn; + + pfn = mfn_to_pfn_no_overrides(mfn); + if (get_phys_to_machine(pfn) != mfn) { /* * If this appears to be a foreign mfn (because the pfn * doesn't map back to the mfn), then check the local override @@ -111,6 +119,7 @@ try_override: * m2p_find_override_pfn returns ~0 if it doesn't find anything. */ pfn = m2p_find_override_pfn(mfn, ~0); + } /* * pfn is ~0 if there are no entries in the m2p for mfn or if the diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h index 7ea79c5fa1f..492b29802f5 100644 --- a/arch/x86/include/asm/xor_avx.h +++ b/arch/x86/include/asm/xor_avx.h @@ -167,12 +167,12 @@ static struct xor_block_template xor_block_avx = { #define AVX_XOR_SPEED \ do { \ - if (cpu_has_avx) \ + if (cpu_has_avx && cpu_has_osxsave) \ xor_speed(&xor_block_avx); \ } while (0) #define AVX_SELECT(FASTEST) \ - (cpu_has_avx ? &xor_block_avx : FASTEST) + (cpu_has_avx && cpu_has_osxsave ? &xor_block_avx : FASTEST) #else diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index c15ddaf9071..9c3733c5f8f 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -158,7 +158,7 @@ enum { X86_SUBARCH_PC = 0, X86_SUBARCH_LGUEST, X86_SUBARCH_XEN, - X86_SUBARCH_MRST, + X86_SUBARCH_INTEL_MID, X86_SUBARCH_CE4100, X86_NR_SUBARCHS, }; diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index b80420bcd09..b8f1c0176cb 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -27,6 +27,19 @@ #define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0) /* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ #define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) + +/* + * There is a single feature flag that signifies the presence of the MSR + * that can be used to retrieve both the local APIC Timer frequency as + * well as the TSC frequency. + */ + +/* Local APIC timer frequency MSR (HV_X64_MSR_APIC_FREQUENCY) is available */ +#define HV_X64_MSR_APIC_FREQUENCY_AVAILABLE (1 << 11) + +/* TSC frequency MSR (HV_X64_MSR_TSC_FREQUENCY) is available */ +#define HV_X64_MSR_TSC_FREQUENCY_AVAILABLE (1 << 11) + /* * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available @@ -136,6 +149,12 @@ /* MSR used to read the per-partition time reference counter */ #define HV_X64_MSR_TIME_REF_COUNT 0x40000020 +/* MSR used to retrieve the TSC frequency */ +#define HV_X64_MSR_TSC_FREQUENCY 0x40000022 + +/* MSR used to retrieve the local APIC timer frequency */ +#define HV_X64_MSR_APIC_FREQUENCY 0x40000023 + /* Define the virtual APIC registers */ #define HV_X64_MSR_EOI 0x40000070 #define HV_X64_MSR_ICR 0x40000071 diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 06fdbd987e9..94dc8ca434e 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -23,6 +23,7 @@ #define KVM_FEATURE_ASYNC_PF 4 #define KVM_FEATURE_STEAL_TIME 5 #define KVM_FEATURE_PV_EOI 6 +#define KVM_FEATURE_PV_UNHALT 7 /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. If no bits are set, all flags are ignored. diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index d651082c7cf..0e79420376e 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -65,6 +65,7 @@ #define EXIT_REASON_EOI_INDUCED 45 #define EXIT_REASON_EPT_VIOLATION 48 #define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_INVEPT 50 #define EXIT_REASON_PREEMPTION_TIMER 52 #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 @@ -106,12 +107,13 @@ { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ + { EXIT_REASON_INVEPT, "INVEPT" }, \ + { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }, \ { EXIT_REASON_WBINVD, "WBINVD" }, \ { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ { EXIT_REASON_INVD, "INVD" }, \ - { EXIT_REASON_INVPCID, "INVPCID" }, \ - { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" } + { EXIT_REASON_INVPCID, "INVPCID" } #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 88d99ea7772..9b0a34e2cd7 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -36,6 +36,8 @@ obj-y += tsc.o io_delay.o rtc.o obj-y += pci-iommu_table.o obj-y += resource.o +obj-$(CONFIG_PREEMPT) += preempt.o + obj-y += process.o obj-y += i387.o xsave.o obj-y += ptrace.o @@ -103,6 +105,9 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o obj-$(CONFIG_OF) += devicetree.o obj-$(CONFIG_UPROBES) += uprobes.o +obj-y += sysfb.o +obj-$(CONFIG_X86_SYSFB) += sysfb_simplefb.o +obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 2627a81253e..40c76604199 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -67,6 +67,7 @@ EXPORT_SYMBOL(acpi_pci_disabled); int acpi_lapic; int acpi_ioapic; int acpi_strict; +int acpi_disable_cmcff; u8 acpi_sci_flags __initdata; int acpi_sci_override_gsi __initdata; @@ -141,16 +142,8 @@ static u32 irq_to_gsi(int irq) } /* - * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END, - * to map the target physical address. The problem is that set_fixmap() - * provides a single page, and it is possible that the page is not - * sufficient. - * By using this area, we can map up to MAX_IO_APICS pages temporarily, - * i.e. until the next __va_range() call. - * - * Important Safety Note: The fixed I/O APIC page numbers are *subtracted* - * from the fixed base. That's why we start at FIX_IO_APIC_BASE_END and - * count idx down while incrementing the phys address. + * This is just a simple wrapper around early_ioremap(), + * with sanity checks for phys == 0 and size == 0. */ char *__init __acpi_map_table(unsigned long phys, unsigned long size) { @@ -160,6 +153,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) return early_ioremap(phys, size); } + void __init __acpi_unmap_table(char *map, unsigned long size) { if (!map || !size) @@ -199,7 +193,7 @@ static void acpi_register_lapic(int id, u8 enabled) { unsigned int ver = 0; - if (id >= (MAX_LOCAL_APIC-1)) { + if (id >= MAX_LOCAL_APIC) { printk(KERN_INFO PREFIX "skipped apicid that is too big\n"); return; } @@ -1120,6 +1114,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) int ioapic; int ioapic_pin; struct io_apic_irq_attr irq_attr; + int ret; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; @@ -1149,7 +1144,9 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin, trigger == ACPI_EDGE_SENSITIVE ? 0 : 1, polarity == ACPI_ACTIVE_HIGH ? 0 : 1); - io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr); + ret = io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr); + if (ret < 0) + gsi = INT_MIN; return gsi; } @@ -1626,6 +1623,10 @@ static int __init parse_acpi(char *arg) /* "acpi=copy_dsdt" copys DSDT */ else if (strcmp(arg, "copy_dsdt") == 0) { acpi_gbl_copy_dsdt_locally = 1; + } + /* "acpi=nocmcff" disables FF mode for corrected errors */ + else if (strcmp(arg, "nocmcff") == 0) { + acpi_disable_cmcff = 1; } else { /* Core will printk when we return error. */ return -EINVAL; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c15cf9a25e2..15e8563e5c2 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -11,6 +11,7 @@ #include <linux/memory.h> #include <linux/stop_machine.h> #include <linux/slab.h> +#include <linux/kdebug.h> #include <asm/alternative.h> #include <asm/sections.h> #include <asm/pgtable.h> @@ -596,97 +597,93 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) return addr; } -/* - * Cross-modifying kernel text with stop_machine(). - * This code originally comes from immediate value. - */ -static atomic_t stop_machine_first; -static int wrote_text; +static void do_sync_core(void *info) +{ + sync_core(); +} -struct text_poke_params { - struct text_poke_param *params; - int nparams; -}; +static bool bp_patching_in_progress; +static void *bp_int3_handler, *bp_int3_addr; -static int __kprobes stop_machine_text_poke(void *data) +int poke_int3_handler(struct pt_regs *regs) { - struct text_poke_params *tpp = data; - struct text_poke_param *p; - int i; + /* bp_patching_in_progress */ + smp_rmb(); - if (atomic_xchg(&stop_machine_first, 0)) { - for (i = 0; i < tpp->nparams; i++) { - p = &tpp->params[i]; - text_poke(p->addr, p->opcode, p->len); - } - smp_wmb(); /* Make sure other cpus see that this has run */ - wrote_text = 1; - } else { - while (!wrote_text) - cpu_relax(); - smp_mb(); /* Load wrote_text before following execution */ - } + if (likely(!bp_patching_in_progress)) + return 0; - for (i = 0; i < tpp->nparams; i++) { - p = &tpp->params[i]; - flush_icache_range((unsigned long)p->addr, - (unsigned long)p->addr + p->len); - } - /* - * Intel Archiecture Software Developer's Manual section 7.1.3 specifies - * that a core serializing instruction such as "cpuid" should be - * executed on _each_ core before the new instruction is made visible. - */ - sync_core(); - return 0; -} + if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr) + return 0; + + /* set up the specified breakpoint handler */ + regs->ip = (unsigned long) bp_int3_handler; + + return 1; -/** - * text_poke_smp - Update instructions on a live kernel on SMP - * @addr: address to modify - * @opcode: source of the copy - * @len: length to copy - * - * Modify multi-byte instruction by using stop_machine() on SMP. This allows - * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying - * should be allowed, since stop_machine() does _not_ protect code against - * NMI and MCE. - * - * Note: Must be called under get_online_cpus() and text_mutex. - */ -void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) -{ - struct text_poke_params tpp; - struct text_poke_param p; - - p.addr = addr; - p.opcode = opcode; - p.len = len; - tpp.params = &p; - tpp.nparams = 1; - atomic_set(&stop_machine_first, 1); - wrote_text = 0; - /* Use __stop_machine() because the caller already got online_cpus. */ - __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask); - return addr; } /** - * text_poke_smp_batch - Update instructions on a live kernel on SMP - * @params: an array of text_poke parameters - * @n: the number of elements in params. + * text_poke_bp() -- update instructions on live kernel on SMP + * @addr: address to patch + * @opcode: opcode of new instruction + * @len: length to copy + * @handler: address to jump to when the temporary breakpoint is hit * - * Modify multi-byte instruction by using stop_machine() on SMP. Since the - * stop_machine() is heavy task, it is better to aggregate text_poke requests - * and do it once if possible. + * Modify multi-byte instruction by using int3 breakpoint on SMP. + * We completely avoid stop_machine() here, and achieve the + * synchronization using int3 breakpoint. * - * Note: Must be called under get_online_cpus() and text_mutex. + * The way it is done: + * - add a int3 trap to the address that will be patched + * - sync cores + * - update all but the first byte of the patched range + * - sync cores + * - replace the first byte (int3) by the first byte of + * replacing opcode + * - sync cores + * + * Note: must be called under text_mutex. */ -void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n) +void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) { - struct text_poke_params tpp = {.params = params, .nparams = n}; + unsigned char int3 = 0xcc; + + bp_int3_handler = handler; + bp_int3_addr = (u8 *)addr + sizeof(int3); + bp_patching_in_progress = true; + /* + * Corresponding read barrier in int3 notifier for + * making sure the in_progress flags is correctly ordered wrt. + * patching + */ + smp_wmb(); + + text_poke(addr, &int3, sizeof(int3)); - atomic_set(&stop_machine_first, 1); - wrote_text = 0; - __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask); + on_each_cpu(do_sync_core, NULL, 1); + + if (len - sizeof(int3) > 0) { + /* patch all but the first byte */ + text_poke((char *)addr + sizeof(int3), + (const char *) opcode + sizeof(int3), + len - sizeof(int3)); + /* + * According to Intel, this core syncing is very likely + * not necessary and we'd be safe even without it. But + * better safe than sorry (plus there's not only Intel). + */ + on_each_cpu(do_sync_core, NULL, 1); + } + + /* patch the first byte */ + text_poke(addr, opcode, sizeof(int3)); + + on_each_cpu(do_sync_core, NULL, 1); + + bp_patching_in_progress = false; + smp_wmb(); + + return addr; } + diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 3048ded1b59..59554dca96e 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -20,6 +20,7 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, {} }; @@ -27,6 +28,7 @@ EXPORT_SYMBOL(amd_nb_misc_ids); static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, {} }; @@ -81,13 +83,20 @@ int amd_cache_northbridges(void) next_northbridge(misc, amd_nb_misc_ids); node_to_amd_nb(i)->link = link = next_northbridge(link, amd_nb_link_ids); - } + } + /* GART present only on Fam15h upto model 0fh */ if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || - boot_cpu_data.x86 == 0x15) + (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10)) amd_northbridges.flags |= AMD_NB_GART; /* + * Check for L3 cache presence. + */ + if (!cpuid_edx(0x80000006)) + return 0; + + /* * Some CPU families support L3 Cache Index Disable. There are some * limitations because of E382 and E388 on family 0x10. */ diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index c9876efecaf..af5b08ab3b7 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -40,7 +40,7 @@ #include <asm/fixmap.h> #include <asm/apb_timer.h> -#include <asm/mrst.h> +#include <asm/intel-mid.h> #include <asm/time.h> #define APBT_CLOCKEVENT_RATING 110 @@ -157,13 +157,13 @@ static int __init apbt_clockevent_register(void) adev->num = smp_processor_id(); adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0", - mrst_timer_options == MRST_TIMER_LAPIC_APBT ? + intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ? APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING, adev_virt_addr(adev), 0, apbt_freq); /* Firmware does EOI handling for us. */ adev->timer->eoi = NULL; - if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { + if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) { global_clock_event = &adev->timer->ced; printk(KERN_DEBUG "%s clockevent registered as global\n", global_clock_event->name); @@ -253,7 +253,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n, static __init int apbt_late_init(void) { - if (mrst_timer_options == MRST_TIMER_LAPIC_APBT || + if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT || !apb_timer_block_enabled) return 0; /* This notifier should be called after workqueue is ready */ @@ -340,7 +340,7 @@ void __init apbt_time_init(void) } #ifdef CONFIG_SMP /* kernel cmdline disable apb timer, so we will use lapic timers */ - if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { + if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) { printk(KERN_INFO "apbt: disabled per cpu timer\n"); return; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index eca89c53a7f..a7eb82d9b01 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -913,7 +913,7 @@ static void local_apic_timer_interrupt(void) * [ if a single-CPU system runs an SMP kernel then we call the local * interrupt as well. Thus we cannot inline the local irq ... ] */ -void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -932,7 +932,7 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) set_irq_regs(old_regs); } -void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -1946,14 +1946,14 @@ static inline void __smp_spurious_interrupt(void) "should never happen.\n", smp_processor_id()); } -void smp_spurious_interrupt(struct pt_regs *regs) +__visible void smp_spurious_interrupt(struct pt_regs *regs) { entering_irq(); __smp_spurious_interrupt(); exiting_irq(); } -void smp_trace_spurious_interrupt(struct pt_regs *regs) +__visible void smp_trace_spurious_interrupt(struct pt_regs *regs) { entering_irq(); trace_spurious_apic_entry(SPURIOUS_APIC_VECTOR); @@ -2002,14 +2002,14 @@ static inline void __smp_error_interrupt(struct pt_regs *regs) } -void smp_error_interrupt(struct pt_regs *regs) +__visible void smp_error_interrupt(struct pt_regs *regs) { entering_irq(); __smp_error_interrupt(regs); exiting_irq(); } -void smp_trace_error_interrupt(struct pt_regs *regs) +__visible void smp_trace_error_interrupt(struct pt_regs *regs) { entering_irq(); trace_error_apic_entry(ERROR_APIC_VECTOR); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9ed796ccc32..e63a5bd2a78 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1534,6 +1534,11 @@ void intel_ir_io_apic_print_entries(unsigned int apic, } } +void ioapic_zap_locks(void) +{ + raw_spin_lock_init(&ioapic_lock); +} + __apicdebuginit(void) print_IO_APIC(int ioapic_idx) { union IO_APIC_reg_00 reg_00; @@ -3375,12 +3380,15 @@ int io_apic_setup_irq_pin_once(unsigned int irq, int node, { unsigned int ioapic_idx = attr->ioapic, pin = attr->ioapic_pin; int ret; + struct IO_APIC_route_entry orig_entry; /* Avoid redundant programming */ if (test_bit(pin, ioapics[ioapic_idx].pin_programmed)) { - pr_debug("Pin %d-%d already programmed\n", - mpc_ioapic_id(ioapic_idx), pin); - return 0; + pr_debug("Pin %d-%d already programmed\n", mpc_ioapic_id(ioapic_idx), pin); + orig_entry = ioapic_read_entry(attr->ioapic, pin); + if (attr->trigger == orig_entry.trigger && attr->polarity == orig_entry.polarity) + return 0; + return -EBUSY; } ret = io_apic_setup_irq_pin(irq, node, attr); if (!ret) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 1191ac1c9d2..a419814cea5 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -113,7 +113,7 @@ static int __init early_get_pnodeid(void) break; case UV3_HUB_PART_NUMBER: case UV3_HUB_PART_NUMBER_X: - uv_min_hub_revision_id += UV3_HUB_REVISION_BASE - 1; + uv_min_hub_revision_id += UV3_HUB_REVISION_BASE; break; } diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 53a4e274484..3ab03430211 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -392,7 +392,7 @@ static struct cpuidle_device apm_cpuidle_device; /* * Local variables */ -static struct { +__visible struct { unsigned long offset; unsigned short segment; } apm_bios_entry; diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 28610822fb3..9f6b9341950 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -32,7 +32,6 @@ void common(void) { OFFSET(TI_flags, thread_info, flags); OFFSET(TI_status, thread_info, status); OFFSET(TI_addr_limit, thread_info, addr_limit); - OFFSET(TI_preempt_count, thread_info, preempt_count); BLANK(); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 08a089043cc..3daece79a14 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -66,8 +66,8 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) * performance at the same time.. */ -extern void vide(void); -__asm__(".align 4\nvide: ret"); +extern __visible void vide(void); +__asm__(".globl vide\n\t.align 4\nvide: ret"); static void init_amd_k5(struct cpuinfo_x86 *c) { @@ -823,8 +823,8 @@ static const struct cpu_dev amd_cpu_dev = { .c_vendor = "AMD", .c_ident = { "AuthenticAMD" }, #ifdef CONFIG_X86_32 - .c_models = { - { .vendor = X86_VENDOR_AMD, .family = 4, .model_names = + .legacy_models = { + { .family = 4, .model_names = { [3] = "486 DX/2", [7] = "486 DX/2-WB", @@ -835,7 +835,7 @@ static const struct cpu_dev amd_cpu_dev = { } }, }, - .c_size_cache = amd_size_cache, + .legacy_cache_size = amd_size_cache, #endif .c_early_init = early_init_amd, .c_detect_tlb = cpu_detect_tlb_amd, diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index fbf6c3bc240..8d5652dc99d 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -468,10 +468,10 @@ static void init_centaur(struct cpuinfo_x86 *c) #endif } +#ifdef CONFIG_X86_32 static unsigned int centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) { -#ifdef CONFIG_X86_32 /* VIA C3 CPUs (670-68F) need further shifting. */ if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8))) size >>= 8; @@ -484,16 +484,18 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) if ((c->x86 == 6) && (c->x86_model == 9) && (c->x86_mask == 1) && (size == 65)) size -= 1; -#endif return size; } +#endif static const struct cpu_dev centaur_cpu_dev = { .c_vendor = "Centaur", .c_ident = { "CentaurHauls" }, .c_early_init = early_init_centaur, .c_init = init_centaur, - .c_size_cache = centaur_size_cache, +#ifdef CONFIG_X86_32 + .legacy_cache_size = centaur_size_cache, +#endif .c_x86_vendor = X86_VENDOR_CENTAUR, }; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 25eb2747b06..6abc172b825 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -346,7 +346,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) /* Look up CPU names by table lookup. */ static const char *table_lookup_model(struct cpuinfo_x86 *c) { - const struct cpu_model_info *info; +#ifdef CONFIG_X86_32 + const struct legacy_cpu_model_info *info; if (c->x86_model >= 16) return NULL; /* Range check */ @@ -354,13 +355,14 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c) if (!this_cpu) return NULL; - info = this_cpu->c_models; + info = this_cpu->legacy_models; - while (info && info->family) { + while (info->family) { if (info->family == c->x86) return info->model_names[c->x86_model]; info++; } +#endif return NULL; /* Not found */ } @@ -450,8 +452,8 @@ void cpu_detect_cache_sizes(struct cpuinfo_x86 *c) c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); #else /* do processor-specific cache resizing */ - if (this_cpu->c_size_cache) - l2size = this_cpu->c_size_cache(c, l2size); + if (this_cpu->legacy_cache_size) + l2size = this_cpu->legacy_cache_size(c, l2size); /* Allow user to override all this if necessary. */ if (cachesize_override != -1) @@ -1076,7 +1078,7 @@ struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) debug_idt_table }; DEFINE_PER_CPU_FIRST(union irq_stack_union, - irq_stack_union) __aligned(PAGE_SIZE); + irq_stack_union) __aligned(PAGE_SIZE) __visible; /* * The following four percpu variables are hot. Align current_task to @@ -1093,7 +1095,10 @@ EXPORT_PER_CPU_SYMBOL(kernel_stack); DEFINE_PER_CPU(char *, irq_stack_ptr) = init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; -DEFINE_PER_CPU(unsigned int, irq_count) = -1; +DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; + +DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; +EXPORT_PER_CPU_SYMBOL(__preempt_count); DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); @@ -1169,6 +1174,8 @@ void debug_stack_reset(void) DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); +DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; +EXPORT_PER_CPU_SYMBOL(__preempt_count); DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); #ifdef CONFIG_CC_STACKPROTECTOR diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 4041c24ae7d..c37dc37e831 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -1,12 +1,6 @@ #ifndef ARCH_X86_CPU_H #define ARCH_X86_CPU_H -struct cpu_model_info { - int vendor; - int family; - const char *model_names[16]; -}; - /* attempt to consolidate cpu attributes */ struct cpu_dev { const char *c_vendor; @@ -14,15 +8,23 @@ struct cpu_dev { /* some have two possibilities for cpuid string */ const char *c_ident[2]; - struct cpu_model_info c_models[4]; - void (*c_early_init)(struct cpuinfo_x86 *); void (*c_bsp_init)(struct cpuinfo_x86 *); void (*c_init)(struct cpuinfo_x86 *); void (*c_identify)(struct cpuinfo_x86 *); void (*c_detect_tlb)(struct cpuinfo_x86 *); - unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int); int c_x86_vendor; +#ifdef CONFIG_X86_32 + /* Optional vendor specific routine to obtain the cache size. */ + unsigned int (*legacy_cache_size)(struct cpuinfo_x86 *, + unsigned int); + + /* Family/stepping-based lookup table for model names. */ + struct legacy_cpu_model_info { + int family; + const char *model_names[16]; + } legacy_models[5]; +#endif }; struct _tlb_table { diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 87279212d31..36ce402a3fa 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -25,11 +25,6 @@ #include <asm/processor.h> #include <asm/hypervisor.h> -/* - * Hypervisor detect order. This is specified explicitly here because - * some hypervisors might implement compatibility modes for other - * hypervisors and therefore need to be detected in specific sequence. - */ static const __initconst struct hypervisor_x86 * const hypervisors[] = { #ifdef CONFIG_XEN_PVHVM @@ -49,15 +44,19 @@ static inline void __init detect_hypervisor_vendor(void) { const struct hypervisor_x86 *h, * const *p; + uint32_t pri, max_pri = 0; for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) { h = *p; - if (h->detect()) { + pri = h->detect(); + if (pri != 0 && pri > max_pri) { + max_pri = pri; x86_hyper = h; - printk(KERN_INFO "Hypervisor detected: %s\n", h->name); - break; } } + + if (max_pri) + printk(KERN_INFO "Hypervisor detected: %s\n", x86_hyper->name); } void init_hypervisor(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index ec7299566f7..dc1ec0dff93 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -665,8 +665,8 @@ static const struct cpu_dev intel_cpu_dev = { .c_vendor = "Intel", .c_ident = { "GenuineIntel" }, #ifdef CONFIG_X86_32 - .c_models = { - { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names = + .legacy_models = { + { .family = 4, .model_names = { [0] = "486 DX-25/33", [1] = "486 DX-50", @@ -679,7 +679,7 @@ static const struct cpu_dev intel_cpu_dev = { [9] = "486 DX/4-WB" } }, - { .vendor = X86_VENDOR_INTEL, .family = 5, .model_names = + { .family = 5, .model_names = { [0] = "Pentium 60/66 A-step", [1] = "Pentium 60/66", @@ -690,7 +690,7 @@ static const struct cpu_dev intel_cpu_dev = { [8] = "Mobile Pentium MMX" } }, - { .vendor = X86_VENDOR_INTEL, .family = 6, .model_names = + { .family = 6, .model_names = { [0] = "Pentium Pro A-step", [1] = "Pentium Pro", @@ -704,7 +704,7 @@ static const struct cpu_dev intel_cpu_dev = { [11] = "Pentium III (Tualatin)", } }, - { .vendor = X86_VENDOR_INTEL, .family = 15, .model_names = + { .family = 15, .model_names = { [0] = "Pentium 4 (Unknown)", [1] = "Pentium 4 (Willamette)", @@ -714,7 +714,7 @@ static const struct cpu_dev intel_cpu_dev = { } }, }, - .c_size_cache = intel_size_cache, + .legacy_cache_size = intel_size_cache, #endif .c_detect_tlb = intel_detect_tlb, .c_early_init = early_init_intel, diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index cd8b166a173..de8b60a53f6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -42,8 +42,7 @@ void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err) struct mce m; /* Only corrected MC is reported */ - if (!corrected || !(mem_err->validation_bits & - CPER_MEM_VALID_PHYSICAL_ADDRESS)) + if (!corrected || !(mem_err->validation_bits & CPER_MEM_VALID_PA)) return; mce_setup(&m); diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 5b7d4fa5d3b..09edd0b65fe 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -25,15 +25,18 @@ int mce_severity(struct mce *a, int tolerant, char **msg); struct dentry *mce_get_debugfs_dir(void); extern struct mce_bank *mce_banks; +extern mce_banks_t mce_banks_ce_disabled; #ifdef CONFIG_X86_MCE_INTEL unsigned long mce_intel_adjust_timer(unsigned long interval); void mce_intel_cmci_poll(void); void mce_intel_hcpu_update(unsigned long cpu); +void cmci_disable_bank(int bank); #else # define mce_intel_adjust_timer mce_adjust_timer_default static inline void mce_intel_cmci_poll(void) { } static inline void mce_intel_hcpu_update(unsigned long cpu) { } +static inline void cmci_disable_bank(int bank) { } #endif void mce_timer_kick(unsigned long interval); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 87a65c939bc..b3218cdee95 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -97,6 +97,15 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL }; +/* + * MCA banks controlled through firmware first for corrected errors. + * This is a global list of banks for which we won't enable CMCI and we + * won't poll. Firmware controls these banks and is responsible for + * reporting corrected errors through GHES. Uncorrected/recoverable + * errors are still notified through a machine check. + */ +mce_banks_t mce_banks_ce_disabled; + static DEFINE_PER_CPU(struct work_struct, mce_work); static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); @@ -1935,6 +1944,25 @@ static struct miscdevice mce_chrdev_device = { &mce_chrdev_ops, }; +static void __mce_disable_bank(void *arg) +{ + int bank = *((int *)arg); + __clear_bit(bank, __get_cpu_var(mce_poll_banks)); + cmci_disable_bank(bank); +} + +void mce_disable_bank(int bank) +{ + if (bank >= mca_cfg.banks) { + pr_warn(FW_BUG + "Ignoring request to disable invalid MCA bank %d.\n", + bank); + return; + } + set_bit(bank, mce_banks_ce_disabled); + on_each_cpu(__mce_disable_bank, &bank, 1); +} + /* * mce=off Disables machine check * mce=no_cmci Disables CMCI diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index d56405309dc..4cfe0458ca6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -203,6 +203,10 @@ static void cmci_discover(int banks) if (test_bit(i, owned)) continue; + /* Skip banks in firmware first mode */ + if (test_bit(i, mce_banks_ce_disabled)) + continue; + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Already owned by someone else? */ @@ -271,6 +275,19 @@ void cmci_recheck(void) local_irq_restore(flags); } +/* Caller must hold the lock on cmci_discover_lock */ +static void __cmci_disable_bank(int bank) +{ + u64 val; + + if (!test_bit(bank, __get_cpu_var(mce_banks_owned))) + return; + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + val &= ~MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + __clear_bit(bank, __get_cpu_var(mce_banks_owned)); +} + /* * Disable CMCI on this CPU for all banks it owns when it goes down. * This allows other CPUs to claim the banks on rediscovery. @@ -280,20 +297,12 @@ void cmci_clear(void) unsigned long flags; int i; int banks; - u64 val; if (!cmci_supported(&banks)) return; raw_spin_lock_irqsave(&cmci_discover_lock, flags); - for (i = 0; i < banks; i++) { - if (!test_bit(i, __get_cpu_var(mce_banks_owned))) - continue; - /* Disable CMCI */ - rdmsrl(MSR_IA32_MCx_CTL2(i), val); - val &= ~MCI_CTL2_CMCI_EN; - wrmsrl(MSR_IA32_MCx_CTL2(i), val); - __clear_bit(i, __get_cpu_var(mce_banks_owned)); - } + for (i = 0; i < banks; i++) + __cmci_disable_bank(i); raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } @@ -327,6 +336,19 @@ void cmci_reenable(void) cmci_discover(banks); } +void cmci_disable_bank(int bank) +{ + int banks; + unsigned long flags; + + if (!cmci_supported(&banks)) + return; + + raw_spin_lock_irqsave(&cmci_discover_lock, flags); + __cmci_disable_bank(bank); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + static void intel_init_cmci(void) { int banks; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 8f4be53ea04..9f7ca266864 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -15,6 +15,7 @@ #include <linux/clocksource.h> #include <linux/module.h> #include <linux/hardirq.h> +#include <linux/efi.h> #include <linux/interrupt.h> #include <asm/processor.h> #include <asm/hypervisor.h> @@ -23,24 +24,29 @@ #include <asm/desc.h> #include <asm/idle.h> #include <asm/irq_regs.h> +#include <asm/i8259.h> +#include <asm/apic.h> struct ms_hyperv_info ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); -static bool __init ms_hyperv_platform(void) +static uint32_t __init ms_hyperv_platform(void) { u32 eax; u32 hyp_signature[3]; if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) - return false; + return 0; cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]); - return eax >= HYPERV_CPUID_MIN && - eax <= HYPERV_CPUID_MAX && - !memcmp("Microsoft Hv", hyp_signature, 12); + if (eax >= HYPERV_CPUID_MIN && + eax <= HYPERV_CPUID_MAX && + !memcmp("Microsoft Hv", hyp_signature, 12)) + return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS; + + return 0; } static cycle_t read_hv_clock(struct clocksource *arg) @@ -73,6 +79,30 @@ static void __init ms_hyperv_init_platform(void) printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", ms_hyperv.features, ms_hyperv.hints); +#ifdef CONFIG_X86_LOCAL_APIC + if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) { + /* + * Get the APIC frequency. + */ + u64 hv_lapic_frequency; + + rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); + hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); + lapic_timer_frequency = hv_lapic_frequency; + printk(KERN_INFO "HyperV: LAPIC Timer Frequency: %#x\n", + lapic_timer_frequency); + + /* + * On Hyper-V, when we are booting off an EFI firmware stack, + * we do not have many legacy devices including PIC, PIT etc. + */ + if (efi_enabled(EFI_BOOT)) { + printk(KERN_INFO "HyperV: Using null_legacy_pic\n"); + legacy_pic = &null_legacy_pic; + } + } +#endif + if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100); } diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index d4cdfa67509..ce2d0a2c3e4 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -683,6 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) } /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); /* Save MTRR state */ @@ -696,6 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) static void post_set(void) __releases(set_atomicity_lock) { /* Flush TLBs (no need to flush caches - they are disabled) */ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); /* Intel (P6) standard MTRRs */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index a7c7305030c..8e132931614 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1276,16 +1276,16 @@ void perf_events_lapic_init(void) static int __kprobes perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) { - int ret; u64 start_clock; u64 finish_clock; + int ret; if (!atomic_read(&active_events)) return NMI_DONE; - start_clock = local_clock(); + start_clock = sched_clock(); ret = x86_pmu.handle_irq(regs); - finish_clock = local_clock(); + finish_clock = sched_clock(); perf_sample_event_took(finish_clock - start_clock); @@ -1506,7 +1506,7 @@ static int __init init_hw_perf_events(void) err = amd_pmu_init(); break; default: - return 0; + err = -ENOTSUPP; } if (err != 0) { pr_cont("no PMU driver, software events only.\n"); @@ -1883,20 +1883,21 @@ static struct pmu pmu = { void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { - userpg->cap_usr_time = 0; - userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc; + userpg->cap_user_time = 0; + userpg->cap_user_time_zero = 0; + userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; userpg->pmc_width = x86_pmu.cntval_bits; - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + if (!sched_clock_stable) return; - if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) - return; - - userpg->cap_usr_time = 1; + userpg->cap_user_time = 1; userpg->time_mult = this_cpu_read(cyc2ns); userpg->time_shift = CYC2NS_SCALE_FACTOR; userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; + + userpg->cap_user_time_zero = 1; + userpg->time_zero = this_cpu_read(cyc2ns_offset); } /* @@ -1988,7 +1989,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) frame.return_address = 0; bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); - if (bytes != sizeof(frame)) + if (bytes != 0) break; if (!valid_user_frame(fp, sizeof(frame))) @@ -2040,7 +2041,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) frame.return_address = 0; bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); - if (bytes != sizeof(frame)) + if (bytes != 0) break; if (!valid_user_frame(fp, sizeof(frame))) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 97e557bc4c9..fd00bb29425 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -164,6 +164,11 @@ struct cpu_hw_events { struct perf_guest_switch_msr guest_switch_msrs[X86_PMC_IDX_MAX]; /* + * Intel checkpoint mask + */ + u64 intel_cp_status; + + /* * manage shared (per-core, per-cpu) registers * used on Intel NHM/WSM/SNB */ @@ -440,6 +445,7 @@ struct x86_pmu { int lbr_nr; /* hardware stack size */ u64 lbr_sel_mask; /* LBR_SELECT valid bits */ const int *lbr_sel_map; /* lbr_select mappings */ + bool lbr_double_abort; /* duplicated lbr aborts */ /* * Extra registers for events @@ -641,6 +647,8 @@ extern struct event_constraint intel_core2_pebs_event_constraints[]; extern struct event_constraint intel_atom_pebs_event_constraints[]; +extern struct event_constraint intel_slm_pebs_event_constraints[]; + extern struct event_constraint intel_nehalem_pebs_event_constraints[]; extern struct event_constraint intel_westmere_pebs_event_constraints[]; diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 4cbe03287b0..beeb7cc0704 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -347,8 +347,7 @@ static struct amd_nb *amd_alloc_nb(int cpu) struct amd_nb *nb; int i; - nb = kmalloc_node(sizeof(struct amd_nb), GFP_KERNEL | __GFP_ZERO, - cpu_to_node(cpu)); + nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu)); if (!nb) return NULL; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index a45d8d4ace1..0fa4f242f05 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -81,7 +81,8 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), EVENT_EXTRA_END }; @@ -123,6 +124,7 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly = INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */ INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */ INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */ + INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */ INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */ @@ -143,8 +145,9 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly = static struct extra_reg intel_westmere_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), - INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1), + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1), INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), EVENT_EXTRA_END }; @@ -162,23 +165,34 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly = EVENT_CONSTRAINT_END }; +static struct event_constraint intel_slm_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */ + EVENT_CONSTRAINT_END +}; + static struct extra_reg intel_snb_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), - INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1), + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1), INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), EVENT_EXTRA_END }; static struct extra_reg intel_snbep_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), - INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), EVENT_EXTRA_END }; -EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); -EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); -EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); +EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); +EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); +EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); struct attribute *nhm_events_attrs[] = { EVENT_PTR(mem_ld_nhm), @@ -882,6 +896,140 @@ static __initconst const u64 atom_hw_cache_event_ids }, }; +static struct extra_reg intel_slm_extra_regs[] __read_mostly = +{ + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffffull, RSP_1), + EVENT_EXTRA_END +}; + +#define SLM_DMND_READ SNB_DMND_DATA_RD +#define SLM_DMND_WRITE SNB_DMND_RFO +#define SLM_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) + +#define SLM_SNP_ANY (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM) +#define SLM_LLC_ACCESS SNB_RESP_ANY +#define SLM_LLC_MISS (SLM_SNP_ANY|SNB_NON_DRAM) + +static __initconst const u64 slm_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_READ|SLM_LLC_MISS, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_WRITE|SLM_LLC_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_PREFETCH|SLM_LLC_MISS, + }, + }, +}; + +static __initconst const u64 slm_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0104, /* LD_DCU_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */ + [ C(RESULT_MISS) ] = 0x0280, /* ICACGE.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0804, /* LD_DTLB_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) { /* user explicitly requested branch sampling */ @@ -1036,6 +1184,11 @@ static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) wrmsrl(hwc->config_base, ctrl_val); } +static inline bool event_is_checkpointed(struct perf_event *event) +{ + return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0; +} + static void intel_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; @@ -1049,6 +1202,7 @@ static void intel_pmu_disable_event(struct perf_event *event) cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx); cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); + cpuc->intel_cp_status &= ~(1ull << hwc->idx); /* * must disable before any actual event @@ -1123,6 +1277,9 @@ static void intel_pmu_enable_event(struct perf_event *event) if (event->attr.exclude_guest) cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx); + if (unlikely(event_is_checkpointed(event))) + cpuc->intel_cp_status |= (1ull << hwc->idx); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_enable_fixed(hwc); return; @@ -1141,6 +1298,17 @@ static void intel_pmu_enable_event(struct perf_event *event) int intel_pmu_save_and_restart(struct perf_event *event) { x86_perf_event_update(event); + /* + * For a checkpointed counter always reset back to 0. This + * avoids a situation where the counter overflows, aborts the + * transaction and is then set back to shortly before the + * overflow, and overflows and aborts again. + */ + if (unlikely(event_is_checkpointed(event))) { + /* No race with NMIs because the counter should not be armed */ + wrmsrl(event->hw.event_base, 0); + local64_set(&event->hw.prev_count, 0); + } return x86_perf_event_set_period(event); } @@ -1224,6 +1392,13 @@ again: x86_pmu.drain_pebs(regs); } + /* + * Checkpointed counters can lead to 'spurious' PMIs because the + * rollback caused by the PMI will have cleared the overflow status + * bit. Therefore always force probe these counters. + */ + status |= cpuc->intel_cp_status; + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[bit]; @@ -1301,11 +1476,11 @@ static void intel_fixup_er(struct perf_event *event, int idx) if (idx == EXTRA_REG_RSP_0) { event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= 0x01b7; + event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event; event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; } else if (idx == EXTRA_REG_RSP_1) { event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= 0x01bb; + event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event; event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; } } @@ -1689,6 +1864,20 @@ static int hsw_hw_config(struct perf_event *event) event->attr.precise_ip > 0)) return -EOPNOTSUPP; + if (event_is_checkpointed(event)) { + /* + * Sampling of checkpointed events can cause situations where + * the CPU constantly aborts because of a overflow, which is + * then checkpointed back and ignored. Forbid checkpointing + * for sampling. + * + * But still allow a long sampling period, so that perf stat + * from KVM works. + */ + if (event->attr.sample_period > 0 && + event->attr.sample_period < 0x7fffffff) + return -EOPNOTSUPP; + } return 0; } @@ -2034,10 +2223,36 @@ static __init void intel_nehalem_quirk(void) } } -EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3"); -EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82") +EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3"); +EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82") + +/* Haswell special events */ +EVENT_ATTR_STR(tx-start, tx_start, "event=0xc9,umask=0x1"); +EVENT_ATTR_STR(tx-commit, tx_commit, "event=0xc9,umask=0x2"); +EVENT_ATTR_STR(tx-abort, tx_abort, "event=0xc9,umask=0x4"); +EVENT_ATTR_STR(tx-capacity, tx_capacity, "event=0x54,umask=0x2"); +EVENT_ATTR_STR(tx-conflict, tx_conflict, "event=0x54,umask=0x1"); +EVENT_ATTR_STR(el-start, el_start, "event=0xc8,umask=0x1"); +EVENT_ATTR_STR(el-commit, el_commit, "event=0xc8,umask=0x2"); +EVENT_ATTR_STR(el-abort, el_abort, "event=0xc8,umask=0x4"); +EVENT_ATTR_STR(el-capacity, el_capacity, "event=0x54,umask=0x2"); +EVENT_ATTR_STR(el-conflict, el_conflict, "event=0x54,umask=0x1"); +EVENT_ATTR_STR(cycles-t, cycles_t, "event=0x3c,in_tx=1"); +EVENT_ATTR_STR(cycles-ct, cycles_ct, "event=0x3c,in_tx=1,in_tx_cp=1"); static struct attribute *hsw_events_attrs[] = { + EVENT_PTR(tx_start), + EVENT_PTR(tx_commit), + EVENT_PTR(tx_abort), + EVENT_PTR(tx_capacity), + EVENT_PTR(tx_conflict), + EVENT_PTR(el_start), + EVENT_PTR(el_commit), + EVENT_PTR(el_abort), + EVENT_PTR(el_capacity), + EVENT_PTR(el_conflict), + EVENT_PTR(cycles_t), + EVENT_PTR(cycles_ct), EVENT_PTR(mem_ld_hsw), EVENT_PTR(mem_st_hsw), NULL @@ -2176,6 +2391,22 @@ __init int intel_pmu_init(void) pr_cont("Atom events, "); break; + case 55: /* Atom 22nm "Silvermont" */ + case 77: /* Avoton "Silvermont" */ + memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_atom(); + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; + x86_pmu.extra_regs = intel_slm_extra_regs; + x86_pmu.er_flags |= ERF_HAS_RSP_1; + pr_cont("Silvermont events, "); + break; + case 37: /* 32 nm nehalem, "Clarkdale" */ case 44: /* 32 nm nehalem, "Gulftown" */ case 47: /* 32 nm Xeon E7 */ @@ -2288,6 +2519,7 @@ __init int intel_pmu_init(void) x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; x86_pmu.cpu_events = hsw_events_attrs; + x86_pmu.lbr_double_abort = true; pr_cont("Haswell events, "); break; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 3065c57a63c..ae96cfa5edd 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -12,6 +12,7 @@ #define BTS_BUFFER_SIZE (PAGE_SIZE << 4) #define PEBS_BUFFER_SIZE PAGE_SIZE +#define PEBS_FIXUP_SIZE PAGE_SIZE /* * pebs_record_32 for p4 and core not supported @@ -182,18 +183,32 @@ struct pebs_record_nhm { * Same as pebs_record_nhm, with two additional fields. */ struct pebs_record_hsw { - struct pebs_record_nhm nhm; - /* - * Real IP of the event. In the Intel documentation this - * is called eventingrip. - */ - u64 real_ip; - /* - * TSX tuning information field: abort cycles and abort flags. - */ - u64 tsx_tuning; + u64 flags, ip; + u64 ax, bx, cx, dx; + u64 si, di, bp, sp; + u64 r8, r9, r10, r11; + u64 r12, r13, r14, r15; + u64 status, dla, dse, lat; + u64 real_ip, tsx_tuning; +}; + +union hsw_tsx_tuning { + struct { + u32 cycles_last_block : 32, + hle_abort : 1, + rtm_abort : 1, + instruction_abort : 1, + non_instruction_abort : 1, + retry : 1, + data_conflict : 1, + capacity_writes : 1, + capacity_reads : 1; + }; + u64 value; }; +#define PEBS_HSW_TSX_FLAGS 0xff00000000ULL + void init_debug_store_on_cpu(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -214,20 +229,35 @@ void fini_debug_store_on_cpu(int cpu) wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); } +static DEFINE_PER_CPU(void *, insn_buffer); + static int alloc_pebs_buffer(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; int node = cpu_to_node(cpu); int max, thresh = 1; /* always use a single PEBS record */ - void *buffer; + void *buffer, *ibuffer; if (!x86_pmu.pebs) return 0; - buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); + buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node); if (unlikely(!buffer)) return -ENOMEM; + /* + * HSW+ already provides us the eventing ip; no need to allocate this + * buffer then. + */ + if (x86_pmu.intel_cap.pebs_format < 2) { + ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); + if (!ibuffer) { + kfree(buffer); + return -ENOMEM; + } + per_cpu(insn_buffer, cpu) = ibuffer; + } + max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; ds->pebs_buffer_base = (u64)(unsigned long)buffer; @@ -248,6 +278,9 @@ static void release_pebs_buffer(int cpu) if (!ds || !x86_pmu.pebs) return; + kfree(per_cpu(insn_buffer, cpu)); + per_cpu(insn_buffer, cpu) = NULL; + kfree((void *)(unsigned long)ds->pebs_buffer_base); ds->pebs_buffer_base = 0; } @@ -262,7 +295,7 @@ static int alloc_bts_buffer(int cpu) if (!x86_pmu.bts) return 0; - buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); + buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL, node); if (unlikely(!buffer)) return -ENOMEM; @@ -295,7 +328,7 @@ static int alloc_ds_buffer(int cpu) int node = cpu_to_node(cpu); struct debug_store *ds; - ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node); + ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); if (unlikely(!ds)) return -ENOMEM; @@ -517,6 +550,32 @@ struct event_constraint intel_atom_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_slm_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x0103, 0x1), /* REHABQ.LD_BLOCK_ST_FORWARD_PS */ + INTEL_UEVENT_CONSTRAINT(0x0803, 0x1), /* REHABQ.LD_SPLITS_PS */ + INTEL_UEVENT_CONSTRAINT(0x0204, 0x1), /* MEM_UOPS_RETIRED.L2_HIT_LOADS_PS */ + INTEL_UEVENT_CONSTRAINT(0x0404, 0x1), /* MEM_UOPS_RETIRED.L2_MISS_LOADS_PS */ + INTEL_UEVENT_CONSTRAINT(0x0804, 0x1), /* MEM_UOPS_RETIRED.DTLB_MISS_LOADS_PS */ + INTEL_UEVENT_CONSTRAINT(0x2004, 0x1), /* MEM_UOPS_RETIRED.HITM_PS */ + INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY_PS */ + INTEL_UEVENT_CONSTRAINT(0x00c4, 0x1), /* BR_INST_RETIRED.ALL_BRANCHES_PS */ + INTEL_UEVENT_CONSTRAINT(0x7ec4, 0x1), /* BR_INST_RETIRED.JCC_PS */ + INTEL_UEVENT_CONSTRAINT(0xbfc4, 0x1), /* BR_INST_RETIRED.FAR_BRANCH_PS */ + INTEL_UEVENT_CONSTRAINT(0xebc4, 0x1), /* BR_INST_RETIRED.NON_RETURN_IND_PS */ + INTEL_UEVENT_CONSTRAINT(0xf7c4, 0x1), /* BR_INST_RETIRED.RETURN_PS */ + INTEL_UEVENT_CONSTRAINT(0xf9c4, 0x1), /* BR_INST_RETIRED.CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfbc4, 0x1), /* BR_INST_RETIRED.IND_CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfdc4, 0x1), /* BR_INST_RETIRED.REL_CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfec4, 0x1), /* BR_INST_RETIRED.TAKEN_JCC_PS */ + INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_MISP_RETIRED.ALL_BRANCHES_PS */ + INTEL_UEVENT_CONSTRAINT(0x7ec5, 0x1), /* BR_INST_MISP_RETIRED.JCC_PS */ + INTEL_UEVENT_CONSTRAINT(0xebc5, 0x1), /* BR_INST_MISP_RETIRED.NON_RETURN_IND_PS */ + INTEL_UEVENT_CONSTRAINT(0xf7c5, 0x1), /* BR_INST_MISP_RETIRED.RETURN_PS */ + INTEL_UEVENT_CONSTRAINT(0xfbc5, 0x1), /* BR_INST_MISP_RETIRED.IND_CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfec5, 0x1), /* BR_INST_MISP_RETIRED.TAKEN_JCC_PS */ + EVENT_CONSTRAINT_END +}; + struct event_constraint intel_nehalem_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ @@ -558,6 +617,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = { INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */ EVENT_CONSTRAINT_END }; @@ -688,6 +748,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) unsigned long old_to, to = cpuc->lbr_entries[0].to; unsigned long ip = regs->ip; int is_64bit = 0; + void *kaddr; /* * We don't need to fixup if the PEBS assist is fault like @@ -711,7 +772,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) * unsigned math, either ip is before the start (impossible) or * the basic block is larger than 1 page (sanity) */ - if ((ip - to) > PAGE_SIZE) + if ((ip - to) > PEBS_FIXUP_SIZE) return 0; /* @@ -722,29 +783,33 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) return 1; } + if (!kernel_ip(ip)) { + int size, bytes; + u8 *buf = this_cpu_read(insn_buffer); + + size = ip - to; /* Must fit our buffer, see above */ + bytes = copy_from_user_nmi(buf, (void __user *)to, size); + if (bytes != 0) + return 0; + + kaddr = buf; + } else { + kaddr = (void *)to; + } + do { struct insn insn; - u8 buf[MAX_INSN_SIZE]; - void *kaddr; old_to = to; - if (!kernel_ip(ip)) { - int bytes, size = MAX_INSN_SIZE; - - bytes = copy_from_user_nmi(buf, (void __user *)to, size); - if (bytes != size) - return 0; - - kaddr = buf; - } else - kaddr = (void *)to; #ifdef CONFIG_X86_64 is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32); #endif insn_init(&insn, kaddr, is_64bit); insn_get_length(&insn); + to += insn.length; + kaddr += insn.length; } while (to < ip); if (to == ip) { @@ -759,16 +824,34 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) return 0; } +static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs) +{ + if (pebs->tsx_tuning) { + union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning }; + return tsx.cycles_last_block; + } + return 0; +} + +static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs) +{ + u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32; + + /* For RTM XABORTs also log the abort code from AX */ + if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1)) + txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT; + return txn; +} + static void __intel_pmu_pebs_event(struct perf_event *event, struct pt_regs *iregs, void *__pebs) { /* - * We cast to pebs_record_nhm to get the load latency data - * if extra_reg MSR_PEBS_LD_LAT_THRESHOLD used + * We cast to the biggest pebs_record but are careful not to + * unconditionally access the 'extra' entries. */ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct pebs_record_nhm *pebs = __pebs; - struct pebs_record_hsw *pebs_hsw = __pebs; + struct pebs_record_hsw *pebs = __pebs; struct perf_sample_data data; struct pt_regs regs; u64 sample_type; @@ -827,7 +910,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, regs.sp = pebs->sp; if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) { - regs.ip = pebs_hsw->real_ip; + regs.ip = pebs->real_ip; regs.flags |= PERF_EFLAGS_EXACT; } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(®s)) regs.flags |= PERF_EFLAGS_EXACT; @@ -835,9 +918,18 @@ static void __intel_pmu_pebs_event(struct perf_event *event, regs.flags &= ~PERF_EFLAGS_EXACT; if ((event->attr.sample_type & PERF_SAMPLE_ADDR) && - x86_pmu.intel_cap.pebs_format >= 1) + x86_pmu.intel_cap.pebs_format >= 1) data.addr = pebs->dla; + if (x86_pmu.intel_cap.pebs_format >= 2) { + /* Only set the TSX weight when no memory weight. */ + if ((event->attr.sample_type & PERF_SAMPLE_WEIGHT) && !fll) + data.weight = intel_hsw_weight(pebs); + + if (event->attr.sample_type & PERF_SAMPLE_TRANSACTION) + data.txn = intel_hsw_transaction(pebs); + } + if (has_branch_stack(event)) data.br_stack = &cpuc->lbr_stack; @@ -886,17 +978,34 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) __intel_pmu_pebs_event(event, iregs, at); } -static void __intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, void *at, - void *top) +static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct debug_store *ds = cpuc->ds; struct perf_event *event = NULL; + void *at, *top; u64 status = 0; int bit; + if (!x86_pmu.pebs_active) + return; + + at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; + top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index; + ds->pebs_index = ds->pebs_buffer_base; + if (unlikely(at > top)) + return; + + /* + * Should not happen, we program the threshold at 1 and do not + * set a reset value. + */ + WARN_ONCE(top - at > x86_pmu.max_pebs_events * x86_pmu.pebs_record_size, + "Unexpected number of pebs records %ld\n", + (long)(top - at) / x86_pmu.pebs_record_size); + for (; at < top; at += x86_pmu.pebs_record_size) { struct pebs_record_nhm *p = at; @@ -924,61 +1033,6 @@ static void __intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, void *at, } } -static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct debug_store *ds = cpuc->ds; - struct pebs_record_nhm *at, *top; - int n; - - if (!x86_pmu.pebs_active) - return; - - at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; - top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index; - - ds->pebs_index = ds->pebs_buffer_base; - - n = top - at; - if (n <= 0) - return; - - /* - * Should not happen, we program the threshold at 1 and do not - * set a reset value. - */ - WARN_ONCE(n > x86_pmu.max_pebs_events, - "Unexpected number of pebs records %d\n", n); - - return __intel_pmu_drain_pebs_nhm(iregs, at, top); -} - -static void intel_pmu_drain_pebs_hsw(struct pt_regs *iregs) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct debug_store *ds = cpuc->ds; - struct pebs_record_hsw *at, *top; - int n; - - if (!x86_pmu.pebs_active) - return; - - at = (struct pebs_record_hsw *)(unsigned long)ds->pebs_buffer_base; - top = (struct pebs_record_hsw *)(unsigned long)ds->pebs_index; - - n = top - at; - if (n <= 0) - return; - /* - * Should not happen, we program the threshold at 1 and do not - * set a reset value. - */ - WARN_ONCE(n > x86_pmu.max_pebs_events, - "Unexpected number of pebs records %d\n", n); - - return __intel_pmu_drain_pebs_nhm(iregs, at, top); -} - /* * BTS, PEBS probe and setup */ @@ -1013,7 +1067,7 @@ void intel_ds_init(void) case 2: pr_cont("PEBS fmt2%c, ", pebs_type); x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw); - x86_pmu.drain_pebs = intel_pmu_drain_pebs_hsw; + x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; break; default: diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index d5be06a5005..d82d155aca8 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -284,6 +284,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) int lbr_format = x86_pmu.intel_cap.lbr_format; u64 tos = intel_pmu_lbr_tos(); int i; + int out = 0; for (i = 0; i < x86_pmu.lbr_nr; i++) { unsigned long lbr_idx = (tos - i) & mask; @@ -306,15 +307,27 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) } from = (u64)((((s64)from) << skip) >> skip); - cpuc->lbr_entries[i].from = from; - cpuc->lbr_entries[i].to = to; - cpuc->lbr_entries[i].mispred = mis; - cpuc->lbr_entries[i].predicted = pred; - cpuc->lbr_entries[i].in_tx = in_tx; - cpuc->lbr_entries[i].abort = abort; - cpuc->lbr_entries[i].reserved = 0; + /* + * Some CPUs report duplicated abort records, + * with the second entry not having an abort bit set. + * Skip them here. This loop runs backwards, + * so we need to undo the previous record. + * If the abort just happened outside the window + * the extra entry cannot be removed. + */ + if (abort && x86_pmu.lbr_double_abort && out > 0) + out--; + + cpuc->lbr_entries[out].from = from; + cpuc->lbr_entries[out].to = to; + cpuc->lbr_entries[out].mispred = mis; + cpuc->lbr_entries[out].predicted = pred; + cpuc->lbr_entries[out].in_tx = in_tx; + cpuc->lbr_entries[out].abort = abort; + cpuc->lbr_entries[out].reserved = 0; + out++; } - cpuc->lbr_stack.nr = i; + cpuc->lbr_stack.nr = out; } void intel_pmu_lbr_read(void) @@ -478,7 +491,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort) /* may fail if text not present */ bytes = copy_from_user_nmi(buf, (void __user *)from, size); - if (bytes != size) + if (bytes != 0) return X86_BR_NONE; addr = buf; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 1fb6c72717b..29c248799ce 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -6,6 +6,8 @@ static struct intel_uncore_type **pci_uncores = empty_uncore; /* pci bus to socket mapping */ static int pcibus_to_physid[256] = { [0 ... 255] = -1, }; +static struct pci_dev *extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; + static DEFINE_RAW_SPINLOCK(uncore_box_lock); /* mask of cpus that collect uncore events */ @@ -45,6 +47,24 @@ DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7"); DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15"); DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23"); DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51"); +DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35"); +DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31"); +DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17"); +DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12"); +DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8"); +DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4"); +DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31"); +DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17"); +DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12"); +DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8"); +DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4"); +DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63"); static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) { @@ -281,7 +301,7 @@ static struct attribute *snbep_uncore_cbox_formats_attr[] = { }; static struct attribute *snbep_uncore_pcu_formats_attr[] = { - &format_attr_event.attr, + &format_attr_event_ext.attr, &format_attr_occ_sel.attr, &format_attr_edge.attr, &format_attr_inv.attr, @@ -301,6 +321,24 @@ static struct attribute *snbep_uncore_qpi_formats_attr[] = { &format_attr_edge.attr, &format_attr_inv.attr, &format_attr_thresh8.attr, + &format_attr_match_rds.attr, + &format_attr_match_rnid30.attr, + &format_attr_match_rnid4.attr, + &format_attr_match_dnid.attr, + &format_attr_match_mc.attr, + &format_attr_match_opc.attr, + &format_attr_match_vnw.attr, + &format_attr_match0.attr, + &format_attr_match1.attr, + &format_attr_mask_rds.attr, + &format_attr_mask_rnid30.attr, + &format_attr_mask_rnid4.attr, + &format_attr_mask_dnid.attr, + &format_attr_mask_mc.attr, + &format_attr_mask_opc.attr, + &format_attr_mask_vnw.attr, + &format_attr_mask0.attr, + &format_attr_mask1.attr, NULL, }; @@ -356,13 +394,16 @@ static struct intel_uncore_ops snbep_uncore_msr_ops = { SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), }; +#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT() \ + .init_box = snbep_uncore_pci_init_box, \ + .disable_box = snbep_uncore_pci_disable_box, \ + .enable_box = snbep_uncore_pci_enable_box, \ + .disable_event = snbep_uncore_pci_disable_event, \ + .read_counter = snbep_uncore_pci_read_counter + static struct intel_uncore_ops snbep_uncore_pci_ops = { - .init_box = snbep_uncore_pci_init_box, - .disable_box = snbep_uncore_pci_disable_box, - .enable_box = snbep_uncore_pci_enable_box, - .disable_event = snbep_uncore_pci_disable_event, - .enable_event = snbep_uncore_pci_enable_event, - .read_counter = snbep_uncore_pci_read_counter, + SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), + .enable_event = snbep_uncore_pci_enable_event, \ }; static struct event_constraint snbep_uncore_cbox_constraints[] = { @@ -726,6 +767,61 @@ static struct intel_uncore_type *snbep_msr_uncores[] = { NULL, }; +enum { + SNBEP_PCI_QPI_PORT0_FILTER, + SNBEP_PCI_QPI_PORT1_FILTER, +}; + +static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) { + reg1->idx = 0; + reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0; + reg1->config = event->attr.config1; + reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0; + reg2->config = event->attr.config2; + } + return 0; +} + +static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER; + struct pci_dev *filter_pdev = extra_pci_dev[box->phys_id][idx]; + WARN_ON_ONCE(!filter_pdev); + if (filter_pdev) { + pci_write_config_dword(filter_pdev, reg1->reg, + (u32)reg1->config); + pci_write_config_dword(filter_pdev, reg1->reg + 4, + (u32)(reg1->config >> 32)); + pci_write_config_dword(filter_pdev, reg2->reg, + (u32)reg2->config); + pci_write_config_dword(filter_pdev, reg2->reg + 4, + (u32)(reg2->config >> 32)); + } + } + + pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static struct intel_uncore_ops snbep_uncore_qpi_ops = { + SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), + .enable_event = snbep_qpi_enable_event, + .hw_config = snbep_qpi_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + #define SNBEP_UNCORE_PCI_COMMON_INIT() \ .perf_ctr = SNBEP_PCI_PMON_CTR0, \ .event_ctl = SNBEP_PCI_PMON_CTL0, \ @@ -755,17 +851,18 @@ static struct intel_uncore_type snbep_uncore_imc = { }; static struct intel_uncore_type snbep_uncore_qpi = { - .name = "qpi", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCI_PMON_CTR0, - .event_ctl = SNBEP_PCI_PMON_CTL0, - .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .ops = &snbep_uncore_pci_ops, - .event_descs = snbep_uncore_qpi_events, - .format_group = &snbep_uncore_qpi_format_group, + .name = "qpi", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_qpi_ops, + .event_descs = snbep_uncore_qpi_events, + .format_group = &snbep_uncore_qpi_format_group, }; @@ -807,43 +904,53 @@ static struct intel_uncore_type *snbep_pci_uncores[] = { static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = { { /* Home Agent */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA), - .driver_data = SNBEP_PCI_UNCORE_HA, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0), }, { /* MC Channel 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0), - .driver_data = SNBEP_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0), }, { /* MC Channel 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1), - .driver_data = SNBEP_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1), }, { /* MC Channel 2 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2), - .driver_data = SNBEP_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2), }, { /* MC Channel 3 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3), - .driver_data = SNBEP_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3), }, { /* QPI Port 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0), - .driver_data = SNBEP_PCI_UNCORE_QPI, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0), }, { /* QPI Port 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1), - .driver_data = SNBEP_PCI_UNCORE_QPI, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1), }, { /* R2PCIe */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE), - .driver_data = SNBEP_PCI_UNCORE_R2PCIE, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0), }, { /* R3QPI Link 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0), - .driver_data = SNBEP_PCI_UNCORE_R3QPI, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0), }, { /* R3QPI Link 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1), - .driver_data = SNBEP_PCI_UNCORE_R3QPI, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT0_FILTER), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT1_FILTER), }, { /* end: all zeroes */ } }; @@ -890,6 +997,20 @@ static int snbep_pci2phy_map_init(int devid) } } + if (!err) { + /* + * For PCI bus with no UBOX device, find the next bus + * that has UBOX device and use its mapping. + */ + i = -1; + for (bus = 255; bus >= 0; bus--) { + if (pcibus_to_physid[bus] >= 0) + i = pcibus_to_physid[bus]; + else + pcibus_to_physid[bus] = i; + } + } + if (ubox_dev) pci_dev_put(ubox_dev); @@ -992,6 +1113,24 @@ static struct attribute *ivt_uncore_qpi_formats_attr[] = { &format_attr_umask.attr, &format_attr_edge.attr, &format_attr_thresh8.attr, + &format_attr_match_rds.attr, + &format_attr_match_rnid30.attr, + &format_attr_match_rnid4.attr, + &format_attr_match_dnid.attr, + &format_attr_match_mc.attr, + &format_attr_match_opc.attr, + &format_attr_match_vnw.attr, + &format_attr_match0.attr, + &format_attr_match1.attr, + &format_attr_mask_rds.attr, + &format_attr_mask_rnid30.attr, + &format_attr_mask_rnid4.attr, + &format_attr_mask_dnid.attr, + &format_attr_mask_mc.attr, + &format_attr_mask_opc.attr, + &format_attr_mask_vnw.attr, + &format_attr_mask0.attr, + &format_attr_mask1.attr, NULL, }; @@ -1205,17 +1344,83 @@ static struct intel_uncore_type ivt_uncore_imc = { IVT_UNCORE_PCI_COMMON_INIT(), }; +/* registers in IRP boxes are not properly aligned */ +static unsigned ivt_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4}; +static unsigned ivt_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0}; + +static void ivt_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx], + hwc->config | SNBEP_PMON_CTL_EN); +} + +static void ivt_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx], hwc->config); +} + +static u64 ivt_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count = 0; + + pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx], (u32 *)&count); + pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1); + + return count; +} + +static struct intel_uncore_ops ivt_uncore_irp_ops = { + .init_box = ivt_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = ivt_uncore_irp_disable_event, + .enable_event = ivt_uncore_irp_enable_event, + .read_counter = ivt_uncore_irp_read_counter, +}; + +static struct intel_uncore_type ivt_uncore_irp = { + .name = "irp", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_mask = IVT_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &ivt_uncore_irp_ops, + .format_group = &ivt_uncore_format_group, +}; + +static struct intel_uncore_ops ivt_uncore_qpi_ops = { + .init_box = ivt_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snbep_qpi_enable_event, + .read_counter = snbep_uncore_pci_read_counter, + .hw_config = snbep_qpi_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + static struct intel_uncore_type ivt_uncore_qpi = { - .name = "qpi", - .num_counters = 4, - .num_boxes = 3, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCI_PMON_CTR0, - .event_ctl = SNBEP_PCI_PMON_CTL0, - .event_mask = IVT_QPI_PCI_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .ops = &ivt_uncore_pci_ops, - .format_group = &ivt_uncore_qpi_format_group, + .name = "qpi", + .num_counters = 4, + .num_boxes = 3, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = IVT_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &ivt_uncore_qpi_ops, + .format_group = &ivt_uncore_qpi_format_group, }; static struct intel_uncore_type ivt_uncore_r2pcie = { @@ -1239,6 +1444,7 @@ static struct intel_uncore_type ivt_uncore_r3qpi = { enum { IVT_PCI_UNCORE_HA, IVT_PCI_UNCORE_IMC, + IVT_PCI_UNCORE_IRP, IVT_PCI_UNCORE_QPI, IVT_PCI_UNCORE_R2PCIE, IVT_PCI_UNCORE_R3QPI, @@ -1247,6 +1453,7 @@ enum { static struct intel_uncore_type *ivt_pci_uncores[] = { [IVT_PCI_UNCORE_HA] = &ivt_uncore_ha, [IVT_PCI_UNCORE_IMC] = &ivt_uncore_imc, + [IVT_PCI_UNCORE_IRP] = &ivt_uncore_irp, [IVT_PCI_UNCORE_QPI] = &ivt_uncore_qpi, [IVT_PCI_UNCORE_R2PCIE] = &ivt_uncore_r2pcie, [IVT_PCI_UNCORE_R3QPI] = &ivt_uncore_r3qpi, @@ -1256,71 +1463,85 @@ static struct intel_uncore_type *ivt_pci_uncores[] = { static DEFINE_PCI_DEVICE_TABLE(ivt_uncore_pci_ids) = { { /* Home Agent 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30), - .driver_data = IVT_PCI_UNCORE_HA, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 0), }, { /* Home Agent 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38), - .driver_data = IVT_PCI_UNCORE_HA, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 1), }, { /* MC0 Channel 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 0), }, { /* MC0 Channel 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 1), }, { /* MC0 Channel 3 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 2), }, { /* MC0 Channel 4 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 3), }, { /* MC1 Channel 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 4), }, { /* MC1 Channel 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 5), }, { /* MC1 Channel 3 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 6), }, { /* MC1 Channel 4 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 7), + }, + { /* IRP */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IRP, 0), }, { /* QPI0 Port 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32), - .driver_data = IVT_PCI_UNCORE_QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 0), }, { /* QPI0 Port 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33), - .driver_data = IVT_PCI_UNCORE_QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 1), }, { /* QPI1 Port 2 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a), - .driver_data = IVT_PCI_UNCORE_QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 2), }, { /* R2PCIe */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34), - .driver_data = IVT_PCI_UNCORE_R2PCIE, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R2PCIE, 0), }, { /* R3QPI0 Link 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36), - .driver_data = IVT_PCI_UNCORE_R3QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 0), }, { /* R3QPI0 Link 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37), - .driver_data = IVT_PCI_UNCORE_R3QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 1), }, { /* R3QPI1 Link 2 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e), - .driver_data = IVT_PCI_UNCORE_R3QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 2), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT0_FILTER), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT1_FILTER), }, { /* end: all zeroes */ } }; @@ -2599,14 +2820,14 @@ static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) box->hrtimer.function = uncore_pmu_hrtimer; } -struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int cpu) +static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int node) { struct intel_uncore_box *box; int i, size; size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg); - box = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu)); + box = kzalloc_node(size, GFP_KERNEL, node); if (!box) return NULL; @@ -2701,7 +2922,7 @@ uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *eve return c; } - if (event->hw.config == ~0ULL) + if (event->attr.config == UNCORE_FIXED_EVENT) return &constraint_fixed; if (type->constraints) { @@ -2924,7 +3145,7 @@ static int uncore_validate_group(struct intel_uncore_pmu *pmu, struct intel_uncore_box *fake_box; int ret = -EINVAL, n; - fake_box = uncore_alloc_box(pmu->type, smp_processor_id()); + fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE); if (!fake_box) return -ENOMEM; @@ -3005,7 +3226,9 @@ static int uncore_pmu_event_init(struct perf_event *event) */ if (pmu->type->single_fixed && pmu->pmu_idx > 0) return -EINVAL; - hwc->config = ~0ULL; + + /* fixed counters have event field hardcoded to zero */ + hwc->config = 0ULL; } else { hwc->config = event->attr.config & pmu->type->event_mask; if (pmu->type->ops->hw_config) { @@ -3167,17 +3390,25 @@ static bool pcidrv_registered; /* * add a pci uncore device */ -static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) +static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; - int i, phys_id; + struct intel_uncore_type *type; + int phys_id; phys_id = pcibus_to_physid[pdev->bus->number]; if (phys_id < 0) return -ENODEV; - box = uncore_alloc_box(type, 0); + if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { + extra_pci_dev[phys_id][UNCORE_PCI_DEV_IDX(id->driver_data)] = pdev; + pci_set_drvdata(pdev, NULL); + return 0; + } + + type = pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; + box = uncore_alloc_box(type, NUMA_NO_NODE); if (!box) return -ENOMEM; @@ -3185,21 +3416,11 @@ static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) * for performance monitoring unit with multiple boxes, * each box has a different function id. */ - for (i = 0; i < type->num_boxes; i++) { - pmu = &type->pmus[i]; - if (pmu->func_id == pdev->devfn) - break; - if (pmu->func_id < 0) { - pmu->func_id = pdev->devfn; - break; - } - pmu = NULL; - } - - if (!pmu) { - kfree(box); - return -EINVAL; - } + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; + if (pmu->func_id < 0) + pmu->func_id = pdev->devfn; + else + WARN_ON_ONCE(pmu->func_id != pdev->devfn); box->phys_id = phys_id; box->pci_dev = pdev; @@ -3217,9 +3438,22 @@ static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box = pci_get_drvdata(pdev); - struct intel_uncore_pmu *pmu = box->pmu; - int cpu, phys_id = pcibus_to_physid[pdev->bus->number]; + struct intel_uncore_pmu *pmu; + int i, cpu, phys_id = pcibus_to_physid[pdev->bus->number]; + + box = pci_get_drvdata(pdev); + if (!box) { + for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { + if (extra_pci_dev[phys_id][i] == pdev) { + extra_pci_dev[phys_id][i] = NULL; + break; + } + } + WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX); + return; + } + pmu = box->pmu; if (WARN_ON_ONCE(phys_id != box->phys_id)) return; @@ -3240,12 +3474,6 @@ static void uncore_pci_remove(struct pci_dev *pdev) kfree(box); } -static int uncore_pci_probe(struct pci_dev *pdev, - const struct pci_device_id *id) -{ - return uncore_pci_add(pci_uncores[id->driver_data], pdev); -} - static int __init uncore_pci_init(void) { int ret; @@ -3385,7 +3613,7 @@ static int uncore_cpu_prepare(int cpu, int phys_id) if (pmu->func_id < 0) pmu->func_id = j; - box = uncore_alloc_box(type, cpu); + box = uncore_alloc_box(type, cpu_to_node(cpu)); if (!box) return -ENOMEM; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 47b3d00c9d8..a80ab71a883 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -12,6 +12,15 @@ #define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC #define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1) +#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx) +#define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff) +#define UNCORE_PCI_DEV_IDX(data) (data & 0xff) +#define UNCORE_EXTRA_PCI_DEV 0xff +#define UNCORE_EXTRA_PCI_DEV_MAX 2 + +/* support up to 8 sockets */ +#define UNCORE_SOCKET_MAX 8 + #define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) /* SNB event control */ @@ -108,6 +117,7 @@ (SNBEP_PMON_CTL_EV_SEL_MASK | \ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_EV_SEL_EXT | \ SNBEP_PMON_CTL_INVERT | \ SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index aee6317b902..06fe3ed8b85 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -11,15 +11,12 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, unsigned int cpu) { #ifdef CONFIG_SMP - if (c->x86_max_cores * smp_num_siblings > 1) { - seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); - seq_printf(m, "siblings\t: %d\n", - cpumask_weight(cpu_core_mask(cpu))); - seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); - seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); - seq_printf(m, "apicid\t\t: %d\n", c->apicid); - seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid); - } + seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); + seq_printf(m, "siblings\t: %d\n", cpumask_weight(cpu_core_mask(cpu))); + seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); + seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); + seq_printf(m, "apicid\t\t: %d\n", c->apicid); + seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid); #endif } diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c index 202759a1412..75c5ad5d35c 100644 --- a/arch/x86/kernel/cpu/umc.c +++ b/arch/x86/kernel/cpu/umc.c @@ -11,8 +11,8 @@ static const struct cpu_dev umc_cpu_dev = { .c_vendor = "UMC", .c_ident = { "UMC UMC UMC" }, - .c_models = { - { .vendor = X86_VENDOR_UMC, .family = 4, .model_names = + .legacy_models = { + { .family = 4, .model_names = { [1] = "U5D", [2] = "U5S", diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 7076878404e..628a059a9a0 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -93,7 +93,7 @@ static void __init vmware_platform_setup(void) * serial key should be enough, as this will always have a VMware * specific string when running under VMware hypervisor. */ -static bool __init vmware_platform(void) +static uint32_t __init vmware_platform(void) { if (cpu_has_hypervisor) { unsigned int eax; @@ -102,12 +102,12 @@ static bool __init vmware_platform(void) cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0], &hyper_vendor_id[1], &hyper_vendor_id[2]); if (!memcmp(hyper_vendor_id, "VMwareVMware", 12)) - return true; + return CPUID_VMWARE_INFO_LEAF; } else if (dmi_available && dmi_name_in_serial("VMware") && __vmware_platform()) - return true; + return 1; - return false; + return 0; } /* diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 74467feb4dc..18677a90d6a 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -127,10 +127,12 @@ void native_machine_crash_shutdown(struct pt_regs *regs) cpu_emergency_vmxoff(); cpu_emergency_svm_disable(); - lapic_shutdown(); -#if defined(CONFIG_X86_IO_APIC) +#ifdef CONFIG_X86_IO_APIC + /* Prevent crash_kexec() from deadlocking on ioapic_lock. */ + ioapic_zap_locks(); disable_IO_APIC(); #endif + lapic_shutdown(); #ifdef CONFIG_HPET_TIMER hpet_disable(); #endif diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 69eb2fa2549..376dc787344 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -52,8 +52,7 @@ void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) } #ifdef CONFIG_BLK_DEV_INITRD -void __init early_init_dt_setup_initrd_arch(unsigned long start, - unsigned long end) +void __init early_init_dt_setup_initrd_arch(u64 start, u64 end) { initrd_start = (unsigned long)__va(start); initrd_end = (unsigned long)__va(end); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d32abeabbda..174da5fc5a7 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -658,15 +658,18 @@ __init void e820_setup_gap(void) * boot_params.e820_map, others are passed via SETUP_E820_EXT node of * linked list of struct setup_data, which is parsed here. */ -void __init parse_e820_ext(struct setup_data *sdata) +void __init parse_e820_ext(u64 phys_addr, u32 data_len) { int entries; struct e820entry *extmap; + struct setup_data *sdata; + sdata = early_memremap(phys_addr, data_len); entries = sdata->len / sizeof(struct e820entry); extmap = (struct e820entry *)(sdata->data); __append_e820_map(extmap, entries); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + early_iounmap(sdata, data_len); printk(KERN_INFO "e820: extended physical RAM map:\n"); e820_print_map("extended"); } diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 63bdb29b254..b3cd3ebae07 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -12,6 +12,7 @@ #include <linux/pci.h> #include <linux/acpi.h> #include <linux/pci_ids.h> +#include <drm/i915_drm.h> #include <asm/pci-direct.h> #include <asm/dma.h> #include <asm/io_apic.h> @@ -216,6 +217,157 @@ static void __init intel_remapping_check(int num, int slot, int func) } +/* + * Systems with Intel graphics controllers set aside memory exclusively + * for gfx driver use. This memory is not marked in the E820 as reserved + * or as RAM, and so is subject to overlap from E820 manipulation later + * in the boot process. On some systems, MMIO space is allocated on top, + * despite the efforts of the "RAM buffer" approach, which simply rounds + * memory boundaries up to 64M to try to catch space that may decode + * as RAM and so is not suitable for MMIO. + * + * And yes, so far on current devices the base addr is always under 4G. + */ +static u32 __init intel_stolen_base(int num, int slot, int func) +{ + u32 base; + + /* + * For the PCI IDs in this quirk, the stolen base is always + * in 0x5c, aka the BDSM register (yes that's really what + * it's called). + */ + base = read_pci_config(num, slot, func, 0x5c); + base &= ~((1<<20) - 1); + + return base; +} + +#define KB(x) ((x) * 1024) +#define MB(x) (KB (KB (x))) +#define GB(x) (MB (KB (x))) + +static size_t __init gen3_stolen_size(int num, int slot, int func) +{ + size_t stolen_size; + u16 gmch_ctrl; + + gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL); + + switch (gmch_ctrl & I855_GMCH_GMS_MASK) { + case I855_GMCH_GMS_STOLEN_1M: + stolen_size = MB(1); + break; + case I855_GMCH_GMS_STOLEN_4M: + stolen_size = MB(4); + break; + case I855_GMCH_GMS_STOLEN_8M: + stolen_size = MB(8); + break; + case I855_GMCH_GMS_STOLEN_16M: + stolen_size = MB(16); + break; + case I855_GMCH_GMS_STOLEN_32M: + stolen_size = MB(32); + break; + case I915_GMCH_GMS_STOLEN_48M: + stolen_size = MB(48); + break; + case I915_GMCH_GMS_STOLEN_64M: + stolen_size = MB(64); + break; + case G33_GMCH_GMS_STOLEN_128M: + stolen_size = MB(128); + break; + case G33_GMCH_GMS_STOLEN_256M: + stolen_size = MB(256); + break; + case INTEL_GMCH_GMS_STOLEN_96M: + stolen_size = MB(96); + break; + case INTEL_GMCH_GMS_STOLEN_160M: + stolen_size = MB(160); + break; + case INTEL_GMCH_GMS_STOLEN_224M: + stolen_size = MB(224); + break; + case INTEL_GMCH_GMS_STOLEN_352M: + stolen_size = MB(352); + break; + default: + stolen_size = 0; + break; + } + + return stolen_size; +} + +static size_t __init gen6_stolen_size(int num, int slot, int func) +{ + u16 gmch_ctrl; + + gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); + gmch_ctrl >>= SNB_GMCH_GMS_SHIFT; + gmch_ctrl &= SNB_GMCH_GMS_MASK; + + return gmch_ctrl << 25; /* 32 MB units */ +} + +typedef size_t (*stolen_size_fn)(int num, int slot, int func); + +static struct pci_device_id intel_stolen_ids[] __initdata = { + INTEL_I915G_IDS(gen3_stolen_size), + INTEL_I915GM_IDS(gen3_stolen_size), + INTEL_I945G_IDS(gen3_stolen_size), + INTEL_I945GM_IDS(gen3_stolen_size), + INTEL_VLV_M_IDS(gen3_stolen_size), + INTEL_VLV_D_IDS(gen3_stolen_size), + INTEL_PINEVIEW_IDS(gen3_stolen_size), + INTEL_I965G_IDS(gen3_stolen_size), + INTEL_G33_IDS(gen3_stolen_size), + INTEL_I965GM_IDS(gen3_stolen_size), + INTEL_GM45_IDS(gen3_stolen_size), + INTEL_G45_IDS(gen3_stolen_size), + INTEL_IRONLAKE_D_IDS(gen3_stolen_size), + INTEL_IRONLAKE_M_IDS(gen3_stolen_size), + INTEL_SNB_D_IDS(gen6_stolen_size), + INTEL_SNB_M_IDS(gen6_stolen_size), + INTEL_IVB_M_IDS(gen6_stolen_size), + INTEL_IVB_D_IDS(gen6_stolen_size), + INTEL_HSW_D_IDS(gen6_stolen_size), + INTEL_HSW_M_IDS(gen6_stolen_size), +}; + +static void __init intel_graphics_stolen(int num, int slot, int func) +{ + size_t size; + int i; + u32 start; + u16 device, subvendor, subdevice; + + device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); + subvendor = read_pci_config_16(num, slot, func, + PCI_SUBSYSTEM_VENDOR_ID); + subdevice = read_pci_config_16(num, slot, func, PCI_SUBSYSTEM_ID); + + for (i = 0; i < ARRAY_SIZE(intel_stolen_ids); i++) { + if (intel_stolen_ids[i].device == device) { + stolen_size_fn stolen_size = + (stolen_size_fn)intel_stolen_ids[i].driver_data; + size = stolen_size(num, slot, func); + start = intel_stolen_base(num, slot, func); + if (size && start) { + /* Mark this space as reserved */ + e820_add_region(start, size, E820_RESERVED); + sanitize_e820_map(e820.map, + ARRAY_SIZE(e820.map), + &e820.nr_map); + } + return; + } + } +} + #define QFLAG_APPLY_ONCE 0x1 #define QFLAG_APPLIED 0x2 #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) @@ -251,6 +403,8 @@ static struct chipset early_qrk[] __initdata = { PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST, PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, + { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, + QFLAG_APPLY_ONCE, intel_graphics_stolen }, {} }; diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index d15f575a861..01d1c187c9f 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -14,9 +14,11 @@ #include <xen/hvc-console.h> #include <asm/pci-direct.h> #include <asm/fixmap.h> -#include <asm/mrst.h> +#include <asm/intel-mid.h> #include <asm/pgtable.h> #include <linux/usb/ehci_def.h> +#include <linux/efi.h> +#include <asm/efi.h> /* Simple VGA output */ #define VGABASE (__ISA_IO_base + 0xb8000) @@ -234,6 +236,11 @@ static int __init setup_early_printk(char *buf) early_console_register(&early_hsu_console, keep); } #endif +#ifdef CONFIG_EARLY_PRINTK_EFI + if (!strncmp(buf, "efi", 3)) + early_console_register(&early_efi_console, keep); +#endif + buf++; } return 0; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 2cfbc3a3a2d..fd1bc1b15e6 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -362,12 +362,9 @@ END(ret_from_exception) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) - cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? - jnz restore_all need_resched: - movl TI_flags(%ebp), %ecx # need_resched set ? - testb $_TIF_NEED_RESCHED, %cl - jz restore_all + cmpl $0,PER_CPU_VAR(__preempt_count) + jnz restore_all testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? jz restore_all call preempt_schedule_irq @@ -1176,6 +1173,9 @@ ftrace_restore_flags: #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) + cmpl $__PAGE_OFFSET, %esp + jb ftrace_stub /* Paging not enabled yet? */ + cmpl $0, function_trace_stop jne ftrace_stub diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1b69951a81e..603be7c7067 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -487,21 +487,6 @@ ENDPROC(native_usergs_sysret64) TRACE_IRQS_OFF .endm -ENTRY(save_rest) - PARTIAL_FRAME 1 (REST_SKIP+8) - movq 5*8+16(%rsp), %r11 /* save return address */ - movq_cfi rbx, RBX+16 - movq_cfi rbp, RBP+16 - movq_cfi r12, R12+16 - movq_cfi r13, R13+16 - movq_cfi r14, R14+16 - movq_cfi r15, R15+16 - movq %r11, 8(%rsp) /* return address */ - FIXUP_TOP_OF_STACK %r11, 16 - ret - CFI_ENDPROC -END(save_rest) - /* save complete stack frame */ .pushsection .kprobes.text, "ax" ENTRY(save_paranoid) @@ -1118,10 +1103,8 @@ retint_signal: /* Returning to kernel space. Check if we need preemption */ /* rcx: threadinfo. interrupts off. */ ENTRY(retint_kernel) - cmpl $0,TI_preempt_count(%rcx) + cmpl $0,PER_CPU_VAR(__preempt_count) jnz retint_restore_args - bt $TIF_NEED_RESCHED,TI_flags(%rcx) - jnc retint_restore_args bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ jnc retint_restore_args call preempt_schedule_irq @@ -1357,7 +1340,7 @@ bad_gs: .previous /* Call softirq on interrupt stack. Interrupts are off. */ -ENTRY(call_softirq) +ENTRY(do_softirq_own_stack) CFI_STARTPROC pushq_cfi %rbp CFI_REL_OFFSET rbp,0 @@ -1374,7 +1357,7 @@ ENTRY(call_softirq) decl PER_CPU_VAR(irq_count) ret CFI_ENDPROC -END(call_softirq) +END(do_softirq_own_stack) #ifdef CONFIG_XEN zeroentry xen_hypervisor_callback xen_do_hypervisor_callback diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 138463a2487..c61a14a4a31 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -29,14 +29,14 @@ static void __init i386_default_early_setup(void) reserve_ebda_region(); } -void __init i386_start_kernel(void) +asmlinkage void __init i386_start_kernel(void) { sanitize_boot_params(&boot_params); /* Call the subarch specific early setup function */ switch (boot_params.hdr.hardware_subarch) { - case X86_SUBARCH_MRST: - x86_mrst_early_setup(); + case X86_SUBARCH_INTEL_MID: + x86_intel_mid_early_setup(); break; case X86_SUBARCH_CE4100: x86_ce4100_early_setup(); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 55b67614ed9..1be8e43b669 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -137,7 +137,7 @@ static void __init copy_bootdata(char *real_mode_data) } } -void __init x86_64_start_kernel(char * real_mode_data) +asmlinkage void __init x86_64_start_kernel(char * real_mode_data) { int i; diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 5dd87a89f01..81ba27679f1 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -409,6 +409,7 @@ enable_paging: /* * Check if it is 486 */ + movb $4,X86 # at least 486 cmpl $-1,X86_CPUID je is486 @@ -436,7 +437,6 @@ enable_paging: movl %edx,X86_CAPABILITY is486: - movb $4,X86 movl $0x50022,%ecx # set AM, WP, NE and MP movl %cr0,%eax andl $0x80000011,%eax # Save PG,PE,ET diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 0fa69127209..05fd74f537d 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -37,3 +37,10 @@ EXPORT_SYMBOL(strstr); EXPORT_SYMBOL(csum_partial); EXPORT_SYMBOL(empty_zero_page); + +#ifdef CONFIG_PREEMPT +EXPORT_SYMBOL(___preempt_schedule); +#ifdef CONFIG_CONTEXT_TRACKING +EXPORT_SYMBOL(___preempt_schedule_context); +#endif +#endif diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 9a5c460404d..2e977b5d61d 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -312,8 +312,7 @@ static void init_8259A(int auto_eoi) */ outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ - /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 on x86-64, - to 0x20-0x27 on i386 */ + /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3a8185c042a..22d0687e7fd 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -177,7 +177,7 @@ u64 arch_irq_stat(void) * SMP cross-CPU interrupts have their own specific * handlers). */ -unsigned int __irq_entry do_IRQ(struct pt_regs *regs) +__visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -215,7 +215,7 @@ void __smp_x86_platform_ipi(void) x86_platform_ipi_callback(); } -void smp_x86_platform_ipi(struct pt_regs *regs) +__visible void smp_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -229,7 +229,7 @@ void smp_x86_platform_ipi(struct pt_regs *regs) /* * Handler for POSTED_INTERRUPT_VECTOR. */ -void smp_kvm_posted_intr_ipi(struct pt_regs *regs) +__visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -247,7 +247,7 @@ void smp_kvm_posted_intr_ipi(struct pt_regs *regs) } #endif -void smp_trace_x86_platform_ipi(struct pt_regs *regs) +__visible void smp_trace_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 4186755f1d7..d7fcbedc9c4 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -100,9 +100,6 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) irqctx->tinfo.task = curctx->tinfo.task; irqctx->tinfo.previous_esp = current_stack_pointer; - /* Copy the preempt_count so that the [soft]irq checks work. */ - irqctx->tinfo.preempt_count = curctx->tinfo.preempt_count; - if (unlikely(overflow)) call_on_stack(print_stack_overflow, isp); @@ -131,7 +128,6 @@ void irq_ctx_init(int cpu) THREAD_SIZE_ORDER)); memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); irqctx->tinfo.cpu = cpu; - irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); per_cpu(hardirq_ctx, cpu) = irqctx; @@ -149,35 +145,21 @@ void irq_ctx_init(int cpu) cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); } -asmlinkage void do_softirq(void) +void do_softirq_own_stack(void) { - unsigned long flags; struct thread_info *curctx; union irq_ctx *irqctx; u32 *isp; - if (in_interrupt()) - return; - - local_irq_save(flags); - - if (local_softirq_pending()) { - curctx = current_thread_info(); - irqctx = __this_cpu_read(softirq_ctx); - irqctx->tinfo.task = curctx->task; - irqctx->tinfo.previous_esp = current_stack_pointer; - - /* build the stack frame on the softirq stack */ - isp = (u32 *) ((char *)irqctx + sizeof(*irqctx)); + curctx = current_thread_info(); + irqctx = __this_cpu_read(softirq_ctx); + irqctx->tinfo.task = curctx->task; + irqctx->tinfo.previous_esp = current_stack_pointer; - call_on_stack(__do_softirq, isp); - /* - * Shouldn't happen, we returned above if in_interrupt(): - */ - WARN_ON_ONCE(softirq_count()); - } + /* build the stack frame on the softirq stack */ + isp = (u32 *) ((char *)irqctx + sizeof(*irqctx)); - local_irq_restore(flags); + call_on_stack(__do_softirq, isp); } bool handle_irq(unsigned irq, struct pt_regs *regs) diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index d04d3ecded6..4d1c746892e 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -87,24 +87,3 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) generic_handle_irq_desc(irq, desc); return true; } - - -extern void call_softirq(void); - -asmlinkage void do_softirq(void) -{ - __u32 pending; - unsigned long flags; - - if (in_interrupt()) - return; - - local_irq_save(flags); - pending = local_softirq_pending(); - /* Switch to interrupt stack */ - if (pending) { - call_softirq(); - WARN_ON_ONCE(softirq_count()); - } - local_irq_restore(flags); -} diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 636a55e4a13..1de84e3ab4e 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -22,14 +22,14 @@ static inline void __smp_irq_work_interrupt(void) irq_work_run(); } -void smp_irq_work_interrupt(struct pt_regs *regs) +__visible void smp_irq_work_interrupt(struct pt_regs *regs) { irq_work_entering_irq(); __smp_irq_work_interrupt(); exiting_irq(); } -void smp_trace_irq_work_interrupt(struct pt_regs *regs) +__visible void smp_trace_irq_work_interrupt(struct pt_regs *regs) { irq_work_entering_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 2889b3d4388..ee11b7dfbfb 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -24,20 +24,71 @@ union jump_code_union { } __attribute__((packed)); }; +static void bug_at(unsigned char *ip, int line) +{ + /* + * The location is not an op that we were expecting. + * Something went wrong. Crash the box, as something could be + * corrupting the kernel. + */ + pr_warning("Unexpected op at %pS [%p] (%02x %02x %02x %02x %02x) %s:%d\n", + ip, ip, ip[0], ip[1], ip[2], ip[3], ip[4], __FILE__, line); + BUG(); +} + static void __jump_label_transform(struct jump_entry *entry, enum jump_label_type type, - void *(*poker)(void *, const void *, size_t)) + void *(*poker)(void *, const void *, size_t), + int init) { union jump_code_union code; + const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; if (type == JUMP_LABEL_ENABLE) { + /* + * We are enabling this jump label. If it is not a nop + * then something must have gone wrong. + */ + if (unlikely(memcmp((void *)entry->code, ideal_nop, 5) != 0)) + bug_at((void *)entry->code, __LINE__); + code.jump = 0xe9; code.offset = entry->target - (entry->code + JUMP_LABEL_NOP_SIZE); - } else + } else { + /* + * We are disabling this jump label. If it is not what + * we think it is, then something must have gone wrong. + * If this is the first initialization call, then we + * are converting the default nop to the ideal nop. + */ + if (init) { + const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; + if (unlikely(memcmp((void *)entry->code, default_nop, 5) != 0)) + bug_at((void *)entry->code, __LINE__); + } else { + code.jump = 0xe9; + code.offset = entry->target - + (entry->code + JUMP_LABEL_NOP_SIZE); + if (unlikely(memcmp((void *)entry->code, &code, 5) != 0)) + bug_at((void *)entry->code, __LINE__); + } memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE); + } - (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); + /* + * Make text_poke_bp() a default fallback poker. + * + * At the time the change is being done, just ignore whether we + * are doing nop -> jump or jump -> nop transition, and assume + * always nop being the 'currently valid' instruction + * + */ + if (poker) + (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); + else + text_poke_bp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE, + (void *)entry->code + JUMP_LABEL_NOP_SIZE); } void arch_jump_label_transform(struct jump_entry *entry, @@ -45,15 +96,38 @@ void arch_jump_label_transform(struct jump_entry *entry, { get_online_cpus(); mutex_lock(&text_mutex); - __jump_label_transform(entry, type, text_poke_smp); + __jump_label_transform(entry, type, NULL, 0); mutex_unlock(&text_mutex); put_online_cpus(); } +static enum { + JL_STATE_START, + JL_STATE_NO_UPDATE, + JL_STATE_UPDATE, +} jlstate __initdata_or_module = JL_STATE_START; + __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { - __jump_label_transform(entry, type, text_poke_early); + /* + * This function is called at boot up and when modules are + * first loaded. Check if the default nop, the one that is + * inserted at compile time, is the ideal nop. If it is, then + * we do not need to update the nop, and we can leave it as is. + * If it is not, then we need to update the nop to the ideal nop. + */ + if (jlstate == JL_STATE_START) { + const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; + const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; + + if (memcmp(ideal_nop, default_nop, 5) != 0) + jlstate = JL_STATE_UPDATE; + else + jlstate = JL_STATE_NO_UPDATE; + } + if (jlstate == JL_STATE_UPDATE) + __jump_label_transform(entry, type, text_poke_early, 1); } #endif diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index 2e9d4b5af03..c6ee63f927a 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -82,14 +82,9 @@ extern void synthesize_reljump(void *from, void *to); extern void synthesize_relcall(void *from, void *to); #ifdef CONFIG_OPTPROBES -extern int arch_init_optprobes(void); extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter); extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr); #else /* !CONFIG_OPTPROBES */ -static inline int arch_init_optprobes(void) -{ - return 0; -} static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) { return 0; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 211bce44552..79a3f968287 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -661,7 +661,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void) /* * Called from kretprobe_trampoline */ -static __used __kprobes void *trampoline_handler(struct pt_regs *regs) +__visible __used __kprobes void *trampoline_handler(struct pt_regs *regs) { struct kretprobe_instance *ri = NULL; struct hlist_head *head, empty_rp; @@ -1068,7 +1068,7 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) int __init arch_init_kprobes(void) { - return arch_init_optprobes(); + return 0; } int __kprobes arch_trampoline_kprobe(struct kprobe *p) diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 76dc6f09572..898160b42e4 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -88,9 +88,7 @@ static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long v *(unsigned long *)addr = val; } -static void __used __kprobes kprobes_optinsn_template_holder(void) -{ - asm volatile ( +asm ( ".global optprobe_template_entry\n" "optprobe_template_entry:\n" #ifdef CONFIG_X86_64 @@ -129,7 +127,6 @@ static void __used __kprobes kprobes_optinsn_template_holder(void) #endif ".global optprobe_template_end\n" "optprobe_template_end:\n"); -} #define TMPL_MOVE_IDX \ ((long)&optprobe_template_val - (long)&optprobe_template_entry) @@ -371,31 +368,6 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) return 0; } -#define MAX_OPTIMIZE_PROBES 256 -static struct text_poke_param *jump_poke_params; -static struct jump_poke_buffer { - u8 buf[RELATIVEJUMP_SIZE]; -} *jump_poke_bufs; - -static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm, - u8 *insn_buf, - struct optimized_kprobe *op) -{ - s32 rel = (s32)((long)op->optinsn.insn - - ((long)op->kp.addr + RELATIVEJUMP_SIZE)); - - /* Backup instructions which will be replaced by jump address */ - memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, - RELATIVE_ADDR_SIZE); - - insn_buf[0] = RELATIVEJUMP_OPCODE; - *(s32 *)(&insn_buf[1]) = rel; - - tprm->addr = op->kp.addr; - tprm->opcode = insn_buf; - tprm->len = RELATIVEJUMP_SIZE; -} - /* * Replace breakpoints (int3) with relative jumps. * Caller must call with locking kprobe_mutex and text_mutex. @@ -403,37 +375,38 @@ static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm, void __kprobes arch_optimize_kprobes(struct list_head *oplist) { struct optimized_kprobe *op, *tmp; - int c = 0; + u8 insn_buf[RELATIVEJUMP_SIZE]; list_for_each_entry_safe(op, tmp, oplist, list) { + s32 rel = (s32)((long)op->optinsn.insn - + ((long)op->kp.addr + RELATIVEJUMP_SIZE)); + WARN_ON(kprobe_disabled(&op->kp)); - /* Setup param */ - setup_optimize_kprobe(&jump_poke_params[c], - jump_poke_bufs[c].buf, op); + + /* Backup instructions which will be replaced by jump address */ + memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, + RELATIVE_ADDR_SIZE); + + insn_buf[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&insn_buf[1]) = rel; + + text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE, + op->optinsn.insn); + list_del_init(&op->list); - if (++c >= MAX_OPTIMIZE_PROBES) - break; } - - /* - * text_poke_smp doesn't support NMI/MCE code modifying. - * However, since kprobes itself also doesn't support NMI/MCE - * code probing, it's not a problem. - */ - text_poke_smp_batch(jump_poke_params, c); } -static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm, - u8 *insn_buf, - struct optimized_kprobe *op) +/* Replace a relative jump with a breakpoint (int3). */ +void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) { + u8 insn_buf[RELATIVEJUMP_SIZE]; + /* Set int3 to first byte for kprobes */ insn_buf[0] = BREAKPOINT_INSTRUCTION; memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); - - tprm->addr = op->kp.addr; - tprm->opcode = insn_buf; - tprm->len = RELATIVEJUMP_SIZE; + text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE, + op->optinsn.insn); } /* @@ -444,34 +417,11 @@ extern void arch_unoptimize_kprobes(struct list_head *oplist, struct list_head *done_list) { struct optimized_kprobe *op, *tmp; - int c = 0; list_for_each_entry_safe(op, tmp, oplist, list) { - /* Setup param */ - setup_unoptimize_kprobe(&jump_poke_params[c], - jump_poke_bufs[c].buf, op); + arch_unoptimize_kprobe(op); list_move(&op->list, done_list); - if (++c >= MAX_OPTIMIZE_PROBES) - break; } - - /* - * text_poke_smp doesn't support NMI/MCE code modifying. - * However, since kprobes itself also doesn't support NMI/MCE - * code probing, it's not a problem. - */ - text_poke_smp_batch(jump_poke_params, c); -} - -/* Replace a relative jump with a breakpoint (int3). */ -void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) -{ - u8 buf[RELATIVEJUMP_SIZE]; - - /* Set int3 to first byte for kprobes */ - buf[0] = BREAKPOINT_INSTRUCTION; - memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); - text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE); } int __kprobes @@ -491,22 +441,3 @@ setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) } return 0; } - -int __kprobes arch_init_optprobes(void) -{ - /* Allocate code buffer and parameter array */ - jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) * - MAX_OPTIMIZE_PROBES, GFP_KERNEL); - if (!jump_poke_bufs) - return -ENOMEM; - - jump_poke_params = kmalloc(sizeof(struct text_poke_param) * - MAX_OPTIMIZE_PROBES, GFP_KERNEL); - if (!jump_poke_params) { - kfree(jump_poke_bufs); - jump_poke_bufs = NULL; - return -ENOMEM; - } - - return 0; -} diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a96d32cc55b..b2046e4d0b5 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -34,6 +34,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/kprobes.h> +#include <linux/debugfs.h> #include <asm/timer.h> #include <asm/cpu.h> #include <asm/traps.h> @@ -419,6 +420,7 @@ static void __init kvm_smp_prepare_boot_cpu(void) WARN_ON(kvm_register_clock("primary cpu clock")); kvm_guest_cpu_init(); native_smp_prepare_boot_cpu(); + kvm_spinlock_init(); } static void kvm_guest_cpu_online(void *dummy) @@ -498,11 +500,9 @@ void __init kvm_guest_init(void) #endif } -static bool __init kvm_detect(void) +static uint32_t __init kvm_detect(void) { - if (!kvm_para_available()) - return false; - return true; + return kvm_cpuid_base(); } const struct hypervisor_x86 x86_hyper_kvm __refconst = { @@ -523,3 +523,274 @@ static __init int activate_jump_labels(void) return 0; } arch_initcall(activate_jump_labels); + +#ifdef CONFIG_PARAVIRT_SPINLOCKS + +/* Kick a cpu by its apicid. Used to wake up a halted vcpu */ +static void kvm_kick_cpu(int cpu) +{ + int apicid; + unsigned long flags = 0; + + apicid = per_cpu(x86_cpu_to_apicid, cpu); + kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); +} + +enum kvm_contention_stat { + TAKEN_SLOW, + TAKEN_SLOW_PICKUP, + RELEASED_SLOW, + RELEASED_SLOW_KICKED, + NR_CONTENTION_STATS +}; + +#ifdef CONFIG_KVM_DEBUG_FS +#define HISTO_BUCKETS 30 + +static struct kvm_spinlock_stats +{ + u32 contention_stats[NR_CONTENTION_STATS]; + u32 histo_spin_blocked[HISTO_BUCKETS+1]; + u64 time_blocked; +} spinlock_stats; + +static u8 zero_stats; + +static inline void check_zero(void) +{ + u8 ret; + u8 old; + + old = ACCESS_ONCE(zero_stats); + if (unlikely(old)) { + ret = cmpxchg(&zero_stats, old, 0); + /* This ensures only one fellow resets the stat */ + if (ret == old) + memset(&spinlock_stats, 0, sizeof(spinlock_stats)); + } +} + +static inline void add_stats(enum kvm_contention_stat var, u32 val) +{ + check_zero(); + spinlock_stats.contention_stats[var] += val; +} + + +static inline u64 spin_time_start(void) +{ + return sched_clock(); +} + +static void __spin_time_accum(u64 delta, u32 *array) +{ + unsigned index; + + index = ilog2(delta); + check_zero(); + + if (index < HISTO_BUCKETS) + array[index]++; + else + array[HISTO_BUCKETS]++; +} + +static inline void spin_time_accum_blocked(u64 start) +{ + u32 delta; + + delta = sched_clock() - start; + __spin_time_accum(delta, spinlock_stats.histo_spin_blocked); + spinlock_stats.time_blocked += delta; +} + +static struct dentry *d_spin_debug; +static struct dentry *d_kvm_debug; + +struct dentry *kvm_init_debugfs(void) +{ + d_kvm_debug = debugfs_create_dir("kvm-guest", NULL); + if (!d_kvm_debug) + printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n"); + + return d_kvm_debug; +} + +static int __init kvm_spinlock_debugfs(void) +{ + struct dentry *d_kvm; + + d_kvm = kvm_init_debugfs(); + if (d_kvm == NULL) + return -ENOMEM; + + d_spin_debug = debugfs_create_dir("spinlocks", d_kvm); + + debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); + + debugfs_create_u32("taken_slow", 0444, d_spin_debug, + &spinlock_stats.contention_stats[TAKEN_SLOW]); + debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, + &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]); + + debugfs_create_u32("released_slow", 0444, d_spin_debug, + &spinlock_stats.contention_stats[RELEASED_SLOW]); + debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, + &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]); + + debugfs_create_u64("time_blocked", 0444, d_spin_debug, + &spinlock_stats.time_blocked); + + debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, + spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); + + return 0; +} +fs_initcall(kvm_spinlock_debugfs); +#else /* !CONFIG_KVM_DEBUG_FS */ +static inline void add_stats(enum kvm_contention_stat var, u32 val) +{ +} + +static inline u64 spin_time_start(void) +{ + return 0; +} + +static inline void spin_time_accum_blocked(u64 start) +{ +} +#endif /* CONFIG_KVM_DEBUG_FS */ + +struct kvm_lock_waiting { + struct arch_spinlock *lock; + __ticket_t want; +}; + +/* cpus 'waiting' on a spinlock to become available */ +static cpumask_t waiting_cpus; + +/* Track spinlock on which a cpu is waiting */ +static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting); + +static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) +{ + struct kvm_lock_waiting *w; + int cpu; + u64 start; + unsigned long flags; + + if (in_nmi()) + return; + + w = &__get_cpu_var(klock_waiting); + cpu = smp_processor_id(); + start = spin_time_start(); + + /* + * Make sure an interrupt handler can't upset things in a + * partially setup state. + */ + local_irq_save(flags); + + /* + * The ordering protocol on this is that the "lock" pointer + * may only be set non-NULL if the "want" ticket is correct. + * If we're updating "want", we must first clear "lock". + */ + w->lock = NULL; + smp_wmb(); + w->want = want; + smp_wmb(); + w->lock = lock; + + add_stats(TAKEN_SLOW, 1); + + /* + * This uses set_bit, which is atomic but we should not rely on its + * reordering gurantees. So barrier is needed after this call. + */ + cpumask_set_cpu(cpu, &waiting_cpus); + + barrier(); + + /* + * Mark entry to slowpath before doing the pickup test to make + * sure we don't deadlock with an unlocker. + */ + __ticket_enter_slowpath(lock); + + /* + * check again make sure it didn't become free while + * we weren't looking. + */ + if (ACCESS_ONCE(lock->tickets.head) == want) { + add_stats(TAKEN_SLOW_PICKUP, 1); + goto out; + } + + /* + * halt until it's our turn and kicked. Note that we do safe halt + * for irq enabled case to avoid hang when lock info is overwritten + * in irq spinlock slowpath and no spurious interrupt occur to save us. + */ + if (arch_irqs_disabled_flags(flags)) + halt(); + else + safe_halt(); + +out: + cpumask_clear_cpu(cpu, &waiting_cpus); + w->lock = NULL; + local_irq_restore(flags); + spin_time_accum_blocked(start); +} +PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning); + +/* Kick vcpu waiting on @lock->head to reach value @ticket */ +static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) +{ + int cpu; + + add_stats(RELEASED_SLOW, 1); + for_each_cpu(cpu, &waiting_cpus) { + const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu); + if (ACCESS_ONCE(w->lock) == lock && + ACCESS_ONCE(w->want) == ticket) { + add_stats(RELEASED_SLOW_KICKED, 1); + kvm_kick_cpu(cpu); + break; + } + } +} + +/* + * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. + */ +void __init kvm_spinlock_init(void) +{ + if (!kvm_para_available()) + return; + /* Does host kernel support KVM_FEATURE_PV_UNHALT? */ + if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) + return; + + pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning); + pv_lock_ops.unlock_kick = kvm_unlock_kick; +} + +static __init int kvm_spinlock_init_jump(void) +{ + if (!kvm_para_available()) + return 0; + if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) + return 0; + + static_key_slow_inc(¶virt_ticketlocks_enabled); + printk(KERN_INFO "KVM setup paravirtual spinlock\n"); + + return 0; +} +early_initcall(kvm_spinlock_init_jump); + +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 7123b5df479..af99f71aeb7 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -216,6 +216,7 @@ int apply_microcode_amd(int cpu) /* need to apply patch? */ if (rev >= mc_amd->hdr.patch_id) { c->microcode = rev; + uci->cpu_sig.rev = rev; return 0; } diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 88458faea2f..05266b5aae2 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -46,7 +46,7 @@ static struct class *msr_class; static loff_t msr_seek(struct file *file, loff_t offset, int orig) { loff_t ret; - struct inode *inode = file->f_mapping->host; + struct inode *inode = file_inode(file); mutex_lock(&inode->i_mutex); switch (orig) { diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index ba77ebc2c35..6fcb49ce50a 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -113,10 +113,10 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2 u64 before, delta, whole_msecs; int remainder_ns, decimal_msecs, thishandled; - before = local_clock(); + before = sched_clock(); thishandled = a->handler(type, regs); handled += thishandled; - delta = local_clock() - before; + delta = sched_clock() - before; trace_nmi_handler(a->handler, (int)delta, thishandled); if (delta < nmi_longest_ns) diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 676b8c77a97..bbb6c731634 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -4,25 +4,17 @@ */ #include <linux/spinlock.h> #include <linux/module.h> +#include <linux/jump_label.h> #include <asm/paravirt.h> -static inline void -default_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) -{ - arch_spin_lock(lock); -} - struct pv_lock_ops pv_lock_ops = { #ifdef CONFIG_SMP - .spin_is_locked = __ticket_spin_is_locked, - .spin_is_contended = __ticket_spin_is_contended, - - .spin_lock = __ticket_spin_lock, - .spin_lock_flags = default_spin_lock_flags, - .spin_trylock = __ticket_spin_trylock, - .spin_unlock = __ticket_spin_unlock, + .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop), + .unlock_kick = paravirt_nop, #endif }; EXPORT_SYMBOL(pv_lock_ops); +struct static_key paravirt_ticketlocks_enabled = STATIC_KEY_INIT_FALSE; +EXPORT_SYMBOL(paravirt_ticketlocks_enabled); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index cd6de64cc48..1b10af835c3 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -62,11 +62,6 @@ void __init default_banner(void) pv_info.name); } -/* Simple instruction patching code. */ -#define DEF_NATIVE(ops, name, code) \ - extern const char start_##ops##_##name[], end_##ops##_##name[]; \ - asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") - /* Undefined instruction for dealing with missing ops pointers. */ static const unsigned char ud2a[] = { 0x0f, 0x0b }; @@ -324,7 +319,7 @@ struct pv_time_ops pv_time_ops = { .steal_clock = native_steal_clock, }; -struct pv_irq_ops pv_irq_ops = { +__visible struct pv_irq_ops pv_irq_ops = { .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), @@ -336,7 +331,7 @@ struct pv_irq_ops pv_irq_ops = { #endif }; -struct pv_cpu_ops pv_cpu_ops = { +__visible struct pv_cpu_ops pv_cpu_ops = { .cpuid = native_cpuid, .get_debugreg = native_get_debugreg, .set_debugreg = native_set_debugreg, diff --git a/arch/x86/kernel/preempt.S b/arch/x86/kernel/preempt.S new file mode 100644 index 00000000000..ca7f0d58a87 --- /dev/null +++ b/arch/x86/kernel/preempt.S @@ -0,0 +1,25 @@ + +#include <linux/linkage.h> +#include <asm/dwarf2.h> +#include <asm/asm.h> +#include <asm/calling.h> + +ENTRY(___preempt_schedule) + CFI_STARTPROC + SAVE_ALL + call preempt_schedule + RESTORE_ALL + ret + CFI_ENDPROC + +#ifdef CONFIG_CONTEXT_TRACKING + +ENTRY(___preempt_schedule_context) + CFI_STARTPROC + SAVE_ALL + call preempt_schedule_context + RESTORE_ALL + ret + CFI_ENDPROC + +#endif diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 83369e5a1d2..3fb8d95ab8b 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -36,7 +36,7 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; #ifdef CONFIG_X86_64 static DEFINE_PER_CPU(unsigned char, is_idle); @@ -391,9 +391,9 @@ static void amd_e400_idle(void) * The switch back from broadcast mode needs to be * called with interrupts disabled. */ - local_irq_disable(); - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); - local_irq_enable(); + local_irq_disable(); + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); + local_irq_enable(); } else default_idle(); } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f8adefca71d..c2ec1aa6d45 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -247,7 +247,7 @@ EXPORT_SYMBOL_GPL(start_thread); * the task-switch, and shows up in ret_from_fork in entry.S, * for example. */ -__notrace_funcgraph struct task_struct * +__visible __notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, @@ -292,6 +292,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) set_iopl_mask(next->iopl); /* + * If it were not for PREEMPT_ACTIVE we could guarantee that the + * preempt_count of all tasks was equal here and this would not be + * needed. + */ + task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); + this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); + + /* * Now maybe handle debug registers and/or IO bitmaps */ if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 05646bab4ca..45ab4d6fc8a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,7 +52,7 @@ asmlinkage extern void ret_from_fork(void); -DEFINE_PER_CPU(unsigned long, old_rsp); +asmlinkage DEFINE_PER_CPU(unsigned long, old_rsp); /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs *regs, int all) @@ -274,7 +274,7 @@ void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) * Kprobes not supported here. Set the probe on schedule instead. * Function graph tracer not supported too. */ -__notrace_funcgraph struct task_struct * +__visible __notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; @@ -363,6 +363,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(old_rsp, next->usersp); this_cpu_write(current_task, next_p); + /* + * If it were not for PREEMPT_ACTIVE we could guarantee that the + * preempt_count of all tasks was equal here and this would not be + * needed. + */ + task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); + this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); + this_cpu_write(kernel_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE - KERNEL_STACK_OFFSET); diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 2cb9470ea85..a16bae3f83b 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -128,46 +128,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); } -static struct pvclock_vsyscall_time_info *pvclock_vdso_info; - -static struct pvclock_vsyscall_time_info * -pvclock_get_vsyscall_user_time_info(int cpu) -{ - if (!pvclock_vdso_info) { - BUG(); - return NULL; - } - - return &pvclock_vdso_info[cpu]; -} - -struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) -{ - return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; -} - #ifdef CONFIG_X86_64 -static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, - void *v) -{ - struct task_migration_notifier *mn = v; - struct pvclock_vsyscall_time_info *pvti; - - pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); - - /* this is NULL when pvclock vsyscall is not initialized */ - if (unlikely(pvti == NULL)) - return NOTIFY_DONE; - - pvti->migrate_count++; - - return NOTIFY_DONE; -} - -static struct notifier_block pvclock_migrate = { - .notifier_call = pvclock_task_migrate, -}; - /* * Initialize the generic pvclock vsyscall state. This will allocate * a/some page(s) for the per-vcpu pvclock information, set up a @@ -181,17 +142,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); - pvclock_vdso_info = i; - for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, __pa(i) + (idx*PAGE_SIZE), PAGE_KERNEL_VVAR); } - - register_task_migration_notifier(&pvclock_migrate); - return 0; } #endif diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 563ed91e6fa..da3c599584a 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -61,7 +61,7 @@ static int __init set_bios_reboot(const struct dmi_system_id *d) if (reboot_type != BOOT_BIOS) { reboot_type = BOOT_BIOS; pr_info("%s series board detected. Selecting %s-method for reboots.\n", - "BIOS", d->ident); + d->ident, "BIOS"); } return 0; } @@ -117,7 +117,7 @@ static int __init set_pci_reboot(const struct dmi_system_id *d) if (reboot_type != BOOT_CF9) { reboot_type = BOOT_CF9; pr_info("%s series board detected. Selecting %s-method for reboots.\n", - "PCI", d->ident); + d->ident, "PCI"); } return 0; } @@ -127,7 +127,7 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d) if (reboot_type != BOOT_KBD) { reboot_type = BOOT_KBD; pr_info("%s series board detected. Selecting %s-method for reboot.\n", - "KBD", d->ident); + d->ident, "KBD"); } return 0; } @@ -136,228 +136,256 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d) * This is a single dmi_table handling all reboot quirks. */ static struct dmi_system_id __initdata reboot_dmi_table[] = { - { /* Handle problems with rebooting on Dell E520's */ - .callback = set_bios_reboot, - .ident = "Dell E520", + + /* Acer */ + { /* Handle reboot issue on Acer Aspire one */ + .callback = set_kbd_reboot, + .ident = "Acer Aspire One A110", .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"), + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"), }, }, - { /* Handle problems with rebooting on Dell 1300's */ - .callback = set_bios_reboot, - .ident = "Dell PowerEdge 1300", + + /* Apple */ + { /* Handle problems with rebooting on Apple MacBook5 */ + .callback = set_pci_reboot, + .ident = "Apple MacBook5", .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), - DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"), + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"), }, }, - { /* Handle problems with rebooting on Dell 300's */ - .callback = set_bios_reboot, - .ident = "Dell PowerEdge 300", + { /* Handle problems with rebooting on Apple MacBookPro5 */ + .callback = set_pci_reboot, + .ident = "Apple MacBookPro5", .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), - DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"), + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), }, }, - { /* Handle problems with rebooting on Dell Optiplex 745's SFF */ - .callback = set_bios_reboot, - .ident = "Dell OptiPlex 745", + { /* Handle problems with rebooting on Apple Macmini3,1 */ + .callback = set_pci_reboot, + .ident = "Apple Macmini3,1", .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"), }, }, - { /* Handle problems with rebooting on Dell Optiplex 745's DFF */ + { /* Handle problems with rebooting on the iMac9,1. */ + .callback = set_pci_reboot, + .ident = "Apple iMac9,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), + }, + }, + + /* ASUS */ + { /* Handle problems with rebooting on ASUS P4S800 */ .callback = set_bios_reboot, - .ident = "Dell OptiPlex 745", + .ident = "ASUS P4S800", .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), - DMI_MATCH(DMI_BOARD_NAME, "0MM599"), + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "P4S800"), }, }, - { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */ + + /* Dell */ + { /* Handle problems with rebooting on Dell DXP061 */ .callback = set_bios_reboot, - .ident = "Dell OptiPlex 745", + .ident = "Dell DXP061", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), - DMI_MATCH(DMI_BOARD_NAME, "0KW626"), + DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"), }, }, - { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */ + { /* Handle problems with rebooting on Dell E520's */ .callback = set_bios_reboot, - .ident = "Dell OptiPlex 330", + .ident = "Dell E520", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"), - DMI_MATCH(DMI_BOARD_NAME, "0KP561"), + DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"), }, }, - { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ - .callback = set_bios_reboot, - .ident = "Dell OptiPlex 360", + { /* Handle problems with rebooting on the Latitude E5410. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E5410", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"), - DMI_MATCH(DMI_BOARD_NAME, "0T656F"), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5410"), }, }, - { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */ - .callback = set_bios_reboot, - .ident = "Dell OptiPlex 760", + { /* Handle problems with rebooting on the Latitude E5420. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E5420", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"), - DMI_MATCH(DMI_BOARD_NAME, "0G919G"), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"), }, }, - { /* Handle problems with rebooting on Dell 2400's */ - .callback = set_bios_reboot, - .ident = "Dell PowerEdge 2400", + { /* Handle problems with rebooting on the Latitude E6320. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E6320", .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), - DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"), }, }, - { /* Handle problems with rebooting on Dell T5400's */ - .callback = set_bios_reboot, - .ident = "Dell Precision T5400", + { /* Handle problems with rebooting on the Latitude E6420. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E6420", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"), }, }, - { /* Handle problems with rebooting on Dell T7400's */ + { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */ .callback = set_bios_reboot, - .ident = "Dell Precision T7400", + .ident = "Dell OptiPlex 330", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T7400"), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"), + DMI_MATCH(DMI_BOARD_NAME, "0KP561"), }, }, - { /* Handle problems with rebooting on HP laptops */ + { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ .callback = set_bios_reboot, - .ident = "HP Compaq Laptop", + .ident = "Dell OptiPlex 360", .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), - DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"), + DMI_MATCH(DMI_BOARD_NAME, "0T656F"), }, }, - { /* Handle problems with rebooting on Dell XPS710 */ + { /* Handle problems with rebooting on Dell Optiplex 745's SFF */ .callback = set_bios_reboot, - .ident = "Dell XPS710", + .ident = "Dell OptiPlex 745", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), }, }, - { /* Handle problems with rebooting on Dell DXP061 */ + { /* Handle problems with rebooting on Dell Optiplex 745's DFF */ .callback = set_bios_reboot, - .ident = "Dell DXP061", + .ident = "Dell OptiPlex 745", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), + DMI_MATCH(DMI_BOARD_NAME, "0MM599"), }, }, - { /* Handle problems with rebooting on Sony VGN-Z540N */ + { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */ .callback = set_bios_reboot, - .ident = "Sony VGN-Z540N", + .ident = "Dell OptiPlex 745", .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"), - DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), + DMI_MATCH(DMI_BOARD_NAME, "0KW626"), }, }, - { /* Handle problems with rebooting on ASUS P4S800 */ + { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */ .callback = set_bios_reboot, - .ident = "ASUS P4S800", + .ident = "Dell OptiPlex 760", .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), - DMI_MATCH(DMI_BOARD_NAME, "P4S800"), + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"), + DMI_MATCH(DMI_BOARD_NAME, "0G919G"), }, }, - - { /* Handle reboot issue on Acer Aspire one */ - .callback = set_kbd_reboot, - .ident = "Acer Aspire One A110", + { /* Handle problems with rebooting on the OptiPlex 990. */ + .callback = set_pci_reboot, + .ident = "Dell OptiPlex 990", .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Acer"), - DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"), + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), }, }, - { /* Handle problems with rebooting on Apple MacBook5 */ - .callback = set_pci_reboot, - .ident = "Apple MacBook5", + { /* Handle problems with rebooting on Dell 300's */ + .callback = set_bios_reboot, + .ident = "Dell PowerEdge 300", .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"), + DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"), }, }, - { /* Handle problems with rebooting on Apple MacBookPro5 */ - .callback = set_pci_reboot, - .ident = "Apple MacBookPro5", + { /* Handle problems with rebooting on Dell 1300's */ + .callback = set_bios_reboot, + .ident = "Dell PowerEdge 1300", .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), + DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"), }, }, - { /* Handle problems with rebooting on Apple Macmini3,1 */ - .callback = set_pci_reboot, - .ident = "Apple Macmini3,1", + { /* Handle problems with rebooting on Dell 2400's */ + .callback = set_bios_reboot, + .ident = "Dell PowerEdge 2400", .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"), + DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), }, }, - { /* Handle problems with rebooting on the iMac9,1. */ + { /* Handle problems with rebooting on the Dell PowerEdge C6100. */ .callback = set_pci_reboot, - .ident = "Apple iMac9,1", + .ident = "Dell PowerEdge C6100", .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), + DMI_MATCH(DMI_SYS_VENDOR, "Dell"), + DMI_MATCH(DMI_PRODUCT_NAME, "C6100"), }, }, - { /* Handle problems with rebooting on the Latitude E6320. */ + { /* Handle problems with rebooting on the Precision M6600. */ .callback = set_pci_reboot, - .ident = "Dell Latitude E6320", + .ident = "Dell Precision M6600", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"), }, }, - { /* Handle problems with rebooting on the Latitude E5420. */ - .callback = set_pci_reboot, - .ident = "Dell Latitude E5420", + { /* Handle problems with rebooting on Dell T5400's */ + .callback = set_bios_reboot, + .ident = "Dell Precision T5400", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"), }, }, - { /* Handle problems with rebooting on the Latitude E6420. */ - .callback = set_pci_reboot, - .ident = "Dell Latitude E6420", + { /* Handle problems with rebooting on Dell T7400's */ + .callback = set_bios_reboot, + .ident = "Dell Precision T7400", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T7400"), }, }, - { /* Handle problems with rebooting on the OptiPlex 990. */ - .callback = set_pci_reboot, - .ident = "Dell OptiPlex 990", + { /* Handle problems with rebooting on Dell XPS710 */ + .callback = set_bios_reboot, + .ident = "Dell XPS710", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), + DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"), }, }, - { /* Handle problems with rebooting on the Precision M6600. */ - .callback = set_pci_reboot, - .ident = "Dell OptiPlex 990", + + /* Hewlett-Packard */ + { /* Handle problems with rebooting on HP laptops */ + .callback = set_bios_reboot, + .ident = "HP Compaq Laptop", .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"), + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), + }, + }, + + /* Sony */ + { /* Handle problems with rebooting on Sony VGN-Z540N */ + .callback = set_bios_reboot, + .ident = "Sony VGN-Z540N", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), }, }, + { } }; @@ -511,10 +539,13 @@ static void native_machine_emergency_restart(void) case BOOT_CF9_COND: if (port_cf9_safe) { - u8 cf9 = inb(0xcf9) & ~6; + u8 reboot_code = reboot_mode == REBOOT_WARM ? + 0x06 : 0x0E; + u8 cf9 = inb(0xcf9) & ~reboot_code; outb(cf9|2, 0xcf9); /* Request hard reset */ udelay(50); - outb(cf9|6, 0xcf9); /* Actually do the reset */ + /* Actually do the reset */ + outb(cf9|reboot_code, 0xcf9); udelay(50); } reboot_type = BOOT_KBD; @@ -526,6 +557,10 @@ static void native_machine_emergency_restart(void) void native_machine_shutdown(void) { /* Stop the cpus and apics */ +#ifdef CONFIG_X86_IO_APIC + disable_IO_APIC(); +#endif + #ifdef CONFIG_SMP /* * Stop all of the others. Also disable the local irq to @@ -538,10 +573,6 @@ void native_machine_shutdown(void) lapic_shutdown(); -#ifdef CONFIG_X86_IO_APIC - disable_IO_APIC(); -#endif - #ifdef CONFIG_HPET_TIMER hpet_disable(); #endif diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 0aa29394ed6..ca9622a25e9 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -12,7 +12,7 @@ #include <asm/vsyscall.h> #include <asm/x86_init.h> #include <asm/time.h> -#include <asm/mrst.h> +#include <asm/intel-mid.h> #include <asm/rtc.h> #ifdef CONFIG_X86_32 @@ -189,9 +189,17 @@ static __init int add_rtc_cmos(void) return 0; /* Intel MID platforms don't have ioport rtc */ - if (mrst_identify_cpu()) + if (intel_mid_identify_cpu()) return -ENODEV; +#ifdef CONFIG_ACPI + if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { + /* This warning can likely go away again in a year or two. */ + pr_info("ACPI: not registering RTC platform device\n"); + return -ENODEV; + } +#endif + platform_device_register(&rtc_device); dev_info(&rtc_device.dev, "registered platform RTC device (no PNP device found)\n"); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f8ec57815c0..918d489fa53 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -206,9 +206,9 @@ EXPORT_SYMBOL(boot_cpu_data); #if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) -unsigned long mmu_cr4_features; +__visible unsigned long mmu_cr4_features; #else -unsigned long mmu_cr4_features = X86_CR4_PAE; +__visible unsigned long mmu_cr4_features = X86_CR4_PAE; #endif /* Boot loader ID and version as integers, for the benefit of proc_dointvec */ @@ -426,25 +426,23 @@ static void __init reserve_initrd(void) static void __init parse_setup_data(void) { struct setup_data *data; - u64 pa_data; + u64 pa_data, pa_next; pa_data = boot_params.hdr.setup_data; while (pa_data) { - u32 data_len, map_len; + u32 data_len, map_len, data_type; map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK), (u64)sizeof(struct setup_data)); data = early_memremap(pa_data, map_len); data_len = data->len + sizeof(struct setup_data); - if (data_len > map_len) { - early_iounmap(data, map_len); - data = early_memremap(pa_data, data_len); - map_len = data_len; - } + data_type = data->type; + pa_next = data->next; + early_iounmap(data, map_len); - switch (data->type) { + switch (data_type) { case SETUP_E820_EXT: - parse_e820_ext(data); + parse_e820_ext(pa_data, data_len); break; case SETUP_DTB: add_dtb(pa_data); @@ -452,8 +450,7 @@ static void __init parse_setup_data(void) default: break; } - pa_data = data->next; - early_iounmap(data, map_len); + pa_data = pa_next; } } @@ -996,6 +993,7 @@ void __init setup_arch(char **cmdline_p) efi_init(); dmi_scan_machine(); + dmi_memdev_walk(); dmi_set_dump_stack_arch_desc(); /* @@ -1070,7 +1068,7 @@ void __init setup_arch(char **cmdline_p) cleanup_highmap(); - memblock.current_limit = ISA_END_ADDRESS; + memblock_set_current_limit(ISA_END_ADDRESS); memblock_x86_fill(); /* @@ -1103,7 +1101,7 @@ void __init setup_arch(char **cmdline_p) setup_real_mode(); - memblock.current_limit = get_max_mapped(); + memblock_set_current_limit(get_max_mapped()); dma_contiguous_reserve(0); /* diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index cf913587d4d..9e5de6813e1 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -358,7 +358,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, else put_user_ex(0, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); - err |= __save_altstack(&frame->uc.uc_stack, regs->sp); + save_altstack_ex(&frame->uc.uc_stack, regs->sp); /* Set up to return from userspace. */ restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); @@ -423,7 +423,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, else put_user_ex(0, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); - err |= __save_altstack(&frame->uc.uc_stack, regs->sp); + save_altstack_ex(&frame->uc.uc_stack, regs->sp); /* Set up to return from userspace. If provided, use a stub already in userspace. */ @@ -490,7 +490,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, else put_user_ex(0, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); - err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp); + compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); put_user_ex(0, &frame->uc.uc__pad0); if (ksig->ka.sa.sa_flags & SA_RESTORER) { @@ -533,7 +533,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, * Do a signal return; undo the signal stack. */ #ifdef CONFIG_X86_32 -unsigned long sys_sigreturn(void) +asmlinkage unsigned long sys_sigreturn(void) { struct pt_regs *regs = current_pt_regs(); struct sigframe __user *frame; @@ -562,7 +562,7 @@ badframe: } #endif /* CONFIG_X86_32 */ -long sys_rt_sigreturn(void) +asmlinkage long sys_rt_sigreturn(void) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; @@ -728,7 +728,7 @@ static void do_signal(struct pt_regs *regs) * notification of userspace execution resumption * - triggered by the TIF_WORK_MASK flags */ -void +__visible void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { user_exit(); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index cdaa347dfca..7c3a5a61f2e 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -256,7 +256,7 @@ static inline void __smp_reschedule_interrupt(void) scheduler_ipi(); } -void smp_reschedule_interrupt(struct pt_regs *regs) +__visible void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); __smp_reschedule_interrupt(); @@ -271,7 +271,7 @@ static inline void smp_entering_irq(void) irq_enter(); } -void smp_trace_reschedule_interrupt(struct pt_regs *regs) +__visible void smp_trace_reschedule_interrupt(struct pt_regs *regs) { /* * Need to call irq_enter() before calling the trace point. @@ -295,14 +295,14 @@ static inline void __smp_call_function_interrupt(void) inc_irq_stat(irq_call_count); } -void smp_call_function_interrupt(struct pt_regs *regs) +__visible void smp_call_function_interrupt(struct pt_regs *regs) { smp_entering_irq(); __smp_call_function_interrupt(); exiting_irq(); } -void smp_trace_call_function_interrupt(struct pt_regs *regs) +__visible void smp_trace_call_function_interrupt(struct pt_regs *regs) { smp_entering_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); @@ -317,14 +317,14 @@ static inline void __smp_call_function_single_interrupt(void) inc_irq_stat(irq_call_count); } -void smp_call_function_single_interrupt(struct pt_regs *regs) +__visible void smp_call_function_single_interrupt(struct pt_regs *regs) { smp_entering_irq(); __smp_call_function_single_interrupt(); exiting_irq(); } -void smp_trace_call_function_single_interrupt(struct pt_regs *regs) +__visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs) { smp_entering_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index aecc98a93d1..2a165580fa1 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -73,11 +73,10 @@ #include <asm/setup.h> #include <asm/uv/uv.h> #include <linux/mc146818rtc.h> - #include <asm/smpboot_hooks.h> #include <asm/i8259.h> - #include <asm/realmode.h> +#include <asm/misc.h> /* State of each CPU */ DEFINE_PER_CPU(int, cpu_state) = { 0 }; @@ -648,21 +647,46 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) return (send_status | accept_status); } +void smp_announce(void) +{ + int num_nodes = num_online_nodes(); + + printk(KERN_INFO "x86: Booted up %d node%s, %d CPUs\n", + num_nodes, (num_nodes > 1 ? "s" : ""), num_online_cpus()); +} + /* reduce the number of lines printed when booting a large cpu count system */ static void announce_cpu(int cpu, int apicid) { static int current_node = -1; int node = early_cpu_to_node(cpu); + static int width, node_width; + + if (!width) + width = num_digits(num_possible_cpus()) + 1; /* + '#' sign */ + + if (!node_width) + node_width = num_digits(num_possible_nodes()) + 1; /* + '#' */ + + if (cpu == 1) + printk(KERN_INFO "x86: Booting SMP configuration:\n"); if (system_state == SYSTEM_BOOTING) { if (node != current_node) { if (current_node > (-1)) - pr_cont(" OK\n"); + pr_cont("\n"); current_node = node; - pr_info("Booting Node %3d, Processors ", node); + + printk(KERN_INFO ".... node %*s#%d, CPUs: ", + node_width - num_digits(node), " ", node); } - pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " OK\n" : ""); - return; + + /* Add padding for the BSP */ + if (cpu == 1) + pr_cont("%*s", width + 1, " "); + + pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu); + } else pr_info("Booting Node %d Processor %d APIC 0x%x\n", node, cpu, apicid); diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c index 147fcd4941c..e9bcd57d8a9 100644 --- a/arch/x86/kernel/syscall_32.c +++ b/arch/x86/kernel/syscall_32.c @@ -15,7 +15,7 @@ typedef asmlinkage void (*sys_call_ptr_t)(void); extern asmlinkage void sys_ni_syscall(void); -const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { +__visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index 5c7f8c20da7..4ac730b37f0 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -4,6 +4,7 @@ #include <linux/sys.h> #include <linux/cache.h> #include <asm/asm-offsets.h> +#include <asm/syscall.h> #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) @@ -19,11 +20,9 @@ #define __SYSCALL_64(nr, sym, compat) [nr] = sym, -typedef void (*sys_call_ptr_t)(void); - extern void sys_ni_syscall(void); -const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { +asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. diff --git a/arch/x86/kernel/sysfb.c b/arch/x86/kernel/sysfb.c new file mode 100644 index 00000000000..193ec2ce46c --- /dev/null +++ b/arch/x86/kernel/sysfb.c @@ -0,0 +1,74 @@ +/* + * Generic System Framebuffers on x86 + * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +/* + * Simple-Framebuffer support for x86 systems + * Create a platform-device for any available boot framebuffer. The + * simple-framebuffer platform device is already available on DT systems, so + * this module parses the global "screen_info" object and creates a suitable + * platform device compatible with the "simple-framebuffer" DT object. If + * the framebuffer is incompatible, we instead create a legacy + * "vesa-framebuffer", "efi-framebuffer" or "platform-framebuffer" device and + * pass the screen_info as platform_data. This allows legacy drivers + * to pick these devices up without messing with simple-framebuffer drivers. + * The global "screen_info" is still valid at all times. + * + * If CONFIG_X86_SYSFB is not selected, we never register "simple-framebuffer" + * platform devices, but only use legacy framebuffer devices for + * backwards compatibility. + * + * TODO: We set the dev_id field of all platform-devices to 0. This allows + * other x86 OF/DT parsers to create such devices, too. However, they must + * start at offset 1 for this to work. + */ + +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/platform_data/simplefb.h> +#include <linux/platform_device.h> +#include <linux/screen_info.h> +#include <asm/sysfb.h> + +static __init int sysfb_init(void) +{ + struct screen_info *si = &screen_info; + struct simplefb_platform_data mode; + struct platform_device *pd; + const char *name; + bool compatible; + int ret; + + sysfb_apply_efi_quirks(); + + /* try to create a simple-framebuffer device */ + compatible = parse_mode(si, &mode); + if (compatible) { + ret = create_simplefb(si, &mode); + if (!ret) + return 0; + } + + /* if the FB is incompatible, create a legacy framebuffer device */ + if (si->orig_video_isVGA == VIDEO_TYPE_EFI) + name = "efi-framebuffer"; + else if (si->orig_video_isVGA == VIDEO_TYPE_VLFB) + name = "vesa-framebuffer"; + else + name = "platform-framebuffer"; + + pd = platform_device_register_resndata(NULL, name, 0, + NULL, 0, si, sizeof(*si)); + return IS_ERR(pd) ? PTR_ERR(pd) : 0; +} + +/* must execute after PCI subsystem for EFI quirks */ +device_initcall(sysfb_init); diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c new file mode 100644 index 00000000000..b285d4e8c68 --- /dev/null +++ b/arch/x86/kernel/sysfb_efi.c @@ -0,0 +1,214 @@ +/* + * Generic System Framebuffers on x86 + * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com> + * + * EFI Quirks Copyright (c) 2006 Edgar Hucek <gimli@dark-green.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +/* + * EFI Quirks + * Several EFI systems do not correctly advertise their boot framebuffers. + * Hence, we use this static table of known broken machines and fix up the + * information so framebuffer drivers can load corectly. + */ + +#include <linux/dmi.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/pci.h> +#include <linux/screen_info.h> +#include <video/vga.h> +#include <asm/sysfb.h> + +enum { + OVERRIDE_NONE = 0x0, + OVERRIDE_BASE = 0x1, + OVERRIDE_STRIDE = 0x2, + OVERRIDE_HEIGHT = 0x4, + OVERRIDE_WIDTH = 0x8, +}; + +struct efifb_dmi_info efifb_dmi_list[] = { + [M_I17] = { "i17", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE }, + [M_I20] = { "i20", 0x80010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE }, /* guess */ + [M_I20_SR] = { "imac7", 0x40010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE }, + [M_I24] = { "i24", 0x80010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, /* guess */ + [M_I24_8_1] = { "imac8", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, + [M_I24_10_1] = { "imac10", 0xc0010000, 2048 * 4, 1920, 1080, OVERRIDE_NONE }, + [M_I27_11_1] = { "imac11", 0xc0010000, 2560 * 4, 2560, 1440, OVERRIDE_NONE }, + [M_MINI]= { "mini", 0x80000000, 2048 * 4, 1024, 768, OVERRIDE_NONE }, + [M_MINI_3_1] = { "mini31", 0x40010000, 1024 * 4, 1024, 768, OVERRIDE_NONE }, + [M_MINI_4_1] = { "mini41", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, + [M_MB] = { "macbook", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + [M_MB_5_1] = { "macbook51", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + [M_MB_6_1] = { "macbook61", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + [M_MB_7_1] = { "macbook71", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + [M_MBA] = { "mba", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + /* 11" Macbook Air 3,1 passes the wrong stride */ + [M_MBA_3] = { "mba3", 0, 2048 * 4, 0, 0, OVERRIDE_STRIDE }, + [M_MBP] = { "mbp", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE }, + [M_MBP_2] = { "mbp2", 0, 0, 0, 0, OVERRIDE_NONE }, /* placeholder */ + [M_MBP_2_2] = { "mbp22", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE }, + [M_MBP_SR] = { "mbp3", 0x80030000, 2048 * 4, 1440, 900, OVERRIDE_NONE }, + [M_MBP_4] = { "mbp4", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, + [M_MBP_5_1] = { "mbp51", 0xc0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE }, + [M_MBP_5_2] = { "mbp52", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, + [M_MBP_5_3] = { "mbp53", 0xd0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE }, + [M_MBP_6_1] = { "mbp61", 0x90030000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, + [M_MBP_6_2] = { "mbp62", 0x90030000, 2048 * 4, 1680, 1050, OVERRIDE_NONE }, + [M_MBP_7_1] = { "mbp71", 0xc0010000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + [M_MBP_8_2] = { "mbp82", 0x90010000, 1472 * 4, 1440, 900, OVERRIDE_NONE }, + [M_UNKNOWN] = { NULL, 0, 0, 0, 0, OVERRIDE_NONE } +}; + +#define choose_value(dmivalue, fwvalue, field, flags) ({ \ + typeof(fwvalue) _ret_ = fwvalue; \ + if ((flags) & (field)) \ + _ret_ = dmivalue; \ + else if ((fwvalue) == 0) \ + _ret_ = dmivalue; \ + _ret_; \ + }) + +static int __init efifb_set_system(const struct dmi_system_id *id) +{ + struct efifb_dmi_info *info = id->driver_data; + + if (info->base == 0 && info->height == 0 && info->width == 0 && + info->stride == 0) + return 0; + + /* Trust the bootloader over the DMI tables */ + if (screen_info.lfb_base == 0) { +#if defined(CONFIG_PCI) + struct pci_dev *dev = NULL; + int found_bar = 0; +#endif + if (info->base) { + screen_info.lfb_base = choose_value(info->base, + screen_info.lfb_base, OVERRIDE_BASE, + info->flags); + +#if defined(CONFIG_PCI) + /* make sure that the address in the table is actually + * on a VGA device's PCI BAR */ + + for_each_pci_dev(dev) { + int i; + if ((dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) + continue; + for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { + resource_size_t start, end; + + start = pci_resource_start(dev, i); + if (start == 0) + break; + end = pci_resource_end(dev, i); + if (screen_info.lfb_base >= start && + screen_info.lfb_base < end) { + found_bar = 1; + } + } + } + if (!found_bar) + screen_info.lfb_base = 0; +#endif + } + } + if (screen_info.lfb_base) { + screen_info.lfb_linelength = choose_value(info->stride, + screen_info.lfb_linelength, OVERRIDE_STRIDE, + info->flags); + screen_info.lfb_width = choose_value(info->width, + screen_info.lfb_width, OVERRIDE_WIDTH, + info->flags); + screen_info.lfb_height = choose_value(info->height, + screen_info.lfb_height, OVERRIDE_HEIGHT, + info->flags); + if (screen_info.orig_video_isVGA == 0) + screen_info.orig_video_isVGA = VIDEO_TYPE_EFI; + } else { + screen_info.lfb_linelength = 0; + screen_info.lfb_width = 0; + screen_info.lfb_height = 0; + screen_info.orig_video_isVGA = 0; + return 0; + } + + printk(KERN_INFO "efifb: dmi detected %s - framebuffer at 0x%08x " + "(%dx%d, stride %d)\n", id->ident, + screen_info.lfb_base, screen_info.lfb_width, + screen_info.lfb_height, screen_info.lfb_linelength); + + return 1; +} + +#define EFIFB_DMI_SYSTEM_ID(vendor, name, enumid) \ + { \ + efifb_set_system, \ + name, \ + { \ + DMI_MATCH(DMI_BIOS_VENDOR, vendor), \ + DMI_MATCH(DMI_PRODUCT_NAME, name) \ + }, \ + &efifb_dmi_list[enumid] \ + } + +static const struct dmi_system_id efifb_dmi_system_table[] __initconst = { + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac4,1", M_I17), + /* At least one of these two will be right; maybe both? */ + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac5,1", M_I20), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac5,1", M_I20), + /* At least one of these two will be right; maybe both? */ + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac6,1", M_I24), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac6,1", M_I24), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac7,1", M_I20_SR), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac8,1", M_I24_8_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac10,1", M_I24_10_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac11,1", M_I27_11_1), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "Macmini1,1", M_MINI), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini3,1", M_MINI_3_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini4,1", M_MINI_4_1), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook1,1", M_MB), + /* At least one of these two will be right; maybe both? */ + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook2,1", M_MB), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook2,1", M_MB), + /* At least one of these two will be right; maybe both? */ + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook3,1", M_MB), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook3,1", M_MB), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook4,1", M_MB), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook5,1", M_MB_5_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook6,1", M_MB_6_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook7,1", M_MB_7_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir1,1", M_MBA), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir3,1", M_MBA_3), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro1,1", M_MBP), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,1", M_MBP_2), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,2", M_MBP_2_2), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro2,1", M_MBP_2), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro3,1", M_MBP_SR), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro3,1", M_MBP_SR), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro4,1", M_MBP_4), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,1", M_MBP_5_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,2", M_MBP_5_2), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,3", M_MBP_5_3), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,1", M_MBP_6_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,2", M_MBP_6_2), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro7,1", M_MBP_7_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro8,2", M_MBP_8_2), + {}, +}; + +__init void sysfb_apply_efi_quirks(void) +{ + if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI || + !(screen_info.capabilities & VIDEO_CAPABILITY_SKIP_QUIRKS)) + dmi_check_system(efifb_dmi_system_table); +} diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c new file mode 100644 index 00000000000..86179d40989 --- /dev/null +++ b/arch/x86/kernel/sysfb_simplefb.c @@ -0,0 +1,95 @@ +/* + * Generic System Framebuffers on x86 + * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +/* + * simple-framebuffer probing + * Try to convert "screen_info" into a "simple-framebuffer" compatible mode. + * If the mode is incompatible, we return "false" and let the caller create + * legacy nodes instead. + */ + +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/platform_data/simplefb.h> +#include <linux/platform_device.h> +#include <linux/screen_info.h> +#include <asm/sysfb.h> + +static const char simplefb_resname[] = "BOOTFB"; +static const struct simplefb_format formats[] = SIMPLEFB_FORMATS; + +/* try parsing x86 screen_info into a simple-framebuffer mode struct */ +__init bool parse_mode(const struct screen_info *si, + struct simplefb_platform_data *mode) +{ + const struct simplefb_format *f; + __u8 type; + unsigned int i; + + type = si->orig_video_isVGA; + if (type != VIDEO_TYPE_VLFB && type != VIDEO_TYPE_EFI) + return false; + + for (i = 0; i < ARRAY_SIZE(formats); ++i) { + f = &formats[i]; + if (si->lfb_depth == f->bits_per_pixel && + si->red_size == f->red.length && + si->red_pos == f->red.offset && + si->green_size == f->green.length && + si->green_pos == f->green.offset && + si->blue_size == f->blue.length && + si->blue_pos == f->blue.offset && + si->rsvd_size == f->transp.length && + si->rsvd_pos == f->transp.offset) { + mode->format = f->name; + mode->width = si->lfb_width; + mode->height = si->lfb_height; + mode->stride = si->lfb_linelength; + return true; + } + } + + return false; +} + +__init int create_simplefb(const struct screen_info *si, + const struct simplefb_platform_data *mode) +{ + struct platform_device *pd; + struct resource res; + unsigned long len; + + /* don't use lfb_size as it may contain the whole VMEM instead of only + * the part that is occupied by the framebuffer */ + len = mode->height * mode->stride; + len = PAGE_ALIGN(len); + if (len > (u64)si->lfb_size << 16) { + printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n"); + return -EINVAL; + } + + /* setup IORESOURCE_MEM as framebuffer memory */ + memset(&res, 0, sizeof(res)); + res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res.name = simplefb_resname; + res.start = si->lfb_base; + res.end = si->lfb_base + len - 1; + if (res.end <= res.start) + return -EINVAL; + + pd = platform_device_register_resndata(NULL, "simple-framebuffer", 0, + &res, 1, mode, sizeof(*mode)); + if (IS_ERR(pd)) + return PTR_ERR(pd); + + return 0; +} diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index addf7b58f4e..91a4496db43 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -301,6 +301,15 @@ static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) return 0; } +static int tboot_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b) +{ + if (!tboot_enabled()) + return 0; + + pr_warning("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)"); + return -ENODEV; +} + static atomic_t ap_wfs_count; static int tboot_wait_for_aps(int num_aps) @@ -422,6 +431,7 @@ static __init int tboot_late_init(void) #endif acpi_os_set_prepare_sleep(&tboot_sleep); + acpi_os_set_prepare_extended_sleep(&tboot_extended_sleep); return 0; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1b23a1c9274..729aa779ff7 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -58,6 +58,7 @@ #include <asm/mce.h> #include <asm/fixmap.h> #include <asm/mach_traps.h> +#include <asm/alternative.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> @@ -87,7 +88,7 @@ static inline void conditional_sti(struct pt_regs *regs) static inline void preempt_conditional_sti(struct pt_regs *regs) { - inc_preempt_count(); + preempt_count_inc(); if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } @@ -102,7 +103,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); - dec_preempt_count(); + preempt_count_dec(); } static int __kprobes @@ -327,6 +328,9 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co ftrace_int3_handler(regs)) return; #endif + if (poke_int3_handler(regs)) + return; + prev_state = exception_enter(); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 6ff49247edf..930e5d48f56 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -89,6 +89,12 @@ int check_tsc_unstable(void) } EXPORT_SYMBOL_GPL(check_tsc_unstable); +int check_tsc_disabled(void) +{ + return tsc_disabled; +} +EXPORT_SYMBOL_GPL(check_tsc_disabled); + #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 10c4f3006af..da6b35a9826 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -199,6 +199,15 @@ SECTIONS __x86_cpu_dev_end = .; } +#ifdef CONFIG_X86_INTEL_MID + .x86_intel_mid_dev.init : AT(ADDR(.x86_intel_mid_dev.init) - \ + LOAD_OFFSET) { + __x86_intel_mid_dev_start = .; + *(.x86_intel_mid_dev.init) + __x86_intel_mid_dev_end = .; + } +#endif + /* * start address and size of operations which during runtime * can be patched with virtualization friendly instructions or diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index b014d9414d0..040681928e9 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -66,3 +66,10 @@ EXPORT_SYMBOL(empty_zero_page); #ifndef CONFIG_PARAVIRT EXPORT_SYMBOL(native_load_gs_index); #endif + +#ifdef CONFIG_PREEMPT +EXPORT_SYMBOL(___preempt_schedule); +#ifdef CONFIG_CONTEXT_TRACKING +EXPORT_SYMBOL(___preempt_schedule_context); +#endif +#endif diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 5f24c71acca..8ce0072cd70 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -107,6 +107,8 @@ struct x86_platform_ops x86_platform = { }; EXPORT_SYMBOL_GPL(x86_platform); + +#if defined(CONFIG_PCI_MSI) struct x86_msi_ops x86_msi = { .setup_msi_irqs = native_setup_msi_irqs, .compose_msi_msg = native_compose_msi_msg, @@ -116,6 +118,28 @@ struct x86_msi_ops x86_msi = { .setup_hpet_msi = default_setup_hpet_msi, }; +/* MSI arch specific hooks */ +int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + return x86_msi.setup_msi_irqs(dev, nvec, type); +} + +void arch_teardown_msi_irqs(struct pci_dev *dev) +{ + x86_msi.teardown_msi_irqs(dev); +} + +void arch_teardown_msi_irq(unsigned int irq) +{ + x86_msi.teardown_msi_irq(irq); +} + +void arch_restore_msi_irqs(struct pci_dev *dev, int irq) +{ + x86_msi.restore_msi_irqs(dev, irq); +} +#endif + struct x86_io_apic_ops x86_io_apic_ops = { .init = native_io_apic_init_mappings, .read = native_io_apic_read, diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a20ecb5b6cb..b110fe6c03d 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -413,7 +413,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_CLOCKSOURCE2) | (1 << KVM_FEATURE_ASYNC_PF) | (1 << KVM_FEATURE_PV_EOI) | - (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); + (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | + (1 << KVM_FEATURE_PV_UNHALT); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2bc1e81045b..ddc3f3d2afd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2025,6 +2025,17 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) return rc; } +static int em_ret_far_imm(struct x86_emulate_ctxt *ctxt) +{ + int rc; + + rc = em_ret_far(ctxt); + if (rc != X86EMUL_CONTINUE) + return rc; + rsp_increment(ctxt, ctxt->src.val); + return X86EMUL_CONTINUE; +} + static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) { /* Save real source value, then compare EAX against destination. */ @@ -3763,7 +3774,8 @@ static const struct opcode opcode_table[256] = { G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave), - N, I(ImplicitOps | Stack, em_ret_far), + I(ImplicitOps | Stack | SrcImmU16, em_ret_far_imm), + I(ImplicitOps | Stack, em_ret_far), D(ImplicitOps), DI(SrcImmByte, intn), D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), /* 0xD0 - 0xD7 */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index afc11245827..5439117d5c4 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -79,16 +79,6 @@ static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) *((u32 *) (apic->regs + reg_off)) = val; } -static inline int apic_test_and_set_vector(int vec, void *bitmap) -{ - return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - -static inline int apic_test_and_clear_vector(int vec, void *bitmap) -{ - return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - static inline int apic_test_vector(int vec, void *bitmap) { return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); @@ -331,10 +321,10 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); -static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) +static inline void apic_set_irr(int vec, struct kvm_lapic *apic) { apic->irr_pending = true; - return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); + apic_set_vector(vec, apic->regs + APIC_IRR); } static inline int apic_search_irr(struct kvm_lapic *apic) @@ -681,32 +671,28 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (unlikely(!apic_enabled(apic))) break; + result = 1; + if (dest_map) __set_bit(vcpu->vcpu_id, dest_map); - if (kvm_x86_ops->deliver_posted_interrupt) { - result = 1; + if (kvm_x86_ops->deliver_posted_interrupt) kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); - } else { - result = !apic_test_and_set_irr(vector, apic); - - if (!result) { - if (trig_mode) - apic_debug("level trig mode repeatedly " - "for vector %d", vector); - goto out; - } + else { + apic_set_irr(vector, apic); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } -out: trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, - trig_mode, vector, !result); + trig_mode, vector, false); break; case APIC_DM_REMRD: - apic_debug("Ignoring delivery mode 3\n"); + result = 1; + vcpu->arch.pv.pv_unhalted = 1; + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); break; case APIC_DM_SMI: diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9e9285ae9b9..dce0df8150d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -132,8 +132,8 @@ module_param(dbg, bool, 0644); (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ * PT32_LEVEL_BITS))) - 1)) -#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ - | PT64_NX_MASK) +#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ + | shadow_x_mask | shadow_nx_mask) #define ACC_EXEC_MASK 1 #define ACC_WRITE_MASK PT_WRITABLE_MASK @@ -331,11 +331,6 @@ static int is_large_pte(u64 pte) return pte & PT_PAGE_SIZE_MASK; } -static int is_dirty_gpte(unsigned long pte) -{ - return pte & PT_DIRTY_MASK; -} - static int is_rmap_spte(u64 pte) { return is_shadow_present_pte(pte); @@ -2052,12 +2047,18 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) return __shadow_walk_next(iterator, *iterator->sptep); } -static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) +static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed) { u64 spte; + BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK || + VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); + spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask | shadow_accessed_mask; + shadow_user_mask | shadow_x_mask; + + if (accessed) + spte |= shadow_accessed_mask; mmu_spte_set(sptep, spte); } @@ -2574,14 +2575,6 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) mmu_free_roots(vcpu); } -static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) -{ - int bit7; - - bit7 = (gpte >> 7) & 1; - return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; -} - static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { @@ -2594,26 +2587,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, return gfn_to_pfn_memslot_atomic(slot, gfn); } -static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *spte, - u64 gpte) -{ - if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) - goto no_present; - - if (!is_present_gpte(gpte)) - goto no_present; - - if (!(gpte & PT_ACCESSED_MASK)) - goto no_present; - - return false; - -no_present: - drop_spte(vcpu->kvm, spte); - return true; -} - static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *start, u64 *end) @@ -2710,7 +2683,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, iterator.level - 1, 1, ACC_ALL, iterator.sptep); - link_shadow_page(iterator.sptep, sp); + link_shadow_page(iterator.sptep, sp, true); } } return emulate; @@ -2808,7 +2781,7 @@ exit: return ret; } -static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code) +static bool page_fault_can_be_fast(u32 error_code) { /* * Do not fix the mmio spte with invalid generation number which @@ -2861,7 +2834,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, bool ret = false; u64 spte = 0ull; - if (!page_fault_can_be_fast(vcpu, error_code)) + if (!page_fault_can_be_fast(error_code)) return false; walk_shadow_page_lockless_begin(vcpu); @@ -3209,6 +3182,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) mmu_sync_roots(vcpu); spin_unlock(&vcpu->kvm->mmu_lock); } +EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, struct x86_exception *exception) @@ -3478,6 +3452,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) ++vcpu->stat.tlb_flush; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } +EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb); static void paging_new_cr3(struct kvm_vcpu *vcpu) { @@ -3501,18 +3476,6 @@ static void paging_free(struct kvm_vcpu *vcpu) nonpaging_free(vcpu); } -static inline void protect_clean_gpte(unsigned *access, unsigned gpte) -{ - unsigned mask; - - BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); - - mask = (unsigned)~ACC_WRITE_MASK; - /* Allow write access to dirty gptes */ - mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK; - *access &= mask; -} - static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, unsigned access, int *nr_present) { @@ -3530,16 +3493,6 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, return false; } -static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte) -{ - unsigned access; - - access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; - access &= ~(gpte >> PT64_NX_SHIFT); - - return access; -} - static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) { unsigned index; @@ -3549,6 +3502,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp return mmu->last_pte_bitmap & (1 << index); } +#define PTTYPE_EPT 18 /* arbitrary */ +#define PTTYPE PTTYPE_EPT +#include "paging_tmpl.h" +#undef PTTYPE + #define PTTYPE 64 #include "paging_tmpl.h" #undef PTTYPE @@ -3563,6 +3521,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; + context->bad_mt_xwr = 0; + if (!context->nx) exb_bit_rsvd = rsvd_bits(63, 63); switch (context->root_level) { @@ -3618,7 +3578,40 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, } } -static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, bool execonly) +{ + int maxphyaddr = cpuid_maxphyaddr(vcpu); + int pte; + + context->rsvd_bits_mask[0][3] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); + context->rsvd_bits_mask[0][2] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); + context->rsvd_bits_mask[0][1] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); + context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); + + /* large page */ + context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; + context->rsvd_bits_mask[1][2] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); + context->rsvd_bits_mask[1][1] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + + for (pte = 0; pte < 64; pte++) { + int rwx_bits = pte & 7; + int mt = pte >> 3; + if (mt == 0x2 || mt == 0x3 || mt == 0x7 || + rwx_bits == 0x2 || rwx_bits == 0x6 || + (rwx_bits == 0x4 && !execonly)) + context->bad_mt_xwr |= (1ull << pte); + } +} + +static void update_permission_bitmask(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu, bool ept) { unsigned bit, byte, pfec; u8 map; @@ -3636,12 +3629,16 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu w = bit & ACC_WRITE_MASK; u = bit & ACC_USER_MASK; - /* Not really needed: !nx will cause pte.nx to fault */ - x |= !mmu->nx; - /* Allow supervisor writes if !cr0.wp */ - w |= !is_write_protection(vcpu) && !uf; - /* Disallow supervisor fetches of user code if cr4.smep */ - x &= !(smep && u && !uf); + if (!ept) { + /* Not really needed: !nx will cause pte.nx to fault */ + x |= !mmu->nx; + /* Allow supervisor writes if !cr0.wp */ + w |= !is_write_protection(vcpu) && !uf; + /* Disallow supervisor fetches of user code if cr4.smep */ + x &= !(smep && u && !uf); + } else + /* Not really needed: no U/S accesses on ept */ + u = 1; fault = (ff && !x) || (uf && !u) || (wf && !w); map |= fault << bit; @@ -3676,7 +3673,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, context->root_level = level; reset_rsvds_bits_mask(vcpu, context); - update_permission_bitmask(vcpu, context); + update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); ASSERT(is_pae(vcpu)); @@ -3706,7 +3703,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, context->root_level = PT32_ROOT_LEVEL; reset_rsvds_bits_mask(vcpu, context); - update_permission_bitmask(vcpu, context); + update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); context->new_cr3 = paging_new_cr3; @@ -3768,7 +3765,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->gva_to_gpa = paging32_gva_to_gpa; } - update_permission_bitmask(vcpu, context); + update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); return 0; @@ -3800,6 +3797,33 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); +int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, + bool execonly) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + context->shadow_root_level = kvm_x86_ops->get_tdp_level(); + + context->nx = true; + context->new_cr3 = paging_new_cr3; + context->page_fault = ept_page_fault; + context->gva_to_gpa = ept_gva_to_gpa; + context->sync_page = ept_sync_page; + context->invlpg = ept_invlpg; + context->update_pte = ept_update_pte; + context->free = paging_free; + context->root_level = context->shadow_root_level; + context->root_hpa = INVALID_PAGE; + context->direct_map = false; + + update_permission_bitmask(vcpu, context, true); + reset_rsvds_bits_mask_ept(vcpu, context, execonly); + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); + static int init_kvm_softmmu(struct kvm_vcpu *vcpu) { int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); @@ -3847,7 +3871,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->gva_to_gpa = paging32_gva_to_gpa_nested; } - update_permission_bitmask(vcpu, g_context); + update_permission_bitmask(vcpu, g_context, false); update_last_pte_bitmap(vcpu, g_context); return 0; @@ -3923,8 +3947,8 @@ static bool need_remote_flush(u64 old, u64 new) return true; if ((old ^ new) & PT64_BASE_ADDR_MASK) return true; - old ^= PT64_NX_MASK; - new ^= PT64_NX_MASK; + old ^= shadow_nx_mask; + new ^= shadow_nx_mask; return (old & ~new & PT64_PERM_MASK) != 0; } @@ -4182,7 +4206,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, switch (er) { case EMULATE_DONE: return 1; - case EMULATE_DO_MMIO: + case EMULATE_USER_EXIT: ++vcpu->stat.mmio_exits; /* fall through */ case EMULATE_FAIL: @@ -4390,23 +4414,19 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) /* * The very rare case: if the generation-number is round, * zap all shadow pages. - * - * The max value is MMIO_MAX_GEN - 1 since it is not called - * when mark memslot invalid. */ - if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) { + if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) { printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } } -static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) +static unsigned long +mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { struct kvm *kvm; int nr_to_scan = sc->nr_to_scan; - - if (nr_to_scan == 0) - goto out; + unsigned long freed = 0; raw_spin_lock(&kvm_lock); @@ -4441,25 +4461,37 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) goto unlock; } - prepare_zap_oldest_mmu_page(kvm, &invalid_list); + if (prepare_zap_oldest_mmu_page(kvm, &invalid_list)) + freed++; kvm_mmu_commit_zap_page(kvm, &invalid_list); unlock: spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); + /* + * unfair on small ones + * per-vm shrinkers cry out + * sadness comes quickly + */ list_move_tail(&kvm->vm_list, &vm_list); break; } raw_spin_unlock(&kvm_lock); + return freed; -out: +} + +static unsigned long +mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ return percpu_counter_read_positive(&kvm_total_used_mmu_pages); } static struct shrinker mmu_shrinker = { - .shrink = mmu_shrink, + .count_objects = mmu_shrink_count, + .scan_objects = mmu_shrink_scan, .seeks = DEFAULT_SEEKS * 10, }; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 5b59c573aba..77e044a0f5f 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -71,6 +71,8 @@ enum { int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); +int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, + bool execonly); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 7769699d48a..ad75d77999d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -23,6 +23,13 @@ * so the code in this file is compiled twice, once per pte size. */ +/* + * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro + * uses for EPT without A/D paging type. + */ +extern u64 __pure __using_nonexistent_pte_bit(void) + __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT"); + #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 @@ -32,6 +39,10 @@ #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK + #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK + #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT + #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS 4 #define CMPXCHG cmpxchg @@ -49,7 +60,26 @@ #define PT_INDEX(addr, level) PT32_INDEX(addr, level) #define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_MAX_FULL_LEVELS 2 + #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK + #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK + #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT + #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define CMPXCHG cmpxchg +#elif PTTYPE == PTTYPE_EPT + #define pt_element_t u64 + #define guest_walker guest_walkerEPT + #define FNAME(name) ept_##name + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_GUEST_ACCESSED_MASK 0 + #define PT_GUEST_DIRTY_MASK 0 + #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit() + #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit() + #define CMPXCHG cmpxchg64 + #define PT_MAX_FULL_LEVELS 4 #else #error Invalid PTTYPE value #endif @@ -69,6 +99,7 @@ struct guest_walker { pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS]; + bool pte_writable[PT_MAX_FULL_LEVELS]; unsigned pt_access; unsigned pte_access; gfn_t gfn; @@ -80,6 +111,40 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; } +static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) +{ + unsigned mask; + + /* dirty bit is not supported, so no need to track it */ + if (!PT_GUEST_DIRTY_MASK) + return; + + BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); + + mask = (unsigned)~ACC_WRITE_MASK; + /* Allow write access to dirty gptes */ + mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & + PT_WRITABLE_MASK; + *access &= mask; +} + +static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level) +{ + int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f; + + return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) | + ((mmu->bad_mt_xwr & (1ull << low6)) != 0); +} + +static inline int FNAME(is_present_gpte)(unsigned long pte) +{ +#if PTTYPE != PTTYPE_EPT + return is_present_gpte(pte); +#else + return pte & 7; +#endif +} + static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t __user *ptep_user, unsigned index, pt_element_t orig_pte, pt_element_t new_pte) @@ -103,6 +168,42 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return (ret != orig_pte); } +static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *spte, + u64 gpte) +{ + if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + goto no_present; + + if (!FNAME(is_present_gpte)(gpte)) + goto no_present; + + /* if accessed bit is not supported prefetch non accessed gpte */ + if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK)) + goto no_present; + + return false; + +no_present: + drop_spte(vcpu->kvm, spte); + return true; +} + +static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) +{ + unsigned access; +#if PTTYPE == PTTYPE_EPT + access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) | + ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) | + ACC_USER_MASK; +#else + access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; + access &= ~(gpte >> PT64_NX_SHIFT); +#endif + + return access; +} + static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct guest_walker *walker, @@ -114,22 +215,43 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, gfn_t table_gfn; int ret; + /* dirty/accessed bits are not supported, so no need to update them */ + if (!PT_GUEST_DIRTY_MASK) + return 0; + for (level = walker->max_level; level >= walker->level; --level) { pte = orig_pte = walker->ptes[level - 1]; table_gfn = walker->table_gfn[level - 1]; ptep_user = walker->ptep_user[level - 1]; index = offset_in_page(ptep_user) / sizeof(pt_element_t); - if (!(pte & PT_ACCESSED_MASK)) { + if (!(pte & PT_GUEST_ACCESSED_MASK)) { trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); - pte |= PT_ACCESSED_MASK; + pte |= PT_GUEST_ACCESSED_MASK; } - if (level == walker->level && write_fault && !is_dirty_gpte(pte)) { + if (level == walker->level && write_fault && + !(pte & PT_GUEST_DIRTY_MASK)) { trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); - pte |= PT_DIRTY_MASK; + pte |= PT_GUEST_DIRTY_MASK; } if (pte == orig_pte) continue; + /* + * If the slot is read-only, simply do not process the accessed + * and dirty bits. This is the correct thing to do if the slot + * is ROM, and page tables in read-as-ROM/write-as-MMIO slots + * are only supported if the accessed and dirty bits are already + * set in the ROM (so that MMIO writes are never needed). + * + * Note that NPT does not allow this at all and faults, since + * it always wants nested page table entries for the guest + * page tables to be writable. And EPT works but will simply + * overwrite the read-only memory to set the accessed and dirty + * bits. + */ + if (unlikely(!walker->pte_writable[level - 1])) + continue; + ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte); if (ret) return ret; @@ -170,7 +292,7 @@ retry_walk: if (walker->level == PT32E_ROOT_LEVEL) { pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_gpte(pte)) + if (!FNAME(is_present_gpte)(pte)) goto error; --walker->level; } @@ -179,7 +301,7 @@ retry_walk: ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); - accessed_dirty = PT_ACCESSED_MASK; + accessed_dirty = PT_GUEST_ACCESSED_MASK; pt_access = pte_access = ACC_ALL; ++walker->level; @@ -204,7 +326,8 @@ retry_walk: goto error; real_gfn = gpa_to_gfn(real_gfn); - host_addr = gfn_to_hva(vcpu->kvm, real_gfn); + host_addr = gfn_to_hva_prot(vcpu->kvm, real_gfn, + &walker->pte_writable[walker->level - 1]); if (unlikely(kvm_is_error_hva(host_addr))) goto error; @@ -215,17 +338,17 @@ retry_walk: trace_kvm_mmu_paging_element(pte, walker->level); - if (unlikely(!is_present_gpte(pte))) + if (unlikely(!FNAME(is_present_gpte)(pte))) goto error; - if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, - walker->level))) { + if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, + walker->level))) { errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; goto error; } accessed_dirty &= pte; - pte_access = pt_access & gpte_access(vcpu, pte); + pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); @@ -248,13 +371,15 @@ retry_walk: walker->gfn = real_gpa >> PAGE_SHIFT; if (!write_fault) - protect_clean_gpte(&pte_access, pte); + FNAME(protect_clean_gpte)(&pte_access, pte); else /* - * On a write fault, fold the dirty bit into accessed_dirty by - * shifting it one place right. + * On a write fault, fold the dirty bit into accessed_dirty. + * For modes without A/D bits support accessed_dirty will be + * always clear. */ - accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT); + accessed_dirty &= pte >> + (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT); if (unlikely(!accessed_dirty)) { ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); @@ -279,6 +404,25 @@ error: walker->fault.vector = PF_VECTOR; walker->fault.error_code_valid = true; walker->fault.error_code = errcode; + +#if PTTYPE == PTTYPE_EPT + /* + * Use PFERR_RSVD_MASK in error_code to to tell if EPT + * misconfiguration requires to be injected. The detection is + * done by is_rsvd_bits_set() above. + * + * We set up the value of exit_qualification to inject: + * [2:0] - Derive from [2:0] of real exit_qualification at EPT violation + * [5:3] - Calculated by the page walk of the guest EPT page tables + * [7:8] - Derived from [7:8] of real exit_qualification + * + * The other bits are set to 0. + */ + if (!(errcode & PFERR_RSVD_MASK)) { + vcpu->arch.exit_qualification &= 0x187; + vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3; + } +#endif walker->fault.address = addr; walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; @@ -293,6 +437,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, access); } +#if PTTYPE != PTTYPE_EPT static int FNAME(walk_addr_nested)(struct guest_walker *walker, struct kvm_vcpu *vcpu, gva_t addr, u32 access) @@ -300,6 +445,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker, return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, addr, access); } +#endif static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -309,14 +455,14 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gfn_t gfn; pfn_t pfn; - if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) + if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) return false; pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); gfn = gpte_to_gfn(gpte); - pte_access = sp->role.access & gpte_access(vcpu, gpte); - protect_clean_gpte(&pte_access, gpte); + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + FNAME(protect_clean_gpte)(&pte_access, gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, no_dirty_log && (pte_access & ACC_WRITE_MASK)); if (is_error_pfn(pfn)) @@ -446,7 +592,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, goto out_gpte_changed; if (sp) - link_shadow_page(it.sptep, sp); + link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK); } for (; @@ -466,7 +612,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, true, direct_access, it.sptep); - link_shadow_page(it.sptep, sp); + link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK); } clear_sp_write_flooding_count(it.sptep); @@ -727,6 +873,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, return gpa; } +#if PTTYPE != PTTYPE_EPT static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, struct x86_exception *exception) @@ -745,6 +892,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, return gpa; } +#endif /* * Using the cached information from sp->gfns is safe because: @@ -785,15 +933,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) sizeof(pt_element_t))) return -EINVAL; - if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) { + if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { vcpu->kvm->tlbs_dirty++; continue; } gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; - pte_access &= gpte_access(vcpu, gpte); - protect_clean_gpte(&pte_access, gpte); + pte_access &= FNAME(gpte_access)(vcpu, gpte); + FNAME(protect_clean_gpte)(&pte_access, gpte); if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access, &nr_present)) @@ -830,3 +978,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef gpte_to_gfn #undef gpte_to_gfn_lvl #undef CMPXCHG +#undef PT_GUEST_ACCESSED_MASK +#undef PT_GUEST_DIRTY_MASK +#undef PT_GUEST_DIRTY_SHIFT +#undef PT_GUEST_ACCESSED_SHIFT diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index c53e797e736..5c4f63151b4 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -160,7 +160,7 @@ static void stop_counter(struct kvm_pmc *pmc) static void reprogram_counter(struct kvm_pmc *pmc, u32 type, unsigned config, bool exclude_user, bool exclude_kernel, - bool intr) + bool intr, bool in_tx, bool in_tx_cp) { struct perf_event *event; struct perf_event_attr attr = { @@ -173,6 +173,10 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type, .exclude_kernel = exclude_kernel, .config = config, }; + if (in_tx) + attr.config |= HSW_IN_TX; + if (in_tx_cp) + attr.config |= HSW_IN_TX_CHECKPOINTED; attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); @@ -226,7 +230,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | ARCH_PERFMON_EVENTSEL_INV | - ARCH_PERFMON_EVENTSEL_CMASK))) { + ARCH_PERFMON_EVENTSEL_CMASK | + HSW_IN_TX | + HSW_IN_TX_CHECKPOINTED))) { config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, unit_mask); if (config != PERF_COUNT_HW_MAX) @@ -239,7 +245,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) reprogram_counter(pmc, type, config, !(eventsel & ARCH_PERFMON_EVENTSEL_USR), !(eventsel & ARCH_PERFMON_EVENTSEL_OS), - eventsel & ARCH_PERFMON_EVENTSEL_INT); + eventsel & ARCH_PERFMON_EVENTSEL_INT, + (eventsel & HSW_IN_TX), + (eventsel & HSW_IN_TX_CHECKPOINTED)); } static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) @@ -256,7 +264,7 @@ static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) arch_events[fixed_pmc_events[idx]].event_type, !(en & 0x2), /* exclude user */ !(en & 0x1), /* exclude kernel */ - pmi); + pmi, false, false); } static inline u8 fixed_en_pmi(u64 ctrl, int idx) @@ -408,7 +416,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { if (data == pmc->eventsel) return 0; - if (!(data & 0xffffffff00200000ull)) { + if (!(data & pmu->reserved_bits)) { reprogram_gp_counter(pmc, data); return 0; } @@ -450,6 +458,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_GP] = 0; pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->version = 0; + pmu->reserved_bits = 0xffffffff00200000ull; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); if (!entry) @@ -478,6 +487,12 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); pmu->global_ctrl_mask = ~pmu->global_ctrl; + + entry = kvm_find_cpuid_entry(vcpu, 7, 0); + if (entry && + (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && + (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) + pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED; } void kvm_pmu_init(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 064d0be67ec..2b2fce1b200 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -373,6 +373,7 @@ struct nested_vmx { * we must keep them pinned while L2 runs. */ struct page *apic_access_page; + u64 msr_ia32_feature_control; }; #define POSTED_INTR_ON 0 @@ -711,10 +712,10 @@ static void nested_release_page_clean(struct page *page) kvm_release_page_clean(page); } +static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); -static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -1039,12 +1040,16 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) (vmcs12->secondary_vm_exec_control & bit); } -static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12, - struct kvm_vcpu *vcpu) +static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) { return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; } +static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); +} + static inline bool is_exception(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -2155,6 +2160,7 @@ static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; static u32 nested_vmx_misc_low, nested_vmx_misc_high; +static u32 nested_vmx_ept_caps; static __init void nested_vmx_setup_ctls_msrs(void) { /* @@ -2190,14 +2196,17 @@ static __init void nested_vmx_setup_ctls_msrs(void) * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and * 17 must be 1. */ + rdmsr(MSR_IA32_VMX_EXIT_CTLS, + nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ + nested_vmx_exit_ctls_high &= #ifdef CONFIG_X86_64 - nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE; -#else - nested_vmx_exit_ctls_high = 0; + VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif - nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; + VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; + nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | + VM_EXIT_LOAD_IA32_EFER); /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2205,8 +2214,12 @@ static __init void nested_vmx_setup_ctls_msrs(void) /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */ nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_entry_ctls_high &= - VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; - nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; +#ifdef CONFIG_X86_64 + VM_ENTRY_IA32E_MODE | +#endif + VM_ENTRY_LOAD_IA32_PAT; + nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | + VM_ENTRY_LOAD_IA32_EFER); /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, @@ -2241,6 +2254,22 @@ static __init void nested_vmx_setup_ctls_msrs(void) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_WBINVD_EXITING; + if (enable_ept) { + /* nested EPT: emulate EPT also to L1 */ + nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; + nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | + VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; + nested_vmx_ept_caps &= vmx_capability.ept; + /* + * Since invept is completely emulated we support both global + * and context invalidation independent of what host cpu + * supports + */ + nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | + VMX_EPT_EXTENT_CONTEXT_BIT; + } else + nested_vmx_ept_caps = 0; + /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK | @@ -2282,8 +2311,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) switch (msr_index) { case MSR_IA32_FEATURE_CONTROL: - *pdata = 0; - break; + if (nested_vmx_allowed(vcpu)) { + *pdata = to_vmx(vcpu)->nested.msr_ia32_feature_control; + break; + } + return 0; case MSR_IA32_VMX_BASIC: /* * This MSR reports some information about VMX support. We @@ -2346,8 +2378,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) nested_vmx_secondary_ctls_high); break; case MSR_IA32_VMX_EPT_VPID_CAP: - /* Currently, no nested ept or nested vpid */ - *pdata = 0; + /* Currently, no nested vpid support */ + *pdata = nested_vmx_ept_caps; break; default: return 0; @@ -2356,14 +2388,24 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) return 1; } -static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { + u32 msr_index = msr_info->index; + u64 data = msr_info->data; + bool host_initialized = msr_info->host_initiated; + if (!nested_vmx_allowed(vcpu)) return 0; - if (msr_index == MSR_IA32_FEATURE_CONTROL) - /* TODO: the right thing. */ + if (msr_index == MSR_IA32_FEATURE_CONTROL) { + if (!host_initialized && + to_vmx(vcpu)->nested.msr_ia32_feature_control + & FEATURE_CONTROL_LOCKED) + return 0; + to_vmx(vcpu)->nested.msr_ia32_feature_control = data; return 1; + } + /* * No need to treat VMX capability MSRs specially: If we don't handle * them, handle_wrmsr will #GP(0), which is correct (they are readonly) @@ -2494,7 +2536,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; /* Otherwise falls through */ default: - if (vmx_set_vmx_msr(vcpu, msr_index, data)) + if (vmx_set_vmx_msr(vcpu, msr_info)) break; msr = find_msr_entry(vmx, msr_index); if (msr) { @@ -3213,25 +3255,29 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) static void ept_load_pdptrs(struct kvm_vcpu *vcpu) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + if (!test_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_dirty)) return; if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]); - vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]); - vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]); - vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]); + vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); + vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); + vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); + vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); } } static void ept_save_pdptrs(struct kvm_vcpu *vcpu) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); - vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); - vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); - vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); + mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); + mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); + mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); + mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); } __set_bit(VCPU_EXREG_PDPTR, @@ -5297,14 +5343,29 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) return 0; } + /* + * EPT violation happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + * There are errata that may cause this bit to not be set: + * AAK134, BY25. + */ + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + cpu_has_virtual_nmis() && + (exit_qualification & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); /* It is a write fault? */ error_code = exit_qualification & (1U << 1); + /* It is a fetch fault? */ + error_code |= (exit_qualification & (1U << 2)) << 2; /* ept page table is present? */ error_code |= (exit_qualification >> 3) & 0x1; + vcpu->arch.exit_qualification = exit_qualification; + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); } @@ -5438,7 +5499,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); - if (err == EMULATE_DO_MMIO) { + if (err == EMULATE_USER_EXIT) { + ++vcpu->stat.mmio_exits; ret = 0; goto out; } @@ -5567,8 +5629,47 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) free_loaded_vmcs(&vmx->vmcs01); } +/* + * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), + * set the success or error code of an emulated VMX instruction, as specified + * by Vol 2B, VMX Instruction Reference, "Conventions". + */ +static void nested_vmx_succeed(struct kvm_vcpu *vcpu) +{ + vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); +} + +static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) +{ + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | + X86_EFLAGS_SF | X86_EFLAGS_OF)) + | X86_EFLAGS_CF); +} + static void nested_vmx_failValid(struct kvm_vcpu *vcpu, - u32 vm_instruction_error); + u32 vm_instruction_error) +{ + if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { + /* + * failValid writes the error number to the current VMCS, which + * can't be done there isn't a current VMCS. + */ + nested_vmx_failInvalid(vcpu); + return; + } + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_SF | X86_EFLAGS_OF)) + | X86_EFLAGS_ZF); + get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; + /* + * We don't need to force a shadow sync because + * VM_INSTRUCTION_ERROR is not shadowed + */ +} /* * Emulate the VMXON instruction. @@ -5583,6 +5684,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu) struct kvm_segment cs; struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs *shadow_vmcs; + const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED + | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; /* The Intel VMX Instruction Reference lists a bunch of bits that * are prerequisite to running VMXON, most notably cr4.VMXE must be @@ -5611,6 +5714,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); return 1; } + + if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES) + != VMXON_NEEDED_FEATURES) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); if (!shadow_vmcs) @@ -5628,6 +5738,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) vmx->nested.vmxon = true; skip_emulated_instruction(vcpu); + nested_vmx_succeed(vcpu); return 1; } @@ -5712,6 +5823,7 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) return 1; free_nested(to_vmx(vcpu)); skip_emulated_instruction(vcpu); + nested_vmx_succeed(vcpu); return 1; } @@ -5768,48 +5880,6 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, return 0; } -/* - * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), - * set the success or error code of an emulated VMX instruction, as specified - * by Vol 2B, VMX Instruction Reference, "Conventions". - */ -static void nested_vmx_succeed(struct kvm_vcpu *vcpu) -{ - vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | - X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); -} - -static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) -{ - vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | - X86_EFLAGS_SF | X86_EFLAGS_OF)) - | X86_EFLAGS_CF); -} - -static void nested_vmx_failValid(struct kvm_vcpu *vcpu, - u32 vm_instruction_error) -{ - if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { - /* - * failValid writes the error number to the current VMCS, which - * can't be done there isn't a current VMCS. - */ - nested_vmx_failInvalid(vcpu); - return; - } - vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | - X86_EFLAGS_SF | X86_EFLAGS_OF)) - | X86_EFLAGS_ZF); - get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; - /* - * We don't need to force a shadow sync because - * VM_INSTRUCTION_ERROR is not shadowed - */ -} - /* Emulate the VMCLEAR instruction */ static int handle_vmclear(struct kvm_vcpu *vcpu) { @@ -5972,8 +6042,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) unsigned long field; u64 field_value; struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; - unsigned long *fields = (unsigned long *)shadow_read_write_fields; - int num_fields = max_shadow_read_write_fields; + const unsigned long *fields = shadow_read_write_fields; + const int num_fields = max_shadow_read_write_fields; vmcs_load(shadow_vmcs); @@ -6002,12 +6072,11 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) { - unsigned long *fields[] = { - (unsigned long *)shadow_read_write_fields, - (unsigned long *)shadow_read_only_fields + const unsigned long *fields[] = { + shadow_read_write_fields, + shadow_read_only_fields }; - int num_lists = ARRAY_SIZE(fields); - int max_fields[] = { + const int max_fields[] = { max_shadow_read_write_fields, max_shadow_read_only_fields }; @@ -6018,7 +6087,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) vmcs_load(shadow_vmcs); - for (q = 0; q < num_lists; q++) { + for (q = 0; q < ARRAY_SIZE(fields); q++) { for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; vmcs12_read_any(&vmx->vcpu, field, &field_value); @@ -6248,6 +6317,74 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) return 1; } +/* Emulate the INVEPT instruction */ +static int handle_invept(struct kvm_vcpu *vcpu) +{ + u32 vmx_instruction_info, types; + unsigned long type; + gva_t gva; + struct x86_exception e; + struct { + u64 eptp, gpa; + } operand; + u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK; + + if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || + !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); + + types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; + + if (!(types & (1UL << type))) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return 1; + } + + /* According to the Intel VMX instruction reference, the memory + * operand is read even if it isn't needed (e.g., for type==global) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, &gva)) + return 1; + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand, + sizeof(operand), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + switch (type) { + case VMX_EPT_EXTENT_CONTEXT: + if ((operand.eptp & eptp_mask) != + (nested_ept_get_cr3(vcpu) & eptp_mask)) + break; + case VMX_EPT_EXTENT_GLOBAL: + kvm_mmu_sync_roots(vcpu); + kvm_mmu_flush_tlb(vcpu); + nested_vmx_succeed(vcpu); + break; + default: + BUG_ON(1); + break; + } + + skip_emulated_instruction(vcpu); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -6292,6 +6429,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, + [EXIT_REASON_INVEPT] = handle_invept, }; static const int kvm_vmx_max_exit_handlers = @@ -6518,6 +6656,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: + case EXIT_REASON_INVEPT: /* * VMX instructions trap unconditionally. This allows L1 to * emulate them for its L2 guest, i.e., allows 3-level nesting! @@ -6550,7 +6689,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); case EXIT_REASON_EPT_VIOLATION: + /* + * L0 always deals with the EPT violation. If nested EPT is + * used, and the nested mmu code discovers that the address is + * missing in the guest EPT table (EPT12), the EPT violation + * will be injected with nested_ept_inject_page_fault() + */ + return 0; case EXIT_REASON_EPT_MISCONFIG: + /* + * L2 never uses directly L1's EPT, but rather L0's own EPT + * table (shadow on EPT) or a merged EPT table that L0 built + * (EPT on EPT). So any problems with the structure of the + * table is L0's fault. + */ return 0; case EXIT_REASON_PREEMPTION_TIMER: return vmcs12->pin_based_vm_exec_control & @@ -6638,7 +6790,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( - get_vmcs12(vcpu), vcpu)))) { + get_vmcs12(vcpu))))) { if (vmx_interrupt_allowed(vcpu)) { vmx->soft_vnmi_blocked = 0; } else if (vmx->vnmi_blocked_time > 1000000000LL && @@ -7326,6 +7478,48 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) entry->ecx |= bit(X86_FEATURE_VMX); } +static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault) +{ + struct vmcs12 *vmcs12; + nested_vmx_vmexit(vcpu); + vmcs12 = get_vmcs12(vcpu); + + if (fault->error_code & PFERR_RSVD_MASK) + vmcs12->vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; + else + vmcs12->vm_exit_reason = EXIT_REASON_EPT_VIOLATION; + vmcs12->exit_qualification = vcpu->arch.exit_qualification; + vmcs12->guest_physical_address = fault->address; +} + +/* Callbacks for nested_ept_init_mmu_context: */ + +static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) +{ + /* return the page table to be shadowed - in our case, EPT12 */ + return get_vmcs12(vcpu)->ept_pointer; +} + +static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) +{ + int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, + nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); + + vcpu->arch.mmu.set_cr3 = vmx_set_cr3; + vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; + vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; + + vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; + + return r; +} + +static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) +{ + vcpu->arch.walk_mmu = &vcpu->arch.mmu; +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -7388,7 +7582,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_interruptibility_info); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); - vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags); + vmx_set_rflags(vcpu, vmcs12->guest_rflags); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, vmcs12->guest_pending_dbg_exceptions); vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); @@ -7508,15 +7702,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */ - vmcs_write32(VM_EXIT_CONTROLS, - vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); - vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | + /* L2->L1 exit controls are emulated - the hardware exit is to L0 so + * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER + * bits are further modified by vmx_set_efer() below. + */ + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); + + /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are + * emulated by vmx_set_efer(), below. + */ + vmcs_write32(VM_ENTRY_CONTROLS, + (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER & + ~VM_ENTRY_IA32E_MODE) | (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); - else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) + vcpu->arch.pat = vmcs12->guest_ia32_pat; + } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); @@ -7538,6 +7741,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx_flush_tlb(vcpu); } + if (nested_cpu_has_ept(vmcs12)) { + kvm_mmu_unload(vcpu); + nested_ept_init_mmu_context(vcpu); + } + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->guest_ia32_efer; else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) @@ -7565,6 +7773,16 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) kvm_set_cr3(vcpu, vmcs12->guest_cr3); kvm_mmu_reset_context(vcpu); + /* + * L1 may access the L2's PDPTR, so save them to construct vmcs12 + */ + if (enable_ept) { + vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); + vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); + vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); + vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); + } + kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); } @@ -7887,6 +8105,22 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_pending_dbg_exceptions = vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); + /* + * In some cases (usually, nested EPT), L2 is allowed to change its + * own CR3 without exiting. If it has changed it, we must keep it. + * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined + * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. + * + * Additionally, restore L2's PDPTR to vmcs12. + */ + if (enable_ept) { + vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3); + vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); + vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); + vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); + vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); + } + vmcs12->vm_entry_controls = (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE); @@ -7948,6 +8182,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { + struct kvm_segment seg; + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->host_ia32_efer; else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) @@ -7982,7 +8218,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); kvm_set_cr4(vcpu, vmcs12->host_cr4); - /* shadow page tables on either EPT or shadow page tables */ + if (nested_cpu_has_ept(vmcs12)) + nested_ept_uninit_mmu_context(vcpu); + kvm_set_cr3(vcpu, vmcs12->host_cr3); kvm_mmu_reset_context(vcpu); @@ -8001,23 +8239,61 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); - vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base); - vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base); - vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base); - vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector); - vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector); - vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector); - vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector); - vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector); - vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector); - vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector); - - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) + + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); + vcpu->arch.pat = vmcs12->host_ia32_pat; + } if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl); + /* Set L1 segment info according to Intel SDM + 27.5.2 Loading Host Segment and Descriptor-Table Registers */ + seg = (struct kvm_segment) { + .base = 0, + .limit = 0xFFFFFFFF, + .selector = vmcs12->host_cs_selector, + .type = 11, + .present = 1, + .s = 1, + .g = 1 + }; + if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) + seg.l = 1; + else + seg.db = 1; + vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); + seg = (struct kvm_segment) { + .base = 0, + .limit = 0xFFFFFFFF, + .type = 3, + .present = 1, + .s = 1, + .db = 1, + .g = 1 + }; + seg.selector = vmcs12->host_ds_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); + seg.selector = vmcs12->host_es_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); + seg.selector = vmcs12->host_ss_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); + seg.selector = vmcs12->host_fs_selector; + seg.base = vmcs12->host_fs_base; + vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); + seg.selector = vmcs12->host_gs_selector; + seg.base = vmcs12->host_gs_base; + vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); + seg = (struct kvm_segment) { + .base = vmcs12->host_tr_base, + .limit = 0x67, + .selector = vmcs12->host_tr_selector, + .type = 11, + .present = 1 + }; + vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); + kvm_set_dr(vcpu, 7, 0x400); vmcs_write64(GUEST_IA32_DEBUGCTL, 0); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d21bce50531..e5ca72a5cdb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -682,17 +682,6 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) */ } - /* - * Does the new cr3 value map to physical memory? (Note, we - * catch an invalid cr3 even in real-mode, because it would - * cause trouble later on when we turn on paging anyway.) - * - * A real CPU would silently accept an invalid cr3 and would - * attempt to use it - with largely undefined (and often hard - * to debug) behavior on the guest side. - */ - if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) - return 1; vcpu->arch.cr3 = cr3; __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); vcpu->arch.mmu.new_cr3(vcpu); @@ -850,7 +839,8 @@ static u32 msrs_to_save[] = { #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEATURE_CONTROL }; static unsigned num_msrs_to_save; @@ -1457,6 +1447,29 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) #endif } +static void kvm_gen_update_masterclock(struct kvm *kvm) +{ +#ifdef CONFIG_X86_64 + int i; + struct kvm_vcpu *vcpu; + struct kvm_arch *ka = &kvm->arch; + + spin_lock(&ka->pvclock_gtod_sync_lock); + kvm_make_mclock_inprogress_request(kvm); + /* no guest entries from this point */ + pvclock_update_vm_gtod_copy(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + + /* guest entries allowed */ + kvm_for_each_vcpu(i, vcpu, kvm) + clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); + + spin_unlock(&ka->pvclock_gtod_sync_lock); +#endif +} + static int kvm_guest_time_update(struct kvm_vcpu *v) { unsigned long flags, this_tsc_khz; @@ -3806,6 +3819,7 @@ long kvm_arch_vm_ioctl(struct file *filp, delta = user_ns.clock - now_ns; local_irq_enable(); kvm->arch.kvmclock_offset = delta; + kvm_gen_update_masterclock(kvm); break; } case KVM_GET_CLOCK: { @@ -4955,6 +4969,97 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, static int complete_emulated_mmio(struct kvm_vcpu *vcpu); static int complete_emulated_pio(struct kvm_vcpu *vcpu); +static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, + unsigned long *db) +{ + u32 dr6 = 0; + int i; + u32 enable, rwlen; + + enable = dr7; + rwlen = dr7 >> 16; + for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4) + if ((enable & 3) && (rwlen & 15) == type && db[i] == addr) + dr6 |= (1 << i); + return dr6; +} + +static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r) +{ + struct kvm_run *kvm_run = vcpu->run; + + /* + * Use the "raw" value to see if TF was passed to the processor. + * Note that the new value of the flags has not been saved yet. + * + * This is correct even for TF set by the guest, because "the + * processor will not generate this exception after the instruction + * that sets the TF flag". + */ + unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + + if (unlikely(rflags & X86_EFLAGS_TF)) { + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1; + kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; + kvm_run->debug.arch.exception = DB_VECTOR; + kvm_run->exit_reason = KVM_EXIT_DEBUG; + *r = EMULATE_USER_EXIT; + } else { + vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF; + /* + * "Certain debug exceptions may clear bit 0-3. The + * remaining contents of the DR6 register are never + * cleared by the processor". + */ + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= DR6_BS; + kvm_queue_exception(vcpu, DB_VECTOR); + } + } +} + +static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) +{ + struct kvm_run *kvm_run = vcpu->run; + unsigned long eip = vcpu->arch.emulate_ctxt.eip; + u32 dr6 = 0; + + if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) && + (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) { + dr6 = kvm_vcpu_check_hw_bp(eip, 0, + vcpu->arch.guest_debug_dr7, + vcpu->arch.eff_db); + + if (dr6 != 0) { + kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; + kvm_run->debug.arch.pc = kvm_rip_read(vcpu) + + get_segment_base(vcpu, VCPU_SREG_CS); + + kvm_run->debug.arch.exception = DB_VECTOR; + kvm_run->exit_reason = KVM_EXIT_DEBUG; + *r = EMULATE_USER_EXIT; + return true; + } + } + + if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) { + dr6 = kvm_vcpu_check_hw_bp(eip, 0, + vcpu->arch.dr7, + vcpu->arch.db); + + if (dr6 != 0) { + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= dr6; + kvm_queue_exception(vcpu, DB_VECTOR); + *r = EMULATE_DONE; + return true; + } + } + + return false; +} + int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, int emulation_type, @@ -4975,6 +5080,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, if (!(emulation_type & EMULTYPE_NO_DECODE)) { init_emulate_ctxt(vcpu); + + /* + * We will reenter on the same instruction since + * we do not set complete_userspace_io. This does not + * handle watchpoints yet, those would be handled in + * the emulate_ops. + */ + if (kvm_vcpu_check_breakpoint(vcpu, &r)) + return r; + ctxt->interruptibility = 0; ctxt->have_exception = false; ctxt->perm_ok = false; @@ -5031,17 +5146,18 @@ restart: inject_emulated_exception(vcpu); r = EMULATE_DONE; } else if (vcpu->arch.pio.count) { - if (!vcpu->arch.pio.in) + if (!vcpu->arch.pio.in) { + /* FIXME: return into emulator if single-stepping. */ vcpu->arch.pio.count = 0; - else { + } else { writeback = false; vcpu->arch.complete_userspace_io = complete_emulated_pio; } - r = EMULATE_DO_MMIO; + r = EMULATE_USER_EXIT; } else if (vcpu->mmio_needed) { if (!vcpu->mmio_is_write) writeback = false; - r = EMULATE_DO_MMIO; + r = EMULATE_USER_EXIT; vcpu->arch.complete_userspace_io = complete_emulated_mmio; } else if (r == EMULATION_RESTART) goto restart; @@ -5050,10 +5166,12 @@ restart: if (writeback) { toggle_interruptibility(vcpu, ctxt->interruptibility); - kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; kvm_rip_write(vcpu, ctxt->eip); + if (r == EMULATE_DONE) + kvm_vcpu_check_singlestep(vcpu, &r); + kvm_set_rflags(vcpu, ctxt->eflags); } else vcpu->arch.emulate_regs_need_sync_to_vcpu = true; @@ -5347,7 +5465,7 @@ static struct notifier_block pvclock_gtod_notifier = { int kvm_arch_init(void *opaque) { int r; - struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; + struct kvm_x86_ops *ops = opaque; if (kvm_x86_ops) { printk(KERN_ERR "kvm: already loaded the other module\n"); @@ -5495,6 +5613,23 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) return 1; } +/* + * kvm_pv_kick_cpu_op: Kick a vcpu. + * + * @apicid - apicid of vcpu to be kicked. + */ +static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) +{ + struct kvm_lapic_irq lapic_irq; + + lapic_irq.shorthand = 0; + lapic_irq.dest_mode = 0; + lapic_irq.dest_id = apicid; + + lapic_irq.delivery_mode = APIC_DM_REMRD; + kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL); +} + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; @@ -5528,6 +5663,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) case KVM_HC_VAPIC_POLL_IRQ: ret = 0; break; + case KVM_HC_KICK_CPU: + kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); + ret = 0; + break; default: ret = -KVM_ENOSYS; break; @@ -5689,29 +5828,6 @@ static void process_nmi(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } -static void kvm_gen_update_masterclock(struct kvm *kvm) -{ -#ifdef CONFIG_X86_64 - int i; - struct kvm_vcpu *vcpu; - struct kvm_arch *ka = &kvm->arch; - - spin_lock(&ka->pvclock_gtod_sync_lock); - kvm_make_mclock_inprogress_request(kvm); - /* no guest entries from this point */ - pvclock_update_vm_gtod_copy(kvm); - - kvm_for_each_vcpu(i, vcpu, kvm) - set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); - - /* guest entries allowed */ - kvm_for_each_vcpu(i, vcpu, kvm) - clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); - - spin_unlock(&ka->pvclock_gtod_sync_lock); -#endif -} - static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { u64 eoi_exit_bitmap[4]; @@ -5950,6 +6066,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) kvm_apic_accept_events(vcpu); switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: + vcpu->arch.pv.pv_unhalted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; case KVM_MP_STATE_RUNNABLE: @@ -6061,6 +6178,8 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { vcpu->mmio_needed = 0; + + /* FIXME: return into emulator if single-stepping. */ if (vcpu->mmio_is_write) return 1; vcpu->mmio_read_completed = 1; @@ -6249,7 +6368,12 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { kvm_apic_accept_events(vcpu); - mp_state->mp_state = vcpu->arch.mp_state; + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED && + vcpu->arch.pv.pv_unhalted) + mp_state->mp_state = KVM_MP_STATE_RUNNABLE; + else + mp_state->mp_state = vcpu->arch.mp_state; + return 0; } @@ -6770,6 +6894,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) BUG_ON(vcpu->kvm == NULL); kvm = vcpu->kvm; + vcpu->arch.pv.pv_unhalted = false; vcpu->arch.emulate_ctxt.ops = &emulate_ops; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -7019,6 +7144,15 @@ out_free: return -ENOMEM; } +void kvm_arch_memslots_updated(struct kvm *kvm) +{ + /* + * memslots->generation has been incremented. + * mmio generation may have reached its maximum value. + */ + kvm_mmu_invalidate_mmio_sptes(kvm); +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_userspace_memory_region *mem, @@ -7079,11 +7213,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, */ if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) kvm_mmu_slot_remove_write_access(kvm, mem->slot); - /* - * If memory slot is created, or moved, we need to clear all - * mmio sptes. - */ - kvm_mmu_invalidate_mmio_sptes(kvm); } void kvm_arch_flush_shadow_all(struct kvm *kvm) @@ -7103,6 +7232,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) !vcpu->arch.apf.halted) || !list_empty_careful(&vcpu->async_pf.done) || kvm_apic_has_events(vcpu) + || vcpu->arch.pv.pv_unhalted || atomic_read(&vcpu->arch.nmi_queued) || (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 6a22c19da66..bdf8532494f 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -7,8 +7,7 @@ * kernel and insert a module (lg.ko) which allows us to run other Linux * kernels the same way we'd run processes. We call the first kernel the Host, * and the others the Guests. The program which sets up and configures Guests - * (such as the example in Documentation/virtual/lguest/lguest.c) is called the - * Launcher. + * (such as the example in tools/lguest/lguest.c) is called the Launcher. * * Secondly, we only run specially modified Guests, not normal kernels: setting * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows @@ -1057,6 +1056,12 @@ static void lguest_load_sp0(struct tss_struct *tss, } /* Let's just say, I wouldn't do debugging under a Guest. */ +static unsigned long lguest_get_debugreg(int regno) +{ + /* FIXME: Implement */ + return 0; +} + static void lguest_set_debugreg(int regno, unsigned long value) { /* FIXME: Implement */ @@ -1304,6 +1309,7 @@ __init void lguest_init(void) pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; pv_cpu_ops.set_ldt = lguest_set_ldt; pv_cpu_ops.load_tls = lguest_load_tls; + pv_cpu_ops.get_debugreg = lguest_get_debugreg; pv_cpu_ops.set_debugreg = lguest_set_debugreg; pv_cpu_ops.clts = lguest_clts; pv_cpu_ops.read_cr0 = lguest_read_cr0; diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 96b2c6697c9..992d63bb154 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -16,7 +16,7 @@ clean-files := inat-tables.c obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o -lib-y := delay.o +lib-y := delay.o misc.o lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index 25b7ae8d058..7609e0e421e 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -6,6 +6,7 @@ */ #include <asm/checksum.h> #include <linux/module.h> +#include <asm/smap.h> /** * csum_partial_copy_from_user - Copy and checksum from user space. @@ -52,8 +53,10 @@ csum_partial_copy_from_user(const void __user *src, void *dst, len -= 2; } } + stac(); isum = csum_partial_copy_generic((__force const void *)src, dst, len, isum, errp, NULL); + clac(); if (unlikely(*errp)) goto out_err; @@ -82,6 +85,8 @@ __wsum csum_partial_copy_to_user(const void *src, void __user *dst, int len, __wsum isum, int *errp) { + __wsum ret; + might_sleep(); if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) { @@ -105,8 +110,11 @@ csum_partial_copy_to_user(const void *src, void __user *dst, } *errp = 0; - return csum_partial_copy_generic(src, (void __force *)dst, - len, isum, NULL, errp); + stac(); + ret = csum_partial_copy_generic(src, (void __force *)dst, + len, isum, NULL, errp); + clac(); + return ret; } EXPORT_SYMBOL(csum_partial_copy_to_user); diff --git a/arch/x86/lib/misc.c b/arch/x86/lib/misc.c new file mode 100644 index 00000000000..76b373af03f --- /dev/null +++ b/arch/x86/lib/misc.c @@ -0,0 +1,21 @@ +/* + * Count the digits of @val including a possible sign. + * + * (Typed on and submitted from hpa's mobile phone.) + */ +int num_digits(int val) +{ + int m = 10; + int d = 1; + + if (val < 0) { + d++; + val = -val; + } + + while (val >= m) { + m *= 10; + d++; + } + return d; +} diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index 4f74d94c8d9..ddf9ecb53cc 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -11,39 +11,26 @@ #include <linux/sched.h> /* - * best effort, GUP based copy_from_user() that is NMI-safe + * We rely on the nested NMI work to allow atomic faults from the NMI path; the + * nested NMI paths are careful to preserve CR2. */ unsigned long copy_from_user_nmi(void *to, const void __user *from, unsigned long n) { - unsigned long offset, addr = (unsigned long)from; - unsigned long size, len = 0; - struct page *page; - void *map; - int ret; + unsigned long ret; if (__range_not_ok(from, n, TASK_SIZE)) - return len; - - do { - ret = __get_user_pages_fast(addr, 1, 0, &page); - if (!ret) - break; - - offset = addr & (PAGE_SIZE - 1); - size = min(PAGE_SIZE - offset, n - len); - - map = kmap_atomic(page); - memcpy(to, map+offset, size); - kunmap_atomic(map); - put_page(page); - - len += size; - to += size; - addr += size; - - } while (len < n); - - return len; + return 0; + + /* + * Even though this function is typically called from NMI/IRQ context + * disable pagefaults so that its behaviour is consistent even when + * called form other contexts. + */ + pagefault_disable(); + ret = __copy_from_user_inatomic(to, from, n); + pagefault_enable(); + + return ret; } EXPORT_SYMBOL_GPL(copy_from_user_nmi); diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 3eb18acd0e4..e2f5e21c03b 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -654,14 +654,13 @@ EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero); * Returns number of bytes that could not be copied. * On success, this will be zero. */ -unsigned long -copy_to_user(void __user *to, const void *from, unsigned long n) +unsigned long _copy_to_user(void __user *to, const void *from, unsigned n) { if (access_ok(VERIFY_WRITE, to, n)) n = __copy_to_user(to, from, n); return n; } -EXPORT_SYMBOL(copy_to_user); +EXPORT_SYMBOL(_copy_to_user); /** * copy_from_user: - Copy a block of data from user space. @@ -679,8 +678,7 @@ EXPORT_SYMBOL(copy_to_user); * If some data could not be copied, this function will pad the copied * data to the requested size using zero bytes. */ -unsigned long -_copy_from_user(void *to, const void __user *from, unsigned long n) +unsigned long _copy_from_user(void *to, const void __user *from, unsigned n) { if (access_ok(VERIFY_READ, from, n)) n = __copy_from_user(to, from, n); diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 906fea31579..c905e89e19f 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -68,7 +68,7 @@ EXPORT_SYMBOL(copy_in_user); * Since protection fault in copy_from/to_user is not a normal situation, * it is not necessary to optimize tail handling. */ -unsigned long +__visible unsigned long copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest) { char c; diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 5d7e51f3fd2..533a85e3a07 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -1,10 +1,8 @@ # x86 Opcode Maps # # This is (mostly) based on following documentations. -# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2 -# (#325383-040US, October 2011) -# - Intel(R) Advanced Vector Extensions Programming Reference -# (#319433-011,JUNE 2011). +# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2C +# (#326018-047US, June 2013) # #<Opcode maps> # Table: table-name @@ -29,6 +27,7 @@ # - (F3): the last prefix is 0xF3 # - (F2): the last prefix is 0xF2 # - (!F3) : the last prefix is not 0xF3 (including non-last prefix case) +# - (66&F2): Both 0x66 and 0xF2 prefixes are specified. Table: one byte opcode Referrer: @@ -246,8 +245,8 @@ c2: RETN Iw (f64) c3: RETN c4: LES Gz,Mp (i64) | VEX+2byte (Prefix) c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix) -c6: Grp11 Eb,Ib (1A) -c7: Grp11 Ev,Iz (1A) +c6: Grp11A Eb,Ib (1A) +c7: Grp11B Ev,Iz (1A) c8: ENTER Iw,Ib c9: LEAVE (d64) ca: RETF Iw @@ -293,8 +292,8 @@ ef: OUT DX,eAX # 0xf0 - 0xff f0: LOCK (Prefix) f1: -f2: REPNE (Prefix) -f3: REP/REPE (Prefix) +f2: REPNE (Prefix) | XACQUIRE (Prefix) +f3: REP/REPE (Prefix) | XRELEASE (Prefix) f4: HLT f5: CMC f6: Grp3_1 Eb (1A) @@ -326,7 +325,8 @@ AVXcode: 1 0a: 0b: UD2 (1B) 0c: -0d: NOP Ev | GrpP +# AMD's prefetch group. Intel supports prefetchw(/1) only. +0d: GrpP 0e: FEMMS # 3DNow! uses the last imm byte as opcode extension. 0f: 3DNow! Pq,Qq,Ib @@ -729,12 +729,12 @@ dc: VAESENC Vdq,Hdq,Wdq (66),(v1) dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1) de: VAESDEC Vdq,Hdq,Wdq (66),(v1) df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1) -f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) -f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) +f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2) +f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2) f2: ANDN Gy,By,Ey (v) f3: Grp17 (1A) f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) -f6: MULX By,Gy,rDX,Ey (F2),(v) +f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v) f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v) EndTable @@ -861,8 +861,8 @@ EndTable GrpTable: Grp7 0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) -1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001) -2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) +1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) +2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) 3: LIDT Ms 4: SMSW Mw/Rv 5: @@ -880,15 +880,21 @@ EndTable GrpTable: Grp9 1: CMPXCHG8B/16B Mq/Mdq 6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B) -7: VMPTRST Mq | VMPTRST Mq (F3) +7: VMPTRST Mq | VMPTRST Mq (F3) | RDSEED Rv (11B) EndTable GrpTable: Grp10 EndTable -GrpTable: Grp11 -# Note: the operands are given by group opcode -0: MOV +# Grp11A and Grp11B are expressed as Grp11 in Intel SDM +GrpTable: Grp11A +0: MOV Eb,Ib +7: XABORT Ib (000),(11B) +EndTable + +GrpTable: Grp11B +0: MOV Eb,Iz +7: XBEGIN Jz (000),(11B) EndTable GrpTable: Grp12 diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 654be4ae304..7a517bb4106 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -51,7 +51,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr) return 0; } -static inline int __kprobes notify_page_fault(struct pt_regs *regs) +static inline int __kprobes kprobes_fault(struct pt_regs *regs) { int ret = 0; @@ -842,23 +842,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, force_sig_info_fault(SIGBUS, code, address, tsk, fault); } -static noinline int +static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, unsigned int fault) { - /* - * Pagefault was interrupted by SIGKILL. We have no reason to - * continue pagefault. - */ - if (fatal_signal_pending(current)) { - if (!(fault & VM_FAULT_RETRY)) - up_read(¤t->mm->mmap_sem); - if (!(error_code & PF_USER)) - no_context(regs, error_code, address, 0, 0); - return 1; + if (fatal_signal_pending(current) && !(error_code & PF_USER)) { + up_read(¤t->mm->mmap_sem); + no_context(regs, error_code, address, 0, 0); + return; } - if (!(fault & VM_FAULT_ERROR)) - return 0; if (fault & VM_FAULT_OOM) { /* Kernel mode? Handle exceptions or die: */ @@ -866,7 +858,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, up_read(¤t->mm->mmap_sem); no_context(regs, error_code, address, SIGSEGV, SEGV_MAPERR); - return 1; + return; } up_read(¤t->mm->mmap_sem); @@ -884,7 +876,6 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, else BUG(); } - return 1; } static int spurious_fault_check(unsigned long error_code, pte_t *pte) @@ -1011,9 +1002,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) unsigned long address; struct mm_struct *mm; int fault; - int write = error_code & PF_WRITE; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (write ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; tsk = current; mm = tsk->mm; @@ -1059,7 +1048,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) return; /* kprobes don't want to hook the spurious faults: */ - if (notify_page_fault(regs)) + if (kprobes_fault(regs)) return; /* * Don't take the mm semaphore here. If we fixup a prefetch @@ -1071,22 +1060,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) } /* kprobes don't want to hook the spurious faults: */ - if (unlikely(notify_page_fault(regs))) + if (unlikely(kprobes_fault(regs))) return; - /* - * It's safe to allow irq's after cr2 has been saved and the - * vmalloc fault has been handled. - * - * User-mode registers count as a user access even for any - * potential system fault or CPU buglet: - */ - if (user_mode_vm(regs)) { - local_irq_enable(); - error_code |= PF_USER; - } else { - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); - } if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); @@ -1098,8 +1073,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) } } - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - /* * If we're in an interrupt, have no user context or are running * in an atomic region then we must not take the fault: @@ -1110,6 +1083,27 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) } /* + * It's safe to allow irq's after cr2 has been saved and the + * vmalloc fault has been handled. + * + * User-mode registers count as a user access even for any + * potential system fault or CPU buglet: + */ + if (user_mode_vm(regs)) { + local_irq_enable(); + error_code |= PF_USER; + flags |= FAULT_FLAG_USER; + } else { + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); + } + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + + if (error_code & PF_WRITE) + flags |= FAULT_FLAG_WRITE; + + /* * When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in * the kernel and should generate an OOPS. Unfortunately, in the @@ -1187,9 +1181,17 @@ good_area: */ fault = handle_mm_fault(mm, vma, address, flags); - if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { - if (mm_fault_error(regs, error_code, address, fault)) - return; + /* + * If we need to retry but a fatal signal is pending, handle the + * signal first. We do not need to release the mmap_sem because it + * would already be released in __lock_page_or_retry in mm/filemap.c. + */ + if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))) + return; + + if (unlikely(fault & VM_FAULT_ERROR)) { + mm_fault_error(regs, error_code, address, fault); + return; } /* diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 7e73e8c6909..9d980d88b74 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -59,6 +59,10 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; } +int pmd_huge_support(void) +{ + return 0; +} #else struct page * @@ -77,6 +81,10 @@ int pud_huge(pud_t pud) return !!(pud_val(pud) & _PAGE_PSE); } +int pmd_huge_support(void) +{ + return 1; +} #endif /* x86_64 also uses this file */ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 04664cdb7fd..ce32017c5e3 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -399,8 +399,25 @@ static unsigned long __init init_range_memory_mapping( return mapped_ram_size; } -/* (PUD_SHIFT-PMD_SHIFT)/2 */ -#define STEP_SIZE_SHIFT 5 +static unsigned long __init get_new_step_size(unsigned long step_size) +{ + /* + * Explain why we shift by 5 and why we don't have to worry about + * 'step_size << 5' overflowing: + * + * initial mapped size is PMD_SIZE (2M). + * We can not set step_size to be PUD_SIZE (1G) yet. + * In worse case, when we cross the 1G boundary, and + * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k) + * to map 1G range with PTE. Use 5 as shift for now. + * + * Don't need to worry about overflow, on 32bit, when step_size + * is 0, round_down() returns 0 for start, and that turns it + * into 0x100000000ULL. + */ + return step_size << 5; +} + void __init init_mem_mapping(void) { unsigned long end, real_end, start, last_start; @@ -449,7 +466,7 @@ void __init init_mem_mapping(void) min_pfn_mapped = last_start >> PAGE_SHIFT; /* only increase step_size after big range get mapped */ if (new_mapped_ram_size > mapped_ram_size) - step_size <<= STEP_SIZE_SHIFT; + step_size = get_new_step_size(step_size); mapped_ram_size += new_mapped_ram_size; } diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 0215e2c563e..799580cabc7 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -487,7 +487,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) unsigned long offset; resource_size_t last_addr; unsigned int nrpages; - enum fixed_addresses idx0, idx; + enum fixed_addresses idx; int i, slot; WARN_ON(system_state != SYSTEM_BOOTING); @@ -540,8 +540,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) /* * Ok, go for it.. */ - idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; - idx = idx0; + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; while (nrpages > 0) { early_set_fixmap(idx, phys_addr, prot); phys_addr += PAGE_SIZE; diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index cdd0da9dd53..266ca912f62 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -146,6 +146,7 @@ int __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { u64 start, end; + u32 hotpluggable; int node, pxm; if (srat_disabled()) @@ -154,7 +155,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) goto out_err_bad_srat; if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) goto out_err; - if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) + hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE; + if (hotpluggable && !save_add_info()) goto out_err; start = ma->base_address; @@ -174,9 +176,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) node_set(node, numa_nodes_parsed); - printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", - node, pxm, - (unsigned long long) start, (unsigned long long) end - 1); + pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s\n", + node, pxm, + (unsigned long long) start, (unsigned long long) end - 1, + hotpluggable ? " hotplug" : ""); return 0; out_err_bad_srat: diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 282375f13c7..ae699b3bbac 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -103,6 +103,7 @@ static void flush_tlb_func(void *info) if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) return; + count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { if (f->flush_end == TLB_FLUSH_ALL) local_flush_tlb(); @@ -130,6 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, info.flush_start = start; info.flush_end = end; + count_vm_event(NR_TLB_REMOTE_FLUSH); if (is_uv_system()) { unsigned int cpu; @@ -149,6 +151,7 @@ void flush_tlb_current_task(void) preempt_disable(); + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); @@ -211,16 +214,19 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm; /* tlb_flushall_shift is on balance point, details in commit log */ - if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) + if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) { + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); - else { + } else { if (has_large_page(mm, start, end)) { local_flush_tlb(); goto flush_all; } /* flush range by one by one 'invlpg' */ - for (addr = start; addr < end; addr += PAGE_SIZE) + for (addr = start; addr < end; addr += PAGE_SIZE) { + count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); + } if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) @@ -256,6 +262,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) static void do_flush_tlb_all(void *info) { + count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); __flush_tlb_all(); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(smp_processor_id()); @@ -263,6 +270,7 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { + count_vm_event(NR_TLB_REMOTE_FLUSH); on_each_cpu(do_flush_tlb_all, NULL, 1); } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 79c216aa0e2..516593e1ce3 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -772,13 +772,21 @@ out: return; } +static void bpf_jit_free_deferred(struct work_struct *work) +{ + struct sk_filter *fp = container_of(work, struct sk_filter, work); + unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; + struct bpf_binary_header *header = (void *)addr; + + set_memory_rw(addr, header->pages); + module_free(NULL, header); + kfree(fp); +} + void bpf_jit_free(struct sk_filter *fp) { if (fp->bpf_func != sk_run_filter) { - unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; - struct bpf_binary_header *header = (void *)addr; - - set_memory_rw(addr, header->pages); - module_free(NULL, header); + INIT_WORK(&fp->work, bpf_jit_free_deferred); + schedule_work(&fp->work); } } diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index d6aa6e8315d..5d04be5efb6 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -47,7 +47,7 @@ dump_user_backtrace_32(struct stack_frame_ia32 *head) unsigned long bytes; bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead)); - if (bytes != sizeof(bufhead)) + if (bytes != 0) return NULL; fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame); @@ -93,7 +93,7 @@ static struct stack_frame *dump_user_backtrace(struct stack_frame *head) unsigned long bytes; bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead)); - if (bytes != sizeof(bufhead)) + if (bytes != 0) return NULL; oprofile_add_trace(bufhead[0].return_address); diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 48768df2471..6890d8498e0 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -403,7 +403,7 @@ static void nmi_cpu_down(void *dummy) nmi_cpu_shutdown(dummy); } -static int nmi_create_files(struct super_block *sb, struct dentry *root) +static int nmi_create_files(struct dentry *root) { unsigned int i; @@ -420,14 +420,14 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) continue; snprintf(buf, sizeof(buf), "%d", i); - dir = oprofilefs_mkdir(sb, root, buf); - oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled); - oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event); - oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count); - oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); - oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); - oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); - oprofilefs_create_ulong(sb, dir, "extra", &counter_config[i].extra); + dir = oprofilefs_mkdir(root, buf); + oprofilefs_create_ulong(dir, "enabled", &counter_config[i].enabled); + oprofilefs_create_ulong(dir, "event", &counter_config[i].event); + oprofilefs_create_ulong(dir, "count", &counter_config[i].count); + oprofilefs_create_ulong(dir, "unit_mask", &counter_config[i].unit_mask); + oprofilefs_create_ulong(dir, "kernel", &counter_config[i].kernel); + oprofilefs_create_ulong(dir, "user", &counter_config[i].user); + oprofilefs_create_ulong(dir, "extra", &counter_config[i].extra); } return 0; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index b2b94438ff0..50d86c0e9ba 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -454,16 +454,16 @@ static void init_ibs(void) printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps); } -static int (*create_arch_files)(struct super_block *sb, struct dentry *root); +static int (*create_arch_files)(struct dentry *root); -static int setup_ibs_files(struct super_block *sb, struct dentry *root) +static int setup_ibs_files(struct dentry *root) { struct dentry *dir; int ret = 0; /* architecture specific files */ if (create_arch_files) - ret = create_arch_files(sb, root); + ret = create_arch_files(root); if (ret) return ret; @@ -479,26 +479,26 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) ibs_config.max_cnt_op = 250000; if (ibs_caps & IBS_CAPS_FETCHSAM) { - dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); - oprofilefs_create_ulong(sb, dir, "enable", + dir = oprofilefs_mkdir(root, "ibs_fetch"); + oprofilefs_create_ulong(dir, "enable", &ibs_config.fetch_enabled); - oprofilefs_create_ulong(sb, dir, "max_count", + oprofilefs_create_ulong(dir, "max_count", &ibs_config.max_cnt_fetch); - oprofilefs_create_ulong(sb, dir, "rand_enable", + oprofilefs_create_ulong(dir, "rand_enable", &ibs_config.rand_en); } if (ibs_caps & IBS_CAPS_OPSAM) { - dir = oprofilefs_mkdir(sb, root, "ibs_op"); - oprofilefs_create_ulong(sb, dir, "enable", + dir = oprofilefs_mkdir(root, "ibs_op"); + oprofilefs_create_ulong(dir, "enable", &ibs_config.op_enabled); - oprofilefs_create_ulong(sb, dir, "max_count", + oprofilefs_create_ulong(dir, "max_count", &ibs_config.max_cnt_op); if (ibs_caps & IBS_CAPS_OPCNT) - oprofilefs_create_ulong(sb, dir, "dispatched_ops", + oprofilefs_create_ulong(dir, "dispatched_ops", &ibs_config.dispatched_ops); if (ibs_caps & IBS_CAPS_BRNTRGT) - oprofilefs_create_ulong(sb, dir, "branch_target", + oprofilefs_create_ulong(dir, "branch_target", &ibs_config.branch_target); } diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index ee0af58ca5b..e063eed0f91 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -18,7 +18,7 @@ obj-$(CONFIG_X86_VISWS) += visws.o obj-$(CONFIG_X86_NUMAQ) += numaq_32.o obj-$(CONFIG_X86_NUMACHIP) += numachip.o -obj-$(CONFIG_X86_INTEL_MID) += mrst.o +obj-$(CONFIG_X86_INTEL_MID) += intel_mid_pci.o obj-y += common.o early.o obj-y += bus_numa.o diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index d641897a1f4..b30e937689d 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -568,13 +568,8 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) */ if (bus) { struct pci_bus *child; - list_for_each_entry(child, &bus->children, node) { - struct pci_dev *self = child->self; - if (!self) - continue; - - pcie_bus_configure_settings(child, self->pcie_mpss); - } + list_for_each_entry(child, &bus->children, node) + pcie_bus_configure_settings(child); } if (bus && node != -1) { diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 94919e307f8..db6b1ab4325 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -210,6 +210,8 @@ static void pcibios_allocate_bridge_resources(struct pci_dev *dev) r = &dev->resource[idx]; if (!r->flags) continue; + if (r->parent) /* Already allocated */ + continue; if (!r->start || pci_claim_resource(dev, idx) < 0) { /* * Something is wrong with the region. @@ -318,6 +320,8 @@ static void pcibios_allocate_dev_rom_resource(struct pci_dev *dev) r = &dev->resource[PCI_ROM_RESOURCE]; if (!r->flags || !r->start) return; + if (r->parent) /* Already allocated */ + return; if (pci_claim_resource(dev, PCI_ROM_RESOURCE) < 0) { r->end -= r->start; diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/intel_mid_pci.c index 6eb18c42a28..51384ca727a 100644 --- a/arch/x86/pci/mrst.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -1,5 +1,5 @@ /* - * Moorestown PCI support + * Intel MID PCI support * Copyright (c) 2008 Intel Corporation * Jesse Barnes <jesse.barnes@intel.com> * @@ -23,11 +23,11 @@ #include <linux/ioport.h> #include <linux/init.h> #include <linux/dmi.h> +#include <linux/acpi.h> +#include <linux/io.h> +#include <linux/smp.h> -#include <asm/acpi.h> #include <asm/segment.h> -#include <asm/io.h> -#include <asm/smp.h> #include <asm/pci_x86.h> #include <asm/hw_irq.h> #include <asm/io_apic.h> @@ -43,7 +43,7 @@ #define PCI_FIXED_BAR_4_SIZE 0x14 #define PCI_FIXED_BAR_5_SIZE 0x1c -static int pci_soc_mode = 0; +static int pci_soc_mode; /** * fixed_bar_cap - return the offset of the fixed BAR cap if found @@ -141,7 +141,8 @@ static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn, */ static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg) { - /* This is a workaround for A0 LNC bug where PCI status register does + /* + * This is a workaround for A0 LNC bug where PCI status register does * not have new CAP bit set. can not be written by SW either. * * PCI header type in real LNC indicates a single function device, this @@ -149,12 +150,12 @@ static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg) * shim. Therefore, use the header type in shim instead. */ if (reg >= 0x100 || reg == PCI_STATUS || reg == PCI_HEADER_TYPE) - return 0; + return false; if (bus == 0 && (devfn == PCI_DEVFN(2, 0) || devfn == PCI_DEVFN(0, 0) || devfn == PCI_DEVFN(3, 0))) - return 1; - return 0; /* langwell on others */ + return true; + return false; /* Langwell on others */ } static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, @@ -172,7 +173,8 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, { int offset; - /* On MRST, there is no PCI ROM BAR, this will cause a subsequent read + /* + * On MRST, there is no PCI ROM BAR, this will cause a subsequent read * to ROM BAR return 0 then being ignored. */ if (where == PCI_ROM_ADDRESS) @@ -203,14 +205,15 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, where, size, value); } -static int mrst_pci_irq_enable(struct pci_dev *dev) +static int intel_mid_pci_irq_enable(struct pci_dev *dev) { u8 pin; struct io_apic_irq_attr irq_attr; pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); - /* MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to + /* + * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to * IOAPIC RTE entries, so we just enable RTE for the device. */ irq_attr.ioapic = mp_find_ioapic(dev->irq); @@ -222,39 +225,43 @@ static int mrst_pci_irq_enable(struct pci_dev *dev) return 0; } -struct pci_ops pci_mrst_ops = { +struct pci_ops intel_mid_pci_ops = { .read = pci_read, .write = pci_write, }; /** - * pci_mrst_init - installs pci_mrst_ops + * intel_mid_pci_init - installs intel_mid_pci_ops * * Moorestown has an interesting PCI implementation (see above). * Called when the early platform detection installs it. */ -int __init pci_mrst_init(void) +int __init intel_mid_pci_init(void) { - printk(KERN_INFO "Intel MID platform detected, using MID PCI ops\n"); + pr_info("Intel MID platform detected, using MID PCI ops\n"); pci_mmcfg_late_init(); - pcibios_enable_irq = mrst_pci_irq_enable; - pci_root_ops = pci_mrst_ops; + pcibios_enable_irq = intel_mid_pci_irq_enable; + pci_root_ops = intel_mid_pci_ops; pci_soc_mode = 1; /* Continue with standard init */ return 1; } -/* Langwell devices are not true pci devices, they are not subject to 10 ms - * d3 to d0 delay required by pci spec. +/* + * Langwell devices are not true PCI devices; they are not subject to 10 ms + * d3 to d0 delay required by PCI spec. */ static void pci_d3delay_fixup(struct pci_dev *dev) { - /* PCI fixups are effectively decided compile time. If we have a dual - SoC/non-SoC kernel we don't want to mangle d3 on non SoC devices */ - if (!pci_soc_mode) - return; - /* true pci devices in lincroft should allow type 1 access, the rest - * are langwell fake pci devices. + /* + * PCI fixups are effectively decided compile time. If we have a dual + * SoC/non-SoC kernel we don't want to mangle d3 on non-SoC devices. + */ + if (!pci_soc_mode) + return; + /* + * True PCI devices in Lincroft should allow type 1 access, the rest + * are Langwell fake PCI devices. */ if (type1_access_ok(dev->bus->number, dev->devfn, PCI_DEVICE_ID)) return; diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index 01e0231a113..20342d4c82c 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -4,7 +4,7 @@ obj-y += efi/ obj-y += geode/ obj-y += goldfish/ obj-y += iris/ -obj-y += mrst/ +obj-y += intel-mid/ obj-y += olpc/ obj-y += scx200/ obj-y += sfi/ diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile index 6db1cc4c753..b7b0b35c198 100644 --- a/arch/x86/platform/efi/Makefile +++ b/arch/x86/platform/efi/Makefile @@ -1,2 +1,3 @@ obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o +obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o diff --git a/arch/x86/platform/efi/early_printk.c b/arch/x86/platform/efi/early_printk.c new file mode 100644 index 00000000000..6599a0027b7 --- /dev/null +++ b/arch/x86/platform/efi/early_printk.c @@ -0,0 +1,191 @@ +/* + * Copyright (C) 2013 Intel Corporation; author Matt Fleming + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + */ + +#include <linux/console.h> +#include <linux/efi.h> +#include <linux/font.h> +#include <linux/io.h> +#include <linux/kernel.h> +#include <asm/setup.h> + +static const struct font_desc *font; +static u32 efi_x, efi_y; + +static __init void early_efi_clear_scanline(unsigned int y) +{ + unsigned long base, *dst; + u16 len; + + base = boot_params.screen_info.lfb_base; + len = boot_params.screen_info.lfb_linelength; + + dst = early_ioremap(base + y*len, len); + if (!dst) + return; + + memset(dst, 0, len); + early_iounmap(dst, len); +} + +static __init void early_efi_scroll_up(void) +{ + unsigned long base, *dst, *src; + u16 len; + u32 i, height; + + base = boot_params.screen_info.lfb_base; + len = boot_params.screen_info.lfb_linelength; + height = boot_params.screen_info.lfb_height; + + for (i = 0; i < height - font->height; i++) { + dst = early_ioremap(base + i*len, len); + if (!dst) + return; + + src = early_ioremap(base + (i + font->height) * len, len); + if (!src) { + early_iounmap(dst, len); + return; + } + + memmove(dst, src, len); + + early_iounmap(src, len); + early_iounmap(dst, len); + } +} + +static void early_efi_write_char(u32 *dst, unsigned char c, unsigned int h) +{ + const u32 color_black = 0x00000000; + const u32 color_white = 0x00ffffff; + const u8 *src; + u8 s8; + int m; + + src = font->data + c * font->height; + s8 = *(src + h); + + for (m = 0; m < 8; m++) { + if ((s8 >> (7 - m)) & 1) + *dst = color_white; + else + *dst = color_black; + dst++; + } +} + +static __init void +early_efi_write(struct console *con, const char *str, unsigned int num) +{ + struct screen_info *si; + unsigned long base; + unsigned int len; + const char *s; + void *dst; + + base = boot_params.screen_info.lfb_base; + si = &boot_params.screen_info; + len = si->lfb_linelength; + + while (num) { + unsigned int linemax; + unsigned int h, count = 0; + + for (s = str; *s && *s != '\n'; s++) { + if (count == num) + break; + count++; + } + + linemax = (si->lfb_width - efi_x) / font->width; + if (count > linemax) + count = linemax; + + for (h = 0; h < font->height; h++) { + unsigned int n, x; + + dst = early_ioremap(base + (efi_y + h) * len, len); + if (!dst) + return; + + s = str; + n = count; + x = efi_x; + + while (n-- > 0) { + early_efi_write_char(dst + x*4, *s, h); + x += font->width; + s++; + } + + early_iounmap(dst, len); + } + + num -= count; + efi_x += count * font->width; + str += count; + + if (num > 0 && *s == '\n') { + efi_x = 0; + efi_y += font->height; + str++; + num--; + } + + if (efi_x >= si->lfb_width) { + efi_x = 0; + efi_y += font->height; + } + + if (efi_y + font->height >= si->lfb_height) { + u32 i; + + efi_y -= font->height; + early_efi_scroll_up(); + + for (i = 0; i < font->height; i++) + early_efi_clear_scanline(efi_y + i); + } + } +} + +static __init int early_efi_setup(struct console *con, char *options) +{ + struct screen_info *si; + u16 xres, yres; + u32 i; + + si = &boot_params.screen_info; + xres = si->lfb_width; + yres = si->lfb_height; + + /* + * early_efi_write_char() implicitly assumes a framebuffer with + * 32-bits per pixel. + */ + if (si->lfb_depth != 32) + return -ENODEV; + + font = get_default_font(xres, yres, -1, -1); + if (!font) + return -ENODEV; + + efi_y = rounddown(yres, font->height) - font->height; + for (i = 0; i < (yres - efi_y) / font->height; i++) + early_efi_scroll_up(); + + return 0; +} + +struct console early_efi_console = { + .name = "earlyefi", + .write = early_efi_write, + .setup = early_efi_setup, + .flags = CON_PRINTBUFFER, + .index = -1, +}; diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 90f6ed12709..92c02344a06 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -60,19 +60,6 @@ static efi_char16_t efi_dummy_name[6] = { 'D', 'U', 'M', 'M', 'Y', 0 }; -struct efi __read_mostly efi = { - .mps = EFI_INVALID_TABLE_ADDR, - .acpi = EFI_INVALID_TABLE_ADDR, - .acpi20 = EFI_INVALID_TABLE_ADDR, - .smbios = EFI_INVALID_TABLE_ADDR, - .sal_systab = EFI_INVALID_TABLE_ADDR, - .boot_info = EFI_INVALID_TABLE_ADDR, - .hcdp = EFI_INVALID_TABLE_ADDR, - .uga = EFI_INVALID_TABLE_ADDR, - .uv_systab = EFI_INVALID_TABLE_ADDR, -}; -EXPORT_SYMBOL(efi); - struct efi_memory_map memmap; static struct efi efi_phys __initdata; @@ -80,6 +67,13 @@ static efi_system_table_t efi_systab __initdata; unsigned long x86_efi_facility; +static __initdata efi_config_table_type_t arch_tables[] = { +#ifdef CONFIG_X86_UV + {UV_SYSTEM_TABLE_GUID, "UVsystab", &efi.uv_systab}, +#endif + {NULL_GUID, NULL, NULL}, +}; + /* * Returns 1 if 'facility' is enabled, 0 otherwise. */ @@ -399,6 +393,8 @@ int __init efi_memblock_x86_reserve_range(void) memblock_reserve(pmap, memmap.nr_map * memmap.desc_size); + efi.memmap = &memmap; + return 0; } @@ -578,80 +574,6 @@ static int __init efi_systab_init(void *phys) return 0; } -static int __init efi_config_init(u64 tables, int nr_tables) -{ - void *config_tables, *tablep; - int i, sz; - - if (efi_enabled(EFI_64BIT)) - sz = sizeof(efi_config_table_64_t); - else - sz = sizeof(efi_config_table_32_t); - - /* - * Let's see what config tables the firmware passed to us. - */ - config_tables = early_ioremap(tables, nr_tables * sz); - if (config_tables == NULL) { - pr_err("Could not map Configuration table!\n"); - return -ENOMEM; - } - - tablep = config_tables; - pr_info(""); - for (i = 0; i < efi.systab->nr_tables; i++) { - efi_guid_t guid; - unsigned long table; - - if (efi_enabled(EFI_64BIT)) { - u64 table64; - guid = ((efi_config_table_64_t *)tablep)->guid; - table64 = ((efi_config_table_64_t *)tablep)->table; - table = table64; -#ifdef CONFIG_X86_32 - if (table64 >> 32) { - pr_cont("\n"); - pr_err("Table located above 4GB, disabling EFI.\n"); - early_iounmap(config_tables, - efi.systab->nr_tables * sz); - return -EINVAL; - } -#endif - } else { - guid = ((efi_config_table_32_t *)tablep)->guid; - table = ((efi_config_table_32_t *)tablep)->table; - } - if (!efi_guidcmp(guid, MPS_TABLE_GUID)) { - efi.mps = table; - pr_cont(" MPS=0x%lx ", table); - } else if (!efi_guidcmp(guid, ACPI_20_TABLE_GUID)) { - efi.acpi20 = table; - pr_cont(" ACPI 2.0=0x%lx ", table); - } else if (!efi_guidcmp(guid, ACPI_TABLE_GUID)) { - efi.acpi = table; - pr_cont(" ACPI=0x%lx ", table); - } else if (!efi_guidcmp(guid, SMBIOS_TABLE_GUID)) { - efi.smbios = table; - pr_cont(" SMBIOS=0x%lx ", table); -#ifdef CONFIG_X86_UV - } else if (!efi_guidcmp(guid, UV_SYSTEM_TABLE_GUID)) { - efi.uv_systab = table; - pr_cont(" UVsystab=0x%lx ", table); -#endif - } else if (!efi_guidcmp(guid, HCDP_TABLE_GUID)) { - efi.hcdp = table; - pr_cont(" HCDP=0x%lx ", table); - } else if (!efi_guidcmp(guid, UGA_IO_PROTOCOL_GUID)) { - efi.uga = table; - pr_cont(" UGA=0x%lx ", table); - } - tablep += sz; - } - pr_cont("\n"); - early_iounmap(config_tables, efi.systab->nr_tables * sz); - return 0; -} - static int __init efi_runtime_init(void) { efi_runtime_services_t *runtime; @@ -745,7 +667,7 @@ void __init efi_init(void) efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor); - if (efi_config_init(efi.systab->tables, efi.systab->nr_tables)) + if (efi_config_init(arch_tables)) return; set_bit(EFI_CONFIG_TABLES, &x86_efi_facility); @@ -816,34 +738,6 @@ static void __init runtime_code_page_mkexec(void) } } -/* - * We can't ioremap data in EFI boot services RAM, because we've already mapped - * it as RAM. So, look it up in the existing EFI memory map instead. Only - * callable after efi_enter_virtual_mode and before efi_free_boot_services. - */ -void __iomem *efi_lookup_mapped_addr(u64 phys_addr) -{ - void *p; - if (WARN_ON(!memmap.map)) - return NULL; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; - u64 size = md->num_pages << EFI_PAGE_SHIFT; - u64 end = md->phys_addr + size; - if (!(md->attribute & EFI_MEMORY_RUNTIME) && - md->type != EFI_BOOT_SERVICES_CODE && - md->type != EFI_BOOT_SERVICES_DATA) - continue; - if (!md->virt_addr) - continue; - if (phys_addr >= md->phys_addr && phys_addr < end) { - phys_addr += md->virt_addr - md->phys_addr; - return (__force void __iomem *)(unsigned long)phys_addr; - } - } - return NULL; -} - void efi_memory_uc(u64 addr, unsigned long size) { unsigned long page_shift = 1UL << EFI_PAGE_SHIFT; @@ -912,10 +806,13 @@ void __init efi_enter_virtual_mode(void) for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; - if (!(md->attribute & EFI_MEMORY_RUNTIME) && - md->type != EFI_BOOT_SERVICES_CODE && - md->type != EFI_BOOT_SERVICES_DATA) - continue; + if (!(md->attribute & EFI_MEMORY_RUNTIME)) { +#ifdef CONFIG_X86_64 + if (md->type != EFI_BOOT_SERVICES_CODE && + md->type != EFI_BOOT_SERVICES_DATA) +#endif + continue; + } size = md->num_pages << EFI_PAGE_SHIFT; end = md->phys_addr + size; diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c index 90e23e7679a..76b6632d314 100644 --- a/arch/x86/platform/geode/alix.c +++ b/arch/x86/platform/geode/alix.c @@ -98,7 +98,7 @@ static struct platform_device alix_leds_dev = { .dev.platform_data = &alix_leds_data, }; -static struct __initdata platform_device *alix_devs[] = { +static struct platform_device *alix_devs[] __initdata = { &alix_buttons_dev, &alix_leds_dev, }; diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c index c2e6d53558b..aa733fba247 100644 --- a/arch/x86/platform/geode/geos.c +++ b/arch/x86/platform/geode/geos.c @@ -87,7 +87,7 @@ static struct platform_device geos_leds_dev = { .dev.platform_data = &geos_leds_data, }; -static struct __initdata platform_device *geos_devs[] = { +static struct platform_device *geos_devs[] __initdata = { &geos_buttons_dev, &geos_leds_dev, }; diff --git a/arch/x86/platform/geode/net5501.c b/arch/x86/platform/geode/net5501.c index 646e3b5b4bb..927e38c0089 100644 --- a/arch/x86/platform/geode/net5501.c +++ b/arch/x86/platform/geode/net5501.c @@ -78,7 +78,7 @@ static struct platform_device net5501_leds_dev = { .dev.platform_data = &net5501_leds_data, }; -static struct __initdata platform_device *net5501_devs[] = { +static struct platform_device *net5501_devs[] __initdata = { &net5501_buttons_dev, &net5501_leds_dev, }; diff --git a/arch/x86/platform/intel-mid/Makefile b/arch/x86/platform/intel-mid/Makefile new file mode 100644 index 00000000000..01cc29ea5ff --- /dev/null +++ b/arch/x86/platform/intel-mid/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o +obj-$(CONFIG_X86_INTEL_MID) += intel_mid_vrtc.o +obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_intel_mid.o +# SFI specific code +ifdef CONFIG_X86_INTEL_MID +obj-$(CONFIG_SFI) += sfi.o device_libs/ +endif diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile new file mode 100644 index 00000000000..097e7a7940d --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/Makefile @@ -0,0 +1,22 @@ +# IPC Devices +obj-y += platform_ipc.o +obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic.o +obj-$(subst m,y,$(CONFIG_SND_MFLD_MACHINE)) += platform_msic_audio.o +obj-$(subst m,y,$(CONFIG_GPIO_MSIC)) += platform_msic_gpio.o +obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic_ocd.o +obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic_battery.o +obj-$(subst m,y,$(CONFIG_INTEL_MID_POWER_BUTTON)) += platform_msic_power_btn.o +obj-$(subst m,y,$(CONFIG_GPIO_INTEL_PMIC)) += platform_pmic_gpio.o +obj-$(subst m,y,$(CONFIG_INTEL_MFLD_THERMAL)) += platform_msic_thermal.o +# I2C Devices +obj-$(subst m,y,$(CONFIG_SENSORS_EMC1403)) += platform_emc1403.o +obj-$(subst m,y,$(CONFIG_SENSORS_LIS3LV02D)) += platform_lis331.o +obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_max7315.o +obj-$(subst m,y,$(CONFIG_INPUT_MPU3050)) += platform_mpu3050.o +obj-$(subst m,y,$(CONFIG_INPUT_BMA150)) += platform_bma023.o +obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_tca6416.o +obj-$(subst m,y,$(CONFIG_DRM_MEDFIELD)) += platform_tc35876x.o +# SPI Devices +obj-$(subst m,y,$(CONFIG_SERIAL_MRST_MAX3110)) += platform_max3111.o +# MISC Devices +obj-$(subst m,y,$(CONFIG_KEYBOARD_GPIO)) += platform_gpio_keys.o diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bma023.c b/arch/x86/platform/intel-mid/device_libs/platform_bma023.c new file mode 100644 index 00000000000..0ae7f2ae229 --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_bma023.c @@ -0,0 +1,20 @@ +/* + * platform_bma023.c: bma023 platform data initilization file + * + * (C) Copyright 2013 Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <asm/intel-mid.h> + +static const struct devs_id bma023_dev_id __initconst = { + .name = "bma023", + .type = SFI_DEV_TYPE_I2C, + .delay = 1, +}; + +sfi_device(bma023_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c new file mode 100644 index 00000000000..0d942c1d26d --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c @@ -0,0 +1,41 @@ +/* + * platform_emc1403.c: emc1403 platform data initilization file + * + * (C) Copyright 2013 Intel Corporation + * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/init.h> +#include <linux/gpio.h> +#include <linux/i2c.h> +#include <asm/intel-mid.h> + +static void __init *emc1403_platform_data(void *info) +{ + static short intr2nd_pdata; + struct i2c_board_info *i2c_info = info; + int intr = get_gpio_by_name("thermal_int"); + int intr2nd = get_gpio_by_name("thermal_alert"); + + if (intr == -1 || intr2nd == -1) + return NULL; + + i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; + intr2nd_pdata = intr2nd + INTEL_MID_IRQ_OFFSET; + + return &intr2nd_pdata; +} + +static const struct devs_id emc1403_dev_id __initconst = { + .name = "emc1403", + .type = SFI_DEV_TYPE_I2C, + .delay = 1, + .get_platform_data = &emc1403_platform_data, +}; + +sfi_device(emc1403_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c new file mode 100644 index 00000000000..a013a4834bb --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c @@ -0,0 +1,83 @@ +/* + * platform_gpio_keys.c: gpio_keys platform data initilization file + * + * (C) Copyright 2013 Intel Corporation + * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/input.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/gpio.h> +#include <linux/gpio_keys.h> +#include <linux/platform_device.h> +#include <asm/intel-mid.h> + +#define DEVICE_NAME "gpio-keys" + +/* + * we will search these buttons in SFI GPIO table (by name) + * and register them dynamically. Please add all possible + * buttons here, we will shrink them if no GPIO found. + */ +static struct gpio_keys_button gpio_button[] = { + {KEY_POWER, -1, 1, "power_btn", EV_KEY, 0, 3000}, + {KEY_PROG1, -1, 1, "prog_btn1", EV_KEY, 0, 20}, + {KEY_PROG2, -1, 1, "prog_btn2", EV_KEY, 0, 20}, + {SW_LID, -1, 1, "lid_switch", EV_SW, 0, 20}, + {KEY_VOLUMEUP, -1, 1, "vol_up", EV_KEY, 0, 20}, + {KEY_VOLUMEDOWN, -1, 1, "vol_down", EV_KEY, 0, 20}, + {KEY_CAMERA, -1, 1, "camera_full", EV_KEY, 0, 20}, + {KEY_CAMERA_FOCUS, -1, 1, "camera_half", EV_KEY, 0, 20}, + {SW_KEYPAD_SLIDE, -1, 1, "MagSw1", EV_SW, 0, 20}, + {SW_KEYPAD_SLIDE, -1, 1, "MagSw2", EV_SW, 0, 20}, +}; + +static struct gpio_keys_platform_data gpio_keys = { + .buttons = gpio_button, + .rep = 1, + .nbuttons = -1, /* will fill it after search */ +}; + +static struct platform_device pb_device = { + .name = DEVICE_NAME, + .id = -1, + .dev = { + .platform_data = &gpio_keys, + }, +}; + +/* + * Shrink the non-existent buttons, register the gpio button + * device if there is some + */ +static int __init pb_keys_init(void) +{ + struct gpio_keys_button *gb = gpio_button; + int i, num, good = 0; + + num = sizeof(gpio_button) / sizeof(struct gpio_keys_button); + for (i = 0; i < num; i++) { + gb[i].gpio = get_gpio_by_name(gb[i].desc); + pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc, + gb[i].gpio); + if (gb[i].gpio == -1) + continue; + + if (i != good) + gb[good] = gb[i]; + good++; + } + + if (good) { + gpio_keys.nbuttons = good; + return platform_device_register(&pb_device); + } + return 0; +} +late_initcall(pb_keys_init); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_ipc.c b/arch/x86/platform/intel-mid/device_libs/platform_ipc.c new file mode 100644 index 00000000000..a84b73d6c4a --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_ipc.c @@ -0,0 +1,68 @@ +/* + * platform_ipc.c: IPC platform library file + * + * (C) Copyright 2013 Intel Corporation + * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/sfi.h> +#include <linux/gpio.h> +#include <asm/intel-mid.h> +#include "platform_ipc.h" + +void __init ipc_device_handler(struct sfi_device_table_entry *pentry, + struct devs_id *dev) +{ + struct platform_device *pdev; + void *pdata = NULL; + static struct resource res __initdata = { + .name = "IRQ", + .flags = IORESOURCE_IRQ, + }; + + pr_debug("IPC bus, name = %16.16s, irq = 0x%2x\n", + pentry->name, pentry->irq); + + /* + * We need to call platform init of IPC devices to fill misc_pdata + * structure. It will be used in msic_init for initialization. + */ + if (dev != NULL) + pdata = dev->get_platform_data(pentry); + + /* + * On Medfield the platform device creation is handled by the MSIC + * MFD driver so we don't need to do it here. + */ + if (intel_mid_has_msic()) + return; + + pdev = platform_device_alloc(pentry->name, 0); + if (pdev == NULL) { + pr_err("out of memory for SFI platform device '%s'.\n", + pentry->name); + return; + } + res.start = pentry->irq; + platform_device_add_resources(pdev, &res, 1); + + pdev->dev.platform_data = pdata; + intel_scu_device_register(pdev); +} + +static const struct devs_id pmic_audio_dev_id __initconst = { + .name = "pmic_audio", + .type = SFI_DEV_TYPE_IPC, + .delay = 1, + .device_handler = &ipc_device_handler, +}; + +sfi_device(pmic_audio_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_ipc.h b/arch/x86/platform/intel-mid/device_libs/platform_ipc.h new file mode 100644 index 00000000000..8f568dd7960 --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_ipc.h @@ -0,0 +1,17 @@ +/* + * platform_ipc.h: IPC platform library header file + * + * (C) Copyright 2013 Intel Corporation + * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#ifndef _PLATFORM_IPC_H_ +#define _PLATFORM_IPC_H_ + +extern void __init ipc_device_handler(struct sfi_device_table_entry *pentry, + struct devs_id *dev) __attribute__((weak)); +#endif diff --git a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c new file mode 100644 index 00000000000..15278c11f71 --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c @@ -0,0 +1,39 @@ +/* + * platform_lis331.c: lis331 platform data initilization file + * + * (C) Copyright 2013 Intel Corporation + * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/i2c.h> +#include <linux/gpio.h> +#include <asm/intel-mid.h> + +static void __init *lis331dl_platform_data(void *info) +{ + static short intr2nd_pdata; + struct i2c_board_info *i2c_info = info; + int intr = get_gpio_by_name("accel_int"); + int intr2nd = get_gpio_by_name("accel_2"); + + if (intr == -1 || intr2nd == -1) + return NULL; + + i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; + intr2nd_pdata = intr2nd + INTEL_MID_IRQ_OFFSET; + + return &intr2nd_pdata; +} + +static const struct devs_id lis331dl_dev_id __initconst = { + .name = "i2c_accel", + .type = SFI_DEV_TYPE_I2C, + .get_platform_data = &lis331dl_platform_data, +}; + +sfi_device(lis331dl_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max3111.c b/arch/x86/platform/intel-mid/device_libs/platform_max3111.c new file mode 100644 index 00000000000..afd1df94e0e --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_max3111.c @@ -0,0 +1,35 @@ +/* + * platform_max3111.c: max3111 platform data initilization file + * + * (C) Copyright 2013 Intel Corporation + * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/gpio.h> +#include <linux/spi/spi.h> +#include <asm/intel-mid.h> + +static void __init *max3111_platform_data(void *info) +{ + struct spi_board_info *spi_info = info; + int intr = get_gpio_by_name("max3111_int"); + + spi_info->mode = SPI_MODE_0; + if (intr == -1) + return NULL; + spi_info->irq = intr + INTEL_MID_IRQ_OFFSET; + return NULL; +} + +static const struct devs_id max3111_dev_id __initconst = { + .name = "spi_max3111", + .type = SFI_DEV_TYPE_SPI, + .get_platform_data = &max3111_platform_data, +}; + +sfi_device(max3111_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c new file mode 100644 index 00000000000..94ade10024a --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c @@ -0,0 +1,79 @@ +/* + * platform_max7315.c: max7315 platform data initilization file + * + * (C) Copyright 2013 Intel Corporation + * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/init.h> +#include <linux/gpio.h> +#include <linux/i2c.h> +#include <linux/platform_data/pca953x.h> +#include <asm/intel-mid.h> + +#define MAX7315_NUM 2 + +static void __init *max7315_platform_data(void *info) +{ + static struct pca953x_platform_data max7315_pdata[MAX7315_NUM]; + static int nr; + struct pca953x_platform_data *max7315 = &max7315_pdata[nr]; + struct i2c_board_info *i2c_info = info; + int gpio_base, intr; + char base_pin_name[SFI_NAME_LEN + 1]; + char intr_pin_name[SFI_NAME_LEN + 1]; + + if (nr == MAX7315_NUM) { + pr_err("too many max7315s, we only support %d\n", + MAX7315_NUM); + return NULL; + } + /* we have several max7315 on the board, we only need load several + * instances of the same pca953x driver to cover them + */ + strcpy(i2c_info->type, "max7315"); + if (nr++) { + sprintf(base_pin_name, "max7315_%d_base", nr); + sprintf(intr_pin_name, "max7315_%d_int", nr); + } else { + strcpy(base_pin_name, "max7315_base"); + strcpy(intr_pin_name, "max7315_int"); + } + + gpio_base = get_gpio_by_name(base_pin_name); + intr = get_gpio_by_name(intr_pin_name); + + if (gpio_base == -1) + return NULL; + max7315->gpio_base = gpio_base; + if (intr != -1) { + i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; + max7315->irq_base = gpio_base + INTEL_MID_IRQ_OFFSET; + } else { + i2c_info->irq = -1; + max7315->irq_base = -1; + } + return max7315; +} + +static const struct devs_id max7315_dev_id __initconst = { + .name = "i2c_max7315", + .type = SFI_DEV_TYPE_I2C, + .delay = 1, + .get_platform_data = &max7315_platform_data, +}; + +static const struct devs_id max7315_2_dev_id __initconst = { + .name = "i2c_max7315_2", + .type = SFI_DEV_TYPE_I2C, + .delay = 1, + .get_platform_data = &max7315_platform_data, +}; + +sfi_device(max7315_dev_id); +sfi_device(max7315_2_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c new file mode 100644 index 00000000000..dd28d63c84f --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c @@ -0,0 +1,36 @@ +/* + * platform_mpu3050.c: mpu3050 platform data initilization file + * + * (C) Copyright 2013 Intel Corporation + * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/gpio.h> +#include <linux/i2c.h> +#include <asm/intel-mid.h> + +static void *mpu3050_platform_data(void *info) +{ + struct i2c_board_info *i2c_info = info; + int intr = get_gpio_by_name("mpu3050_int"); + + if (intr == -1) + return NULL; + + i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; + return NULL; +} + +static const struct devs_id mpu3050_dev_id __initconst = { + .name = "mpu3050", + .type = SFI_DEV_TYPE_I2C, + .delay = 1, + .get_platform_data = &mpu3050_platform_data, +}; + +sfi_device(mpu3050_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic.c b/arch/x86/platform/intel-mid/device_libs/platform_msic.c new file mode 100644 index 00000000000..9f4a775a69d --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic.c @@ -0,0 +1,87 @@ +/* + * platform_msic.c: MSIC platform data initilization file + * + * (C) Copyright 2013 Intel Corporation + * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/scatterlist.h> +#include <linux/init.h> +#include <linux/sfi.h> +#include <linux/mfd/intel_msic.h> +#include <asm/intel_scu_ipc.h> +#include <asm/intel-mid.h> +#include "platform_msic.h" + +struct intel_msic_platform_data msic_pdata; + +static struct resource msic_resources[] = { + { + .start = INTEL_MSIC_IRQ_PHYS_BASE, + .end = INTEL_MSIC_IRQ_PHYS_BASE + 64 - 1, + .flags = IORESOURCE_MEM, + }, +}; + +static struct platform_device msic_device = { + .name = "intel_msic", + .id = -1, + .dev = { + .platform_data = &msic_pdata, + }, + .num_resources = ARRAY_SIZE(msic_resources), + .resource = msic_resources, +}; + +static int msic_scu_status_change(struct notifier_block *nb, + unsigned long code, void *data) +{ + if (code == SCU_DOWN) { + platform_device_unregister(&msic_device); + return 0; + } + + return platform_device_register(&msic_device); +} + +static int __init msic_init(void) +{ + static struct notifier_block msic_scu_notifier = { + .notifier_call = msic_scu_status_change, + }; + + /* + * We need to be sure that the SCU IPC is ready before MSIC device + * can be registered. + */ + if (intel_mid_has_msic()) + intel_scu_notifier_add(&msic_scu_notifier); + + return 0; +} +arch_initcall(msic_init); + +/* + * msic_generic_platform_data - sets generic platform data for the block + * @info: pointer to the SFI device table entry for this block + * @block: MSIC block + * + * Function sets IRQ number from the SFI table entry for given device to + * the MSIC platform data. + */ +void *msic_generic_platform_data(void *info, enum intel_msic_block block) +{ + struct sfi_device_table_entry *entry = info; + + BUG_ON(block < 0 || block >= INTEL_MSIC_BLOCK_LAST); + msic_pdata.irq[block] = entry->irq; + + return NULL; +} diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic.h b/arch/x86/platform/intel-mid/device_libs/platform_msic.h new file mode 100644 index 00000000000..917eb56d77d --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic.h @@ -0,0 +1,19 @@ +/* + * platform_msic.h: MSIC platform data header file + * + * (C) Copyright 2013 Intel Corporation + * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#ifndef _PLATFORM_MSIC_H_ +#define _PLATFORM_MSIC_H_ + +extern struct intel_msic_platform_data msic_pdata; + +extern void *msic_generic_platform_data(void *info, + enum intel_msic_block block) __attribute__((weak)); +#endif diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c new file mode 100644 index 00000000000..29629397d2b --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c @@ -0,0 +1,47 @@ +/* + * platform_msic_audio.c: MSIC audio platform data initilization file + * + * (C) Copyright 2013 Intel Corporation + * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/scatterlist.h> +#include <linux/init.h> +#include <linux/sfi.h> +#include <linux/platform_device.h> +#include <linux/mfd/intel_msic.h> +#include <asm/intel-mid.h> + +#include "platform_msic.h" +#include "platform_ipc.h" + +static void *msic_audio_platform_data(void *info) +{ + struct platform_device *pdev; + + pdev = platform_device_register_simple("sst-platform", -1, NULL, 0); + + if (IS_ERR(pdev)) { + pr_err("failed to create audio platform device\n"); + return NULL; + } + + return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_AUDIO); +} + +static const struct devs_id msic_audio_dev_id __initconst = { + .name = "msic_audio", + .type = SFI_DEV_TYPE_IPC, + .delay = 1, + .get_platform_data = &msic_audio_platform_data, + .device_handler = &ipc_device_handler, +}; + +sfi_device(msic_audio_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c new file mode 100644 index 00000000000..f446c33df1a --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c @@ -0,0 +1,37 @@ +/* + * platform_msic_battery.c: MSIC battery platform data initilization file + * + * (C) Copyright 2013 Intel Corporation + * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/scatterlist.h> +#include <linux/init.h> +#include <linux/sfi.h> +#include <linux/mfd/intel_msic.h> +#include <asm/intel-mid.h> + +#include "platform_msic.h" +#include "platform_ipc.h" + +static void __init *msic_battery_platform_data(void *info) +{ + return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_BATTERY); +} + +static const struct devs_id msic_battery_dev_id __initconst = { + .name = "msic_battery", + .type = SFI_DEV_TYPE_IPC, + .delay = 1, + .get_platform_data = &msic_battery_platform_data, + .device_handler = &ipc_device_handler, +}; + +sfi_device(msic_battery_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c new file mode 100644 index 00000000000..2a4f7b1dd91 --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c @@ -0,0 +1,48 @@ +/* + * platform_msic_gpio.c: MSIC GPIO platform data initilization file + * + * (C) Copyright 2013 Intel Corporation + * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/scatterlist.h> +#include <linux/sfi.h> +#include <linux/init.h> +#include <linux/gpio.h> +#include <linux/mfd/intel_msic.h> +#include <asm/intel-mid.h> + +#include "platform_msic.h" +#include "platform_ipc.h" + +static void __init *msic_gpio_platform_data(void *info) +{ + static struct intel_msic_gpio_pdata msic_gpio_pdata; + + int gpio = get_gpio_by_name("msic_gpio_base"); + + if (gpio < 0) + return NULL; + + msic_gpio_pdata.gpio_base = gpio; + msic_pdata.gpio = &msic_gpio_pdata; + + return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_GPIO); +} + +static const struct devs_id msic_gpio_dev_id __initconst = { + .name = "msic_gpio", + .type = SFI_DEV_TYPE_IPC, + .delay = 1, + .get_platform_data = &msic_gpio_platform_data, + .device_handler = &ipc_device_handler, +}; + +sfi_device(msic_gpio_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c new file mode 100644 index 00000000000..6497111ddb5 --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c @@ -0,0 +1,49 @@ +/* + * platform_msic_ocd.c: MSIC OCD platform data initilization file + * + * (C) Copyright 2013 Intel Corporation + * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/scatterlist.h> +#include <linux/sfi.h> +#include <linux/init.h> +#include <linux/gpio.h> +#include <linux/mfd/intel_msic.h> +#include <asm/intel-mid.h> + +#include "platform_msic.h" +#include "platform_ipc.h" + +static void __init *msic_ocd_platform_data(void *info) +{ + static struct intel_msic_ocd_pdata msic_ocd_pdata; + int gpio; + + gpio = get_gpio_by_name("ocd_gpio"); + + if (gpio < 0) + return NULL; + + msic_ocd_pdata.gpio = gpio; + msic_pdata.ocd = &msic_ocd_pdata; + + return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_OCD); +} + +static const struct devs_id msic_ocd_dev_id __initconst = { + .name = "msic_ocd", + .type = SFI_DEV_TYPE_IPC, + .delay = 1, + .get_platform_data = &msic_ocd_platform_data, + .device_handler = &ipc_device_handler, +}; + +sfi_device(msic_ocd_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c new file mode 100644 index 00000000000..83a3459bc33 --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c @@ -0,0 +1,36 @@ +/* + * platform_msic_power_btn.c: MSIC power btn platform data initilization file + * + * (C) Copyright 2013 Intel Corporation + * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/scatterlist.h> +#include <linux/sfi.h> +#include <linux/init.h> +#include <linux/mfd/intel_msic.h> +#include <asm/intel-mid.h> + +#include "platform_msic.h" +#include "platform_ipc.h" + +static void __init *msic_power_btn_platform_data(void *info) +{ + return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_POWER_BTN); +} + +static const struct devs_id msic_power_btn_dev_id __initconst = { + .name = "msic_power_btn", + .type = SFI_DEV_TYPE_IPC, + .delay = 1, + .get_platform_data = &msic_power_btn_platform_data, + .device_handler = &ipc_device_handler, +}; + +sfi_device(msic_power_btn_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c new file mode 100644 index 00000000000..a351878b96b --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c @@ -0,0 +1,37 @@ +/* + * platform_msic_thermal.c: msic_thermal platform data initilization file + * + * (C) Copyright 2013 Intel Corporation + * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/input.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/gpio.h> +#include <linux/platform_device.h> +#include <linux/mfd/intel_msic.h> +#include <asm/intel-mid.h> + +#include "platform_msic.h" +#include "platform_ipc.h" + +static void __init *msic_thermal_platform_data(void *info) +{ + return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_THERMAL); +} + +static const struct devs_id msic_thermal_dev_id __initconst = { + .name = "msic_thermal", + .type = SFI_DEV_TYPE_IPC, + .delay = 1, + .get_platform_data = &msic_thermal_platform_data, + .device_handler = &ipc_device_handler, +}; + +sfi_device(msic_thermal_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c new file mode 100644 index 00000000000..d87182a0926 --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c @@ -0,0 +1,54 @@ +/* + * platform_pmic_gpio.c: PMIC GPIO platform data initilization file + * + * (C) Copyright 2013 Intel Corporation + * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/scatterlist.h> +#include <linux/gpio.h> +#include <linux/init.h> +#include <linux/sfi.h> +#include <linux/intel_pmic_gpio.h> +#include <asm/intel-mid.h> + +#include "platform_ipc.h" + +static void __init *pmic_gpio_platform_data(void *info) +{ + static struct intel_pmic_gpio_platform_data pmic_gpio_pdata; + int gpio_base = get_gpio_by_name("pmic_gpio_base"); + + if (gpio_base == -1) + gpio_base = 64; + pmic_gpio_pdata.gpio_base = gpio_base; + pmic_gpio_pdata.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET; + pmic_gpio_pdata.gpiointr = 0xffffeff8; + + return &pmic_gpio_pdata; +} + +static const struct devs_id pmic_gpio_spi_dev_id __initconst = { + .name = "pmic_gpio", + .type = SFI_DEV_TYPE_SPI, + .delay = 1, + .get_platform_data = &pmic_gpio_platform_data, +}; + +static const struct devs_id pmic_gpio_ipc_dev_id __initconst = { + .name = "pmic_gpio", + .type = SFI_DEV_TYPE_IPC, + .delay = 1, + .get_platform_data = &pmic_gpio_platform_data, + .device_handler = &ipc_device_handler +}; + +sfi_device(pmic_gpio_spi_dev_id); +sfi_device(pmic_gpio_ipc_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c new file mode 100644 index 00000000000..740fc757050 --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c @@ -0,0 +1,36 @@ +/* + * platform_tc35876x.c: tc35876x platform data initilization file + * + * (C) Copyright 2013 Intel Corporation + * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/gpio.h> +#include <linux/i2c/tc35876x.h> +#include <asm/intel-mid.h> + +/*tc35876x DSI_LVDS bridge chip and panel platform data*/ +static void *tc35876x_platform_data(void *data) +{ + static struct tc35876x_platform_data pdata; + + /* gpio pins set to -1 will not be used by the driver */ + pdata.gpio_bridge_reset = get_gpio_by_name("LCMB_RXEN"); + pdata.gpio_panel_bl_en = get_gpio_by_name("6S6P_BL_EN"); + pdata.gpio_panel_vadd = get_gpio_by_name("EN_VREG_LCD_V3P3"); + + return &pdata; +} + +static const struct devs_id tc35876x_dev_id __initconst = { + .name = "i2c_disp_brig", + .type = SFI_DEV_TYPE_I2C, + .get_platform_data = &tc35876x_platform_data, +}; + +sfi_device(tc35876x_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c new file mode 100644 index 00000000000..22881c9a673 --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c @@ -0,0 +1,57 @@ +/* + * platform_tca6416.c: tca6416 platform data initilization file + * + * (C) Copyright 2013 Intel Corporation + * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/platform_data/pca953x.h> +#include <linux/i2c.h> +#include <linux/gpio.h> +#include <asm/intel-mid.h> + +#define TCA6416_NAME "tca6416" +#define TCA6416_BASE "tca6416_base" +#define TCA6416_INTR "tca6416_int" + +static void *tca6416_platform_data(void *info) +{ + static struct pca953x_platform_data tca6416; + struct i2c_board_info *i2c_info = info; + int gpio_base, intr; + char base_pin_name[SFI_NAME_LEN + 1]; + char intr_pin_name[SFI_NAME_LEN + 1]; + + strcpy(i2c_info->type, TCA6416_NAME); + strcpy(base_pin_name, TCA6416_BASE); + strcpy(intr_pin_name, TCA6416_INTR); + + gpio_base = get_gpio_by_name(base_pin_name); + intr = get_gpio_by_name(intr_pin_name); + + if (gpio_base == -1) + return NULL; + tca6416.gpio_base = gpio_base; + if (intr != -1) { + i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; + tca6416.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET; + } else { + i2c_info->irq = -1; + tca6416.irq_base = -1; + } + return &tca6416; +} + +static const struct devs_id tca6416_dev_id __initconst = { + .name = "tca6416", + .type = SFI_DEV_TYPE_I2C, + .delay = 1, + .get_platform_data = &tca6416_platform_data, +}; + +sfi_device(tca6416_dev_id); diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/intel-mid/early_printk_intel_mid.c index 028454f0c3a..4f702f554f6 100644 --- a/arch/x86/platform/mrst/early_printk_mrst.c +++ b/arch/x86/platform/intel-mid/early_printk_intel_mid.c @@ -1,5 +1,5 @@ /* - * early_printk_mrst.c - early consoles for Intel MID platforms + * early_printk_intel_mid.c - early consoles for Intel MID platforms * * Copyright (c) 2008-2010, Intel Corporation * @@ -27,7 +27,7 @@ #include <asm/fixmap.h> #include <asm/pgtable.h> -#include <asm/mrst.h> +#include <asm/intel-mid.h> #define MRST_SPI_TIMEOUT 0x200000 #define MRST_REGBASE_SPI0 0xff128000 @@ -152,7 +152,7 @@ void mrst_early_console_init(void) spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9; freq = 100000000 / (spi0_cdiv + 1); - if (mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL) + if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_PENWELL) mrst_spi_paddr = MRST_REGBASE_SPI1; pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, @@ -213,13 +213,14 @@ static void early_mrst_spi_putc(char c) } if (!timeout) - pr_warning("MRST earlycon: timed out\n"); + pr_warn("MRST earlycon: timed out\n"); else max3110_write_data(c); } /* Early SPI only uses polling mode */ -static void early_mrst_spi_write(struct console *con, const char *str, unsigned n) +static void early_mrst_spi_write(struct console *con, const char *str, + unsigned n) { int i; diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c new file mode 100644 index 00000000000..f90e290f689 --- /dev/null +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -0,0 +1,213 @@ +/* + * intel-mid.c: Intel MID platform setup code + * + * (C) Copyright 2008, 2012 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#define pr_fmt(fmt) "intel_mid: " fmt + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/scatterlist.h> +#include <linux/sfi.h> +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/notifier.h> + +#include <asm/setup.h> +#include <asm/mpspec_def.h> +#include <asm/hw_irq.h> +#include <asm/apic.h> +#include <asm/io_apic.h> +#include <asm/intel-mid.h> +#include <asm/intel_mid_vrtc.h> +#include <asm/io.h> +#include <asm/i8259.h> +#include <asm/intel_scu_ipc.h> +#include <asm/apb_timer.h> +#include <asm/reboot.h> + +/* + * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, + * cmdline option x86_intel_mid_timer can be used to override the configuration + * to prefer one or the other. + * at runtime, there are basically three timer configurations: + * 1. per cpu apbt clock only + * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only + * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast. + * + * by default (without cmdline option), platform code first detects cpu type + * to see if we are on lincroft or penwell, then set up both lapic or apbt + * clocks accordingly. + * i.e. by default, medfield uses configuration #2, moorestown uses #1. + * config #3 is supported but not recommended on medfield. + * + * rating and feature summary: + * lapic (with C3STOP) --------- 100 + * apbt (always-on) ------------ 110 + * lapic (always-on,ARAT) ------ 150 + */ + +enum intel_mid_timer_options intel_mid_timer_options; + +enum intel_mid_cpu_type __intel_mid_cpu_chip; +EXPORT_SYMBOL_GPL(__intel_mid_cpu_chip); + +static void intel_mid_power_off(void) +{ +} + +static void intel_mid_reboot(void) +{ + intel_scu_ipc_simple_command(IPCMSG_COLD_BOOT, 0); +} + +static unsigned long __init intel_mid_calibrate_tsc(void) +{ + unsigned long fast_calibrate; + u32 lo, hi, ratio, fsb; + + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi); + ratio = (hi >> 8) & 0x1f; + pr_debug("ratio is %d\n", ratio); + if (!ratio) { + pr_err("read a zero ratio, should be incorrect!\n"); + pr_err("force tsc ratio to 16 ...\n"); + ratio = 16; + } + rdmsr(MSR_FSB_FREQ, lo, hi); + if ((lo & 0x7) == 0x7) + fsb = PENWELL_FSB_FREQ_83SKU; + else + fsb = PENWELL_FSB_FREQ_100SKU; + fast_calibrate = ratio * fsb; + pr_debug("read penwell tsc %lu khz\n", fast_calibrate); + lapic_timer_frequency = fsb * 1000 / HZ; + /* mark tsc clocksource as reliable */ + set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); + + if (fast_calibrate) + return fast_calibrate; + + return 0; +} + +static void __init intel_mid_time_init(void) +{ + sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); + switch (intel_mid_timer_options) { + case INTEL_MID_TIMER_APBT_ONLY: + break; + case INTEL_MID_TIMER_LAPIC_APBT: + x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; + x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; + break; + default: + if (!boot_cpu_has(X86_FEATURE_ARAT)) + break; + x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; + x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; + return; + } + /* we need at least one APB timer */ + pre_init_apic_IRQ0(); + apbt_time_init(); +} + +static void intel_mid_arch_setup(void) +{ + if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) + __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL; + else { + pr_err("Unknown Intel MID CPU (%d:%d), default to Penwell\n", + boot_cpu_data.x86, boot_cpu_data.x86_model); + __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL; + } +} + +/* MID systems don't have i8042 controller */ +static int intel_mid_i8042_detect(void) +{ + return 0; +} + +/* + * Moorestown does not have external NMI source nor port 0x61 to report + * NMI status. The possible NMI sources are from pmu as a result of NMI + * watchdog or lock debug. Reading io port 0x61 results in 0xff which + * misled NMI handler. + */ +static unsigned char intel_mid_get_nmi_reason(void) +{ + return 0; +} + +/* + * Moorestown specific x86_init function overrides and early setup + * calls. + */ +void __init x86_intel_mid_early_setup(void) +{ + x86_init.resources.probe_roms = x86_init_noop; + x86_init.resources.reserve_resources = x86_init_noop; + + x86_init.timers.timer_init = intel_mid_time_init; + x86_init.timers.setup_percpu_clockev = x86_init_noop; + + x86_init.irqs.pre_vector_init = x86_init_noop; + + x86_init.oem.arch_setup = intel_mid_arch_setup; + + x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; + + x86_platform.calibrate_tsc = intel_mid_calibrate_tsc; + x86_platform.i8042_detect = intel_mid_i8042_detect; + x86_init.timers.wallclock_init = intel_mid_rtc_init; + x86_platform.get_nmi_reason = intel_mid_get_nmi_reason; + + x86_init.pci.init = intel_mid_pci_init; + x86_init.pci.fixup_irqs = x86_init_noop; + + legacy_pic = &null_legacy_pic; + + pm_power_off = intel_mid_power_off; + machine_ops.emergency_restart = intel_mid_reboot; + + /* Avoid searching for BIOS MP tables */ + x86_init.mpparse.find_smp_config = x86_init_noop; + x86_init.mpparse.get_smp_config = x86_init_uint_noop; + set_bit(MP_BUS_ISA, mp_bus_not_pci); +} + +/* + * if user does not want to use per CPU apb timer, just give it a lower rating + * than local apic timer and skip the late per cpu timer init. + */ +static inline int __init setup_x86_intel_mid_timer(char *arg) +{ + if (!arg) + return -EINVAL; + + if (strcmp("apbt_only", arg) == 0) + intel_mid_timer_options = INTEL_MID_TIMER_APBT_ONLY; + else if (strcmp("lapic_and_apbt", arg) == 0) + intel_mid_timer_options = INTEL_MID_TIMER_LAPIC_APBT; + else { + pr_warn("X86 INTEL_MID timer option %s not recognised" + " use x86_intel_mid_timer=apbt_only or lapic_and_apbt\n", + arg); + return -EINVAL; + } + return 0; +} +__setup("x86_intel_mid_timer=", setup_x86_intel_mid_timer); + diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/intel-mid/intel_mid_vrtc.c index 5e355b134ba..4762cff7fac 100644 --- a/arch/x86/platform/mrst/vrtc.c +++ b/arch/x86/platform/intel-mid/intel_mid_vrtc.c @@ -1,5 +1,5 @@ /* - * vrtc.c: Driver for virtual RTC device on Intel MID platform + * intel_mid_vrtc.c: Driver for virtual RTC device on Intel MID platform * * (C) Copyright 2009 Intel Corporation * @@ -23,8 +23,8 @@ #include <linux/sfi.h> #include <linux/platform_device.h> -#include <asm/mrst.h> -#include <asm/mrst-vrtc.h> +#include <asm/intel-mid.h> +#include <asm/intel_mid_vrtc.h> #include <asm/time.h> #include <asm/fixmap.h> @@ -79,7 +79,7 @@ void vrtc_get_time(struct timespec *now) /* vRTC YEAR reg contains the offset to 1972 */ year += 1972; - printk(KERN_INFO "vRTC: sec: %d min: %d hour: %d day: %d " + pr_info("vRTC: sec: %d min: %d hour: %d day: %d " "mon: %d year: %d\n", sec, min, hour, mday, mon, year); now->tv_sec = mktime(year, mon, mday, hour, min, sec); @@ -109,15 +109,14 @@ int vrtc_set_mmss(const struct timespec *now) vrtc_cmos_write(tm.tm_sec, RTC_SECONDS); spin_unlock_irqrestore(&rtc_lock, flags); } else { - printk(KERN_ERR - "%s: Invalid vRTC value: write of %lx to vRTC failed\n", + pr_err("%s: Invalid vRTC value: write of %lx to vRTC failed\n", __FUNCTION__, now->tv_sec); retval = -EINVAL; } return retval; } -void __init mrst_rtc_init(void) +void __init intel_mid_rtc_init(void) { unsigned long vrtc_paddr; @@ -155,10 +154,10 @@ static struct platform_device vrtc_device = { }; /* Register the RTC device if appropriate */ -static int __init mrst_device_create(void) +static int __init intel_mid_device_create(void) { /* No Moorestown, no device */ - if (!mrst_identify_cpu()) + if (!intel_mid_identify_cpu()) return -ENODEV; /* No timer, no device */ if (!sfi_mrtc_num) @@ -175,4 +174,4 @@ static int __init mrst_device_create(void) return platform_device_register(&vrtc_device); } -module_init(mrst_device_create); +module_init(intel_mid_device_create); diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c new file mode 100644 index 00000000000..c84c1ca396b --- /dev/null +++ b/arch/x86/platform/intel-mid/sfi.c @@ -0,0 +1,488 @@ +/* + * intel_mid_sfi.c: Intel MID SFI initialization code + * + * (C) Copyright 2013 Intel Corporation + * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/scatterlist.h> +#include <linux/sfi.h> +#include <linux/intel_pmic_gpio.h> +#include <linux/spi/spi.h> +#include <linux/i2c.h> +#include <linux/skbuff.h> +#include <linux/gpio.h> +#include <linux/gpio_keys.h> +#include <linux/input.h> +#include <linux/platform_device.h> +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/notifier.h> +#include <linux/mmc/core.h> +#include <linux/mmc/card.h> +#include <linux/blkdev.h> + +#include <asm/setup.h> +#include <asm/mpspec_def.h> +#include <asm/hw_irq.h> +#include <asm/apic.h> +#include <asm/io_apic.h> +#include <asm/intel-mid.h> +#include <asm/intel_mid_vrtc.h> +#include <asm/io.h> +#include <asm/i8259.h> +#include <asm/intel_scu_ipc.h> +#include <asm/apb_timer.h> +#include <asm/reboot.h> + +#define SFI_SIG_OEM0 "OEM0" +#define MAX_IPCDEVS 24 +#define MAX_SCU_SPI 24 +#define MAX_SCU_I2C 24 + +static struct platform_device *ipc_devs[MAX_IPCDEVS]; +static struct spi_board_info *spi_devs[MAX_SCU_SPI]; +static struct i2c_board_info *i2c_devs[MAX_SCU_I2C]; +static struct sfi_gpio_table_entry *gpio_table; +static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; +static int ipc_next_dev; +static int spi_next_dev; +static int i2c_next_dev; +static int i2c_bus[MAX_SCU_I2C]; +static int gpio_num_entry; +static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; +int sfi_mrtc_num; +int sfi_mtimer_num; + +struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; +EXPORT_SYMBOL_GPL(sfi_mrtc_array); + +struct blocking_notifier_head intel_scu_notifier = + BLOCKING_NOTIFIER_INIT(intel_scu_notifier); +EXPORT_SYMBOL_GPL(intel_scu_notifier); + +#define intel_mid_sfi_get_pdata(dev, priv) \ + ((dev)->get_platform_data ? (dev)->get_platform_data(priv) : NULL) + +/* parse all the mtimer info to a static mtimer array */ +int __init sfi_parse_mtmr(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_timer_table_entry *pentry; + struct mpc_intsrc mp_irq; + int totallen; + + sb = (struct sfi_table_simple *)table; + if (!sfi_mtimer_num) { + sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb, + struct sfi_timer_table_entry); + pentry = (struct sfi_timer_table_entry *) sb->pentry; + totallen = sfi_mtimer_num * sizeof(*pentry); + memcpy(sfi_mtimer_array, pentry, totallen); + } + + pr_debug("SFI MTIMER info (num = %d):\n", sfi_mtimer_num); + pentry = sfi_mtimer_array; + for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) { + pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz, irq = %d\n", + totallen, (u32)pentry->phys_addr, + pentry->freq_hz, pentry->irq); + if (!pentry->irq) + continue; + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; +/* triggering mode edge bit 2-3, active high polarity bit 0-1 */ + mp_irq.irqflag = 5; + mp_irq.srcbus = MP_BUS_ISA; + mp_irq.srcbusirq = pentry->irq; /* IRQ */ + mp_irq.dstapic = MP_APIC_ALL; + mp_irq.dstirq = pentry->irq; + mp_save_irq(&mp_irq); + } + + return 0; +} + +struct sfi_timer_table_entry *sfi_get_mtmr(int hint) +{ + int i; + if (hint < sfi_mtimer_num) { + if (!sfi_mtimer_usage[hint]) { + pr_debug("hint taken for timer %d irq %d\n", + hint, sfi_mtimer_array[hint].irq); + sfi_mtimer_usage[hint] = 1; + return &sfi_mtimer_array[hint]; + } + } + /* take the first timer available */ + for (i = 0; i < sfi_mtimer_num;) { + if (!sfi_mtimer_usage[i]) { + sfi_mtimer_usage[i] = 1; + return &sfi_mtimer_array[i]; + } + i++; + } + return NULL; +} + +void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr) +{ + int i; + for (i = 0; i < sfi_mtimer_num;) { + if (mtmr->irq == sfi_mtimer_array[i].irq) { + sfi_mtimer_usage[i] = 0; + return; + } + i++; + } +} + +/* parse all the mrtc info to a global mrtc array */ +int __init sfi_parse_mrtc(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_rtc_table_entry *pentry; + struct mpc_intsrc mp_irq; + + int totallen; + + sb = (struct sfi_table_simple *)table; + if (!sfi_mrtc_num) { + sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb, + struct sfi_rtc_table_entry); + pentry = (struct sfi_rtc_table_entry *)sb->pentry; + totallen = sfi_mrtc_num * sizeof(*pentry); + memcpy(sfi_mrtc_array, pentry, totallen); + } + + pr_debug("SFI RTC info (num = %d):\n", sfi_mrtc_num); + pentry = sfi_mrtc_array; + for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { + pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n", + totallen, (u32)pentry->phys_addr, pentry->irq); + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = 0xf; /* level trigger and active low */ + mp_irq.srcbus = MP_BUS_ISA; + mp_irq.srcbusirq = pentry->irq; /* IRQ */ + mp_irq.dstapic = MP_APIC_ALL; + mp_irq.dstirq = pentry->irq; + mp_save_irq(&mp_irq); + } + return 0; +} + + +/* + * Parsing GPIO table first, since the DEVS table will need this table + * to map the pin name to the actual pin. + */ +static int __init sfi_parse_gpio(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_gpio_table_entry *pentry; + int num, i; + + if (gpio_table) + return 0; + sb = (struct sfi_table_simple *)table; + num = SFI_GET_NUM_ENTRIES(sb, struct sfi_gpio_table_entry); + pentry = (struct sfi_gpio_table_entry *)sb->pentry; + + gpio_table = kmalloc(num * sizeof(*pentry), GFP_KERNEL); + if (!gpio_table) + return -1; + memcpy(gpio_table, pentry, num * sizeof(*pentry)); + gpio_num_entry = num; + + pr_debug("GPIO pin info:\n"); + for (i = 0; i < num; i++, pentry++) + pr_debug("info[%2d]: controller = %16.16s, pin_name = %16.16s," + " pin = %d\n", i, + pentry->controller_name, + pentry->pin_name, + pentry->pin_no); + return 0; +} + +int get_gpio_by_name(const char *name) +{ + struct sfi_gpio_table_entry *pentry = gpio_table; + int i; + + if (!pentry) + return -1; + for (i = 0; i < gpio_num_entry; i++, pentry++) { + if (!strncmp(name, pentry->pin_name, SFI_NAME_LEN)) + return pentry->pin_no; + } + return -1; +} + +void __init intel_scu_device_register(struct platform_device *pdev) +{ + if (ipc_next_dev == MAX_IPCDEVS) + pr_err("too many SCU IPC devices"); + else + ipc_devs[ipc_next_dev++] = pdev; +} + +static void __init intel_scu_spi_device_register(struct spi_board_info *sdev) +{ + struct spi_board_info *new_dev; + + if (spi_next_dev == MAX_SCU_SPI) { + pr_err("too many SCU SPI devices"); + return; + } + + new_dev = kzalloc(sizeof(*sdev), GFP_KERNEL); + if (!new_dev) { + pr_err("failed to alloc mem for delayed spi dev %s\n", + sdev->modalias); + return; + } + memcpy(new_dev, sdev, sizeof(*sdev)); + + spi_devs[spi_next_dev++] = new_dev; +} + +static void __init intel_scu_i2c_device_register(int bus, + struct i2c_board_info *idev) +{ + struct i2c_board_info *new_dev; + + if (i2c_next_dev == MAX_SCU_I2C) { + pr_err("too many SCU I2C devices"); + return; + } + + new_dev = kzalloc(sizeof(*idev), GFP_KERNEL); + if (!new_dev) { + pr_err("failed to alloc mem for delayed i2c dev %s\n", + idev->type); + return; + } + memcpy(new_dev, idev, sizeof(*idev)); + + i2c_bus[i2c_next_dev] = bus; + i2c_devs[i2c_next_dev++] = new_dev; +} + +/* Called by IPC driver */ +void intel_scu_devices_create(void) +{ + int i; + + for (i = 0; i < ipc_next_dev; i++) + platform_device_add(ipc_devs[i]); + + for (i = 0; i < spi_next_dev; i++) + spi_register_board_info(spi_devs[i], 1); + + for (i = 0; i < i2c_next_dev; i++) { + struct i2c_adapter *adapter; + struct i2c_client *client; + + adapter = i2c_get_adapter(i2c_bus[i]); + if (adapter) { + client = i2c_new_device(adapter, i2c_devs[i]); + if (!client) + pr_err("can't create i2c device %s\n", + i2c_devs[i]->type); + } else + i2c_register_board_info(i2c_bus[i], i2c_devs[i], 1); + } + intel_scu_notifier_post(SCU_AVAILABLE, NULL); +} +EXPORT_SYMBOL_GPL(intel_scu_devices_create); + +/* Called by IPC driver */ +void intel_scu_devices_destroy(void) +{ + int i; + + intel_scu_notifier_post(SCU_DOWN, NULL); + + for (i = 0; i < ipc_next_dev; i++) + platform_device_del(ipc_devs[i]); +} +EXPORT_SYMBOL_GPL(intel_scu_devices_destroy); + +static void __init install_irq_resource(struct platform_device *pdev, int irq) +{ + /* Single threaded */ + static struct resource res __initdata = { + .name = "IRQ", + .flags = IORESOURCE_IRQ, + }; + res.start = irq; + platform_device_add_resources(pdev, &res, 1); +} + +static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *pentry, + struct devs_id *dev) +{ + struct platform_device *pdev; + void *pdata = NULL; + + pr_debug("IPC bus, name = %16.16s, irq = 0x%2x\n", + pentry->name, pentry->irq); + pdata = intel_mid_sfi_get_pdata(dev, pentry); + + pdev = platform_device_alloc(pentry->name, 0); + if (pdev == NULL) { + pr_err("out of memory for SFI platform device '%s'.\n", + pentry->name); + return; + } + install_irq_resource(pdev, pentry->irq); + + pdev->dev.platform_data = pdata; + platform_device_add(pdev); +} + +static void __init sfi_handle_spi_dev(struct sfi_device_table_entry *pentry, + struct devs_id *dev) +{ + struct spi_board_info spi_info; + void *pdata = NULL; + + memset(&spi_info, 0, sizeof(spi_info)); + strncpy(spi_info.modalias, pentry->name, SFI_NAME_LEN); + spi_info.irq = ((pentry->irq == (u8)0xff) ? 0 : pentry->irq); + spi_info.bus_num = pentry->host_num; + spi_info.chip_select = pentry->addr; + spi_info.max_speed_hz = pentry->max_freq; + pr_debug("SPI bus=%d, name=%16.16s, irq=0x%2x, max_freq=%d, cs=%d\n", + spi_info.bus_num, + spi_info.modalias, + spi_info.irq, + spi_info.max_speed_hz, + spi_info.chip_select); + + pdata = intel_mid_sfi_get_pdata(dev, &spi_info); + + spi_info.platform_data = pdata; + if (dev->delay) + intel_scu_spi_device_register(&spi_info); + else + spi_register_board_info(&spi_info, 1); +} + +static void __init sfi_handle_i2c_dev(struct sfi_device_table_entry *pentry, + struct devs_id *dev) +{ + struct i2c_board_info i2c_info; + void *pdata = NULL; + + memset(&i2c_info, 0, sizeof(i2c_info)); + strncpy(i2c_info.type, pentry->name, SFI_NAME_LEN); + i2c_info.irq = ((pentry->irq == (u8)0xff) ? 0 : pentry->irq); + i2c_info.addr = pentry->addr; + pr_debug("I2C bus = %d, name = %16.16s, irq = 0x%2x, addr = 0x%x\n", + pentry->host_num, + i2c_info.type, + i2c_info.irq, + i2c_info.addr); + pdata = intel_mid_sfi_get_pdata(dev, &i2c_info); + i2c_info.platform_data = pdata; + + if (dev->delay) + intel_scu_i2c_device_register(pentry->host_num, &i2c_info); + else + i2c_register_board_info(pentry->host_num, &i2c_info, 1); +} + +extern struct devs_id *const __x86_intel_mid_dev_start[], + *const __x86_intel_mid_dev_end[]; + +static struct devs_id __init *get_device_id(u8 type, char *name) +{ + struct devs_id *const *dev_table; + + for (dev_table = __x86_intel_mid_dev_start; + dev_table < __x86_intel_mid_dev_end; dev_table++) { + struct devs_id *dev = *dev_table; + if (dev->type == type && + !strncmp(dev->name, name, SFI_NAME_LEN)) { + return dev; + } + } + + return NULL; +} + +static int __init sfi_parse_devs(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_device_table_entry *pentry; + struct devs_id *dev = NULL; + int num, i; + int ioapic; + struct io_apic_irq_attr irq_attr; + + sb = (struct sfi_table_simple *)table; + num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry); + pentry = (struct sfi_device_table_entry *)sb->pentry; + + for (i = 0; i < num; i++, pentry++) { + int irq = pentry->irq; + + if (irq != (u8)0xff) { /* native RTE case */ + /* these SPI2 devices are not exposed to system as PCI + * devices, but they have separate RTE entry in IOAPIC + * so we have to enable them one by one here + */ + ioapic = mp_find_ioapic(irq); + irq_attr.ioapic = ioapic; + irq_attr.ioapic_pin = irq; + irq_attr.trigger = 1; + irq_attr.polarity = 1; + io_apic_set_pci_routing(NULL, irq, &irq_attr); + } else + irq = 0; /* No irq */ + + dev = get_device_id(pentry->type, pentry->name); + + if (!dev) + continue; + + if (dev->device_handler) { + dev->device_handler(pentry, dev); + } else { + switch (pentry->type) { + case SFI_DEV_TYPE_IPC: + sfi_handle_ipc_dev(pentry, dev); + break; + case SFI_DEV_TYPE_SPI: + sfi_handle_spi_dev(pentry, dev); + break; + case SFI_DEV_TYPE_I2C: + sfi_handle_i2c_dev(pentry, dev); + break; + case SFI_DEV_TYPE_UART: + case SFI_DEV_TYPE_HSI: + default: + break; + } + } + } + return 0; +} + +static int __init intel_mid_platform_init(void) +{ + sfi_table_parse(SFI_SIG_GPIO, NULL, NULL, sfi_parse_gpio); + sfi_table_parse(SFI_SIG_DEVS, NULL, NULL, sfi_parse_devs); + return 0; +} +arch_initcall(intel_mid_platform_init); diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile deleted file mode 100644 index af1da7e623f..00000000000 --- a/arch/x86/platform/mrst/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -obj-$(CONFIG_X86_INTEL_MID) += mrst.o -obj-$(CONFIG_X86_INTEL_MID) += vrtc.o -obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_mrst.o diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c deleted file mode 100644 index 47fe66fe61f..00000000000 --- a/arch/x86/platform/mrst/mrst.c +++ /dev/null @@ -1,1052 +0,0 @@ -/* - * mrst.c: Intel Moorestown platform specific setup code - * - * (C) Copyright 2008 Intel Corporation - * Author: Jacob Pan (jacob.jun.pan@intel.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -#define pr_fmt(fmt) "mrst: " fmt - -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/interrupt.h> -#include <linux/scatterlist.h> -#include <linux/sfi.h> -#include <linux/intel_pmic_gpio.h> -#include <linux/spi/spi.h> -#include <linux/i2c.h> -#include <linux/i2c/pca953x.h> -#include <linux/gpio_keys.h> -#include <linux/input.h> -#include <linux/platform_device.h> -#include <linux/irq.h> -#include <linux/module.h> -#include <linux/notifier.h> -#include <linux/mfd/intel_msic.h> -#include <linux/gpio.h> -#include <linux/i2c/tc35876x.h> - -#include <asm/setup.h> -#include <asm/mpspec_def.h> -#include <asm/hw_irq.h> -#include <asm/apic.h> -#include <asm/io_apic.h> -#include <asm/mrst.h> -#include <asm/mrst-vrtc.h> -#include <asm/io.h> -#include <asm/i8259.h> -#include <asm/intel_scu_ipc.h> -#include <asm/apb_timer.h> -#include <asm/reboot.h> - -/* - * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, - * cmdline option x86_mrst_timer can be used to override the configuration - * to prefer one or the other. - * at runtime, there are basically three timer configurations: - * 1. per cpu apbt clock only - * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only - * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast. - * - * by default (without cmdline option), platform code first detects cpu type - * to see if we are on lincroft or penwell, then set up both lapic or apbt - * clocks accordingly. - * i.e. by default, medfield uses configuration #2, moorestown uses #1. - * config #3 is supported but not recommended on medfield. - * - * rating and feature summary: - * lapic (with C3STOP) --------- 100 - * apbt (always-on) ------------ 110 - * lapic (always-on,ARAT) ------ 150 - */ - -enum mrst_timer_options mrst_timer_options; - -static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; -static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; -enum mrst_cpu_type __mrst_cpu_chip; -EXPORT_SYMBOL_GPL(__mrst_cpu_chip); - -int sfi_mtimer_num; - -struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; -EXPORT_SYMBOL_GPL(sfi_mrtc_array); -int sfi_mrtc_num; - -static void mrst_power_off(void) -{ -} - -static void mrst_reboot(void) -{ - intel_scu_ipc_simple_command(IPCMSG_COLD_BOOT, 0); -} - -/* parse all the mtimer info to a static mtimer array */ -static int __init sfi_parse_mtmr(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - struct sfi_timer_table_entry *pentry; - struct mpc_intsrc mp_irq; - int totallen; - - sb = (struct sfi_table_simple *)table; - if (!sfi_mtimer_num) { - sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb, - struct sfi_timer_table_entry); - pentry = (struct sfi_timer_table_entry *) sb->pentry; - totallen = sfi_mtimer_num * sizeof(*pentry); - memcpy(sfi_mtimer_array, pentry, totallen); - } - - pr_debug("SFI MTIMER info (num = %d):\n", sfi_mtimer_num); - pentry = sfi_mtimer_array; - for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) { - pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz," - " irq = %d\n", totallen, (u32)pentry->phys_addr, - pentry->freq_hz, pentry->irq); - if (!pentry->irq) - continue; - mp_irq.type = MP_INTSRC; - mp_irq.irqtype = mp_INT; -/* triggering mode edge bit 2-3, active high polarity bit 0-1 */ - mp_irq.irqflag = 5; - mp_irq.srcbus = MP_BUS_ISA; - mp_irq.srcbusirq = pentry->irq; /* IRQ */ - mp_irq.dstapic = MP_APIC_ALL; - mp_irq.dstirq = pentry->irq; - mp_save_irq(&mp_irq); - } - - return 0; -} - -struct sfi_timer_table_entry *sfi_get_mtmr(int hint) -{ - int i; - if (hint < sfi_mtimer_num) { - if (!sfi_mtimer_usage[hint]) { - pr_debug("hint taken for timer %d irq %d\n",\ - hint, sfi_mtimer_array[hint].irq); - sfi_mtimer_usage[hint] = 1; - return &sfi_mtimer_array[hint]; - } - } - /* take the first timer available */ - for (i = 0; i < sfi_mtimer_num;) { - if (!sfi_mtimer_usage[i]) { - sfi_mtimer_usage[i] = 1; - return &sfi_mtimer_array[i]; - } - i++; - } - return NULL; -} - -void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr) -{ - int i; - for (i = 0; i < sfi_mtimer_num;) { - if (mtmr->irq == sfi_mtimer_array[i].irq) { - sfi_mtimer_usage[i] = 0; - return; - } - i++; - } -} - -/* parse all the mrtc info to a global mrtc array */ -int __init sfi_parse_mrtc(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - struct sfi_rtc_table_entry *pentry; - struct mpc_intsrc mp_irq; - - int totallen; - - sb = (struct sfi_table_simple *)table; - if (!sfi_mrtc_num) { - sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb, - struct sfi_rtc_table_entry); - pentry = (struct sfi_rtc_table_entry *)sb->pentry; - totallen = sfi_mrtc_num * sizeof(*pentry); - memcpy(sfi_mrtc_array, pentry, totallen); - } - - pr_debug("SFI RTC info (num = %d):\n", sfi_mrtc_num); - pentry = sfi_mrtc_array; - for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { - pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n", - totallen, (u32)pentry->phys_addr, pentry->irq); - mp_irq.type = MP_INTSRC; - mp_irq.irqtype = mp_INT; - mp_irq.irqflag = 0xf; /* level trigger and active low */ - mp_irq.srcbus = MP_BUS_ISA; - mp_irq.srcbusirq = pentry->irq; /* IRQ */ - mp_irq.dstapic = MP_APIC_ALL; - mp_irq.dstirq = pentry->irq; - mp_save_irq(&mp_irq); - } - return 0; -} - -static unsigned long __init mrst_calibrate_tsc(void) -{ - unsigned long fast_calibrate; - u32 lo, hi, ratio, fsb; - - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi); - ratio = (hi >> 8) & 0x1f; - pr_debug("ratio is %d\n", ratio); - if (!ratio) { - pr_err("read a zero ratio, should be incorrect!\n"); - pr_err("force tsc ratio to 16 ...\n"); - ratio = 16; - } - rdmsr(MSR_FSB_FREQ, lo, hi); - if ((lo & 0x7) == 0x7) - fsb = PENWELL_FSB_FREQ_83SKU; - else - fsb = PENWELL_FSB_FREQ_100SKU; - fast_calibrate = ratio * fsb; - pr_debug("read penwell tsc %lu khz\n", fast_calibrate); - lapic_timer_frequency = fsb * 1000 / HZ; - /* mark tsc clocksource as reliable */ - set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); - - if (fast_calibrate) - return fast_calibrate; - - return 0; -} - -static void __init mrst_time_init(void) -{ - sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); - switch (mrst_timer_options) { - case MRST_TIMER_APBT_ONLY: - break; - case MRST_TIMER_LAPIC_APBT: - x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; - x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; - break; - default: - if (!boot_cpu_has(X86_FEATURE_ARAT)) - break; - x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; - x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; - return; - } - /* we need at least one APB timer */ - pre_init_apic_IRQ0(); - apbt_time_init(); -} - -static void mrst_arch_setup(void) -{ - if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) - __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; - else { - pr_err("Unknown Intel MID CPU (%d:%d), default to Penwell\n", - boot_cpu_data.x86, boot_cpu_data.x86_model); - __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; - } -} - -/* MID systems don't have i8042 controller */ -static int mrst_i8042_detect(void) -{ - return 0; -} - -/* - * Moorestown does not have external NMI source nor port 0x61 to report - * NMI status. The possible NMI sources are from pmu as a result of NMI - * watchdog or lock debug. Reading io port 0x61 results in 0xff which - * misled NMI handler. - */ -static unsigned char mrst_get_nmi_reason(void) -{ - return 0; -} - -/* - * Moorestown specific x86_init function overrides and early setup - * calls. - */ -void __init x86_mrst_early_setup(void) -{ - x86_init.resources.probe_roms = x86_init_noop; - x86_init.resources.reserve_resources = x86_init_noop; - - x86_init.timers.timer_init = mrst_time_init; - x86_init.timers.setup_percpu_clockev = x86_init_noop; - - x86_init.irqs.pre_vector_init = x86_init_noop; - - x86_init.oem.arch_setup = mrst_arch_setup; - - x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; - - x86_platform.calibrate_tsc = mrst_calibrate_tsc; - x86_platform.i8042_detect = mrst_i8042_detect; - x86_init.timers.wallclock_init = mrst_rtc_init; - x86_platform.get_nmi_reason = mrst_get_nmi_reason; - - x86_init.pci.init = pci_mrst_init; - x86_init.pci.fixup_irqs = x86_init_noop; - - legacy_pic = &null_legacy_pic; - - /* Moorestown specific power_off/restart method */ - pm_power_off = mrst_power_off; - machine_ops.emergency_restart = mrst_reboot; - - /* Avoid searching for BIOS MP tables */ - x86_init.mpparse.find_smp_config = x86_init_noop; - x86_init.mpparse.get_smp_config = x86_init_uint_noop; - set_bit(MP_BUS_ISA, mp_bus_not_pci); -} - -/* - * if user does not want to use per CPU apb timer, just give it a lower rating - * than local apic timer and skip the late per cpu timer init. - */ -static inline int __init setup_x86_mrst_timer(char *arg) -{ - if (!arg) - return -EINVAL; - - if (strcmp("apbt_only", arg) == 0) - mrst_timer_options = MRST_TIMER_APBT_ONLY; - else if (strcmp("lapic_and_apbt", arg) == 0) - mrst_timer_options = MRST_TIMER_LAPIC_APBT; - else { - pr_warning("X86 MRST timer option %s not recognised" - " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", - arg); - return -EINVAL; - } - return 0; -} -__setup("x86_mrst_timer=", setup_x86_mrst_timer); - -/* - * Parsing GPIO table first, since the DEVS table will need this table - * to map the pin name to the actual pin. - */ -static struct sfi_gpio_table_entry *gpio_table; -static int gpio_num_entry; - -static int __init sfi_parse_gpio(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - struct sfi_gpio_table_entry *pentry; - int num, i; - - if (gpio_table) - return 0; - sb = (struct sfi_table_simple *)table; - num = SFI_GET_NUM_ENTRIES(sb, struct sfi_gpio_table_entry); - pentry = (struct sfi_gpio_table_entry *)sb->pentry; - - gpio_table = kmalloc(num * sizeof(*pentry), GFP_KERNEL); - if (!gpio_table) - return -1; - memcpy(gpio_table, pentry, num * sizeof(*pentry)); - gpio_num_entry = num; - - pr_debug("GPIO pin info:\n"); - for (i = 0; i < num; i++, pentry++) - pr_debug("info[%2d]: controller = %16.16s, pin_name = %16.16s," - " pin = %d\n", i, - pentry->controller_name, - pentry->pin_name, - pentry->pin_no); - return 0; -} - -static int get_gpio_by_name(const char *name) -{ - struct sfi_gpio_table_entry *pentry = gpio_table; - int i; - - if (!pentry) - return -1; - for (i = 0; i < gpio_num_entry; i++, pentry++) { - if (!strncmp(name, pentry->pin_name, SFI_NAME_LEN)) - return pentry->pin_no; - } - return -1; -} - -/* - * Here defines the array of devices platform data that IAFW would export - * through SFI "DEVS" table, we use name and type to match the device and - * its platform data. - */ -struct devs_id { - char name[SFI_NAME_LEN + 1]; - u8 type; - u8 delay; - void *(*get_platform_data)(void *info); -}; - -/* the offset for the mapping of global gpio pin to irq */ -#define MRST_IRQ_OFFSET 0x100 - -static void __init *pmic_gpio_platform_data(void *info) -{ - static struct intel_pmic_gpio_platform_data pmic_gpio_pdata; - int gpio_base = get_gpio_by_name("pmic_gpio_base"); - - if (gpio_base == -1) - gpio_base = 64; - pmic_gpio_pdata.gpio_base = gpio_base; - pmic_gpio_pdata.irq_base = gpio_base + MRST_IRQ_OFFSET; - pmic_gpio_pdata.gpiointr = 0xffffeff8; - - return &pmic_gpio_pdata; -} - -static void __init *max3111_platform_data(void *info) -{ - struct spi_board_info *spi_info = info; - int intr = get_gpio_by_name("max3111_int"); - - spi_info->mode = SPI_MODE_0; - if (intr == -1) - return NULL; - spi_info->irq = intr + MRST_IRQ_OFFSET; - return NULL; -} - -/* we have multiple max7315 on the board ... */ -#define MAX7315_NUM 2 -static void __init *max7315_platform_data(void *info) -{ - static struct pca953x_platform_data max7315_pdata[MAX7315_NUM]; - static int nr; - struct pca953x_platform_data *max7315 = &max7315_pdata[nr]; - struct i2c_board_info *i2c_info = info; - int gpio_base, intr; - char base_pin_name[SFI_NAME_LEN + 1]; - char intr_pin_name[SFI_NAME_LEN + 1]; - - if (nr == MAX7315_NUM) { - pr_err("too many max7315s, we only support %d\n", - MAX7315_NUM); - return NULL; - } - /* we have several max7315 on the board, we only need load several - * instances of the same pca953x driver to cover them - */ - strcpy(i2c_info->type, "max7315"); - if (nr++) { - sprintf(base_pin_name, "max7315_%d_base", nr); - sprintf(intr_pin_name, "max7315_%d_int", nr); - } else { - strcpy(base_pin_name, "max7315_base"); - strcpy(intr_pin_name, "max7315_int"); - } - - gpio_base = get_gpio_by_name(base_pin_name); - intr = get_gpio_by_name(intr_pin_name); - - if (gpio_base == -1) - return NULL; - max7315->gpio_base = gpio_base; - if (intr != -1) { - i2c_info->irq = intr + MRST_IRQ_OFFSET; - max7315->irq_base = gpio_base + MRST_IRQ_OFFSET; - } else { - i2c_info->irq = -1; - max7315->irq_base = -1; - } - return max7315; -} - -static void *tca6416_platform_data(void *info) -{ - static struct pca953x_platform_data tca6416; - struct i2c_board_info *i2c_info = info; - int gpio_base, intr; - char base_pin_name[SFI_NAME_LEN + 1]; - char intr_pin_name[SFI_NAME_LEN + 1]; - - strcpy(i2c_info->type, "tca6416"); - strcpy(base_pin_name, "tca6416_base"); - strcpy(intr_pin_name, "tca6416_int"); - - gpio_base = get_gpio_by_name(base_pin_name); - intr = get_gpio_by_name(intr_pin_name); - - if (gpio_base == -1) - return NULL; - tca6416.gpio_base = gpio_base; - if (intr != -1) { - i2c_info->irq = intr + MRST_IRQ_OFFSET; - tca6416.irq_base = gpio_base + MRST_IRQ_OFFSET; - } else { - i2c_info->irq = -1; - tca6416.irq_base = -1; - } - return &tca6416; -} - -static void *mpu3050_platform_data(void *info) -{ - struct i2c_board_info *i2c_info = info; - int intr = get_gpio_by_name("mpu3050_int"); - - if (intr == -1) - return NULL; - - i2c_info->irq = intr + MRST_IRQ_OFFSET; - return NULL; -} - -static void __init *emc1403_platform_data(void *info) -{ - static short intr2nd_pdata; - struct i2c_board_info *i2c_info = info; - int intr = get_gpio_by_name("thermal_int"); - int intr2nd = get_gpio_by_name("thermal_alert"); - - if (intr == -1 || intr2nd == -1) - return NULL; - - i2c_info->irq = intr + MRST_IRQ_OFFSET; - intr2nd_pdata = intr2nd + MRST_IRQ_OFFSET; - - return &intr2nd_pdata; -} - -static void __init *lis331dl_platform_data(void *info) -{ - static short intr2nd_pdata; - struct i2c_board_info *i2c_info = info; - int intr = get_gpio_by_name("accel_int"); - int intr2nd = get_gpio_by_name("accel_2"); - - if (intr == -1 || intr2nd == -1) - return NULL; - - i2c_info->irq = intr + MRST_IRQ_OFFSET; - intr2nd_pdata = intr2nd + MRST_IRQ_OFFSET; - - return &intr2nd_pdata; -} - -static void __init *no_platform_data(void *info) -{ - return NULL; -} - -static struct resource msic_resources[] = { - { - .start = INTEL_MSIC_IRQ_PHYS_BASE, - .end = INTEL_MSIC_IRQ_PHYS_BASE + 64 - 1, - .flags = IORESOURCE_MEM, - }, -}; - -static struct intel_msic_platform_data msic_pdata; - -static struct platform_device msic_device = { - .name = "intel_msic", - .id = -1, - .dev = { - .platform_data = &msic_pdata, - }, - .num_resources = ARRAY_SIZE(msic_resources), - .resource = msic_resources, -}; - -static inline bool mrst_has_msic(void) -{ - return mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL; -} - -static int msic_scu_status_change(struct notifier_block *nb, - unsigned long code, void *data) -{ - if (code == SCU_DOWN) { - platform_device_unregister(&msic_device); - return 0; - } - - return platform_device_register(&msic_device); -} - -static int __init msic_init(void) -{ - static struct notifier_block msic_scu_notifier = { - .notifier_call = msic_scu_status_change, - }; - - /* - * We need to be sure that the SCU IPC is ready before MSIC device - * can be registered. - */ - if (mrst_has_msic()) - intel_scu_notifier_add(&msic_scu_notifier); - - return 0; -} -arch_initcall(msic_init); - -/* - * msic_generic_platform_data - sets generic platform data for the block - * @info: pointer to the SFI device table entry for this block - * @block: MSIC block - * - * Function sets IRQ number from the SFI table entry for given device to - * the MSIC platform data. - */ -static void *msic_generic_platform_data(void *info, enum intel_msic_block block) -{ - struct sfi_device_table_entry *entry = info; - - BUG_ON(block < 0 || block >= INTEL_MSIC_BLOCK_LAST); - msic_pdata.irq[block] = entry->irq; - - return no_platform_data(info); -} - -static void *msic_battery_platform_data(void *info) -{ - return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_BATTERY); -} - -static void *msic_gpio_platform_data(void *info) -{ - static struct intel_msic_gpio_pdata pdata; - int gpio = get_gpio_by_name("msic_gpio_base"); - - if (gpio < 0) - return NULL; - - pdata.gpio_base = gpio; - msic_pdata.gpio = &pdata; - - return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_GPIO); -} - -static void *msic_audio_platform_data(void *info) -{ - struct platform_device *pdev; - - pdev = platform_device_register_simple("sst-platform", -1, NULL, 0); - if (IS_ERR(pdev)) { - pr_err("failed to create audio platform device\n"); - return NULL; - } - - return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_AUDIO); -} - -static void *msic_power_btn_platform_data(void *info) -{ - return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_POWER_BTN); -} - -static void *msic_ocd_platform_data(void *info) -{ - static struct intel_msic_ocd_pdata pdata; - int gpio = get_gpio_by_name("ocd_gpio"); - - if (gpio < 0) - return NULL; - - pdata.gpio = gpio; - msic_pdata.ocd = &pdata; - - return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_OCD); -} - -static void *msic_thermal_platform_data(void *info) -{ - return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_THERMAL); -} - -/* tc35876x DSI-LVDS bridge chip and panel platform data */ -static void *tc35876x_platform_data(void *data) -{ - static struct tc35876x_platform_data pdata; - - /* gpio pins set to -1 will not be used by the driver */ - pdata.gpio_bridge_reset = get_gpio_by_name("LCMB_RXEN"); - pdata.gpio_panel_bl_en = get_gpio_by_name("6S6P_BL_EN"); - pdata.gpio_panel_vadd = get_gpio_by_name("EN_VREG_LCD_V3P3"); - - return &pdata; -} - -static const struct devs_id __initconst device_ids[] = { - {"bma023", SFI_DEV_TYPE_I2C, 1, &no_platform_data}, - {"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data}, - {"pmic_gpio", SFI_DEV_TYPE_IPC, 1, &pmic_gpio_platform_data}, - {"spi_max3111", SFI_DEV_TYPE_SPI, 0, &max3111_platform_data}, - {"i2c_max7315", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data}, - {"i2c_max7315_2", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data}, - {"tca6416", SFI_DEV_TYPE_I2C, 1, &tca6416_platform_data}, - {"emc1403", SFI_DEV_TYPE_I2C, 1, &emc1403_platform_data}, - {"i2c_accel", SFI_DEV_TYPE_I2C, 0, &lis331dl_platform_data}, - {"pmic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data}, - {"mpu3050", SFI_DEV_TYPE_I2C, 1, &mpu3050_platform_data}, - {"i2c_disp_brig", SFI_DEV_TYPE_I2C, 0, &tc35876x_platform_data}, - - /* MSIC subdevices */ - {"msic_battery", SFI_DEV_TYPE_IPC, 1, &msic_battery_platform_data}, - {"msic_gpio", SFI_DEV_TYPE_IPC, 1, &msic_gpio_platform_data}, - {"msic_audio", SFI_DEV_TYPE_IPC, 1, &msic_audio_platform_data}, - {"msic_power_btn", SFI_DEV_TYPE_IPC, 1, &msic_power_btn_platform_data}, - {"msic_ocd", SFI_DEV_TYPE_IPC, 1, &msic_ocd_platform_data}, - {"msic_thermal", SFI_DEV_TYPE_IPC, 1, &msic_thermal_platform_data}, - - {}, -}; - -#define MAX_IPCDEVS 24 -static struct platform_device *ipc_devs[MAX_IPCDEVS]; -static int ipc_next_dev; - -#define MAX_SCU_SPI 24 -static struct spi_board_info *spi_devs[MAX_SCU_SPI]; -static int spi_next_dev; - -#define MAX_SCU_I2C 24 -static struct i2c_board_info *i2c_devs[MAX_SCU_I2C]; -static int i2c_bus[MAX_SCU_I2C]; -static int i2c_next_dev; - -static void __init intel_scu_device_register(struct platform_device *pdev) -{ - if(ipc_next_dev == MAX_IPCDEVS) - pr_err("too many SCU IPC devices"); - else - ipc_devs[ipc_next_dev++] = pdev; -} - -static void __init intel_scu_spi_device_register(struct spi_board_info *sdev) -{ - struct spi_board_info *new_dev; - - if (spi_next_dev == MAX_SCU_SPI) { - pr_err("too many SCU SPI devices"); - return; - } - - new_dev = kzalloc(sizeof(*sdev), GFP_KERNEL); - if (!new_dev) { - pr_err("failed to alloc mem for delayed spi dev %s\n", - sdev->modalias); - return; - } - memcpy(new_dev, sdev, sizeof(*sdev)); - - spi_devs[spi_next_dev++] = new_dev; -} - -static void __init intel_scu_i2c_device_register(int bus, - struct i2c_board_info *idev) -{ - struct i2c_board_info *new_dev; - - if (i2c_next_dev == MAX_SCU_I2C) { - pr_err("too many SCU I2C devices"); - return; - } - - new_dev = kzalloc(sizeof(*idev), GFP_KERNEL); - if (!new_dev) { - pr_err("failed to alloc mem for delayed i2c dev %s\n", - idev->type); - return; - } - memcpy(new_dev, idev, sizeof(*idev)); - - i2c_bus[i2c_next_dev] = bus; - i2c_devs[i2c_next_dev++] = new_dev; -} - -BLOCKING_NOTIFIER_HEAD(intel_scu_notifier); -EXPORT_SYMBOL_GPL(intel_scu_notifier); - -/* Called by IPC driver */ -void intel_scu_devices_create(void) -{ - int i; - - for (i = 0; i < ipc_next_dev; i++) - platform_device_add(ipc_devs[i]); - - for (i = 0; i < spi_next_dev; i++) - spi_register_board_info(spi_devs[i], 1); - - for (i = 0; i < i2c_next_dev; i++) { - struct i2c_adapter *adapter; - struct i2c_client *client; - - adapter = i2c_get_adapter(i2c_bus[i]); - if (adapter) { - client = i2c_new_device(adapter, i2c_devs[i]); - if (!client) - pr_err("can't create i2c device %s\n", - i2c_devs[i]->type); - } else - i2c_register_board_info(i2c_bus[i], i2c_devs[i], 1); - } - intel_scu_notifier_post(SCU_AVAILABLE, NULL); -} -EXPORT_SYMBOL_GPL(intel_scu_devices_create); - -/* Called by IPC driver */ -void intel_scu_devices_destroy(void) -{ - int i; - - intel_scu_notifier_post(SCU_DOWN, NULL); - - for (i = 0; i < ipc_next_dev; i++) - platform_device_del(ipc_devs[i]); -} -EXPORT_SYMBOL_GPL(intel_scu_devices_destroy); - -static void __init install_irq_resource(struct platform_device *pdev, int irq) -{ - /* Single threaded */ - static struct resource __initdata res = { - .name = "IRQ", - .flags = IORESOURCE_IRQ, - }; - res.start = irq; - platform_device_add_resources(pdev, &res, 1); -} - -static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *entry) -{ - const struct devs_id *dev = device_ids; - struct platform_device *pdev; - void *pdata = NULL; - - while (dev->name[0]) { - if (dev->type == SFI_DEV_TYPE_IPC && - !strncmp(dev->name, entry->name, SFI_NAME_LEN)) { - pdata = dev->get_platform_data(entry); - break; - } - dev++; - } - - /* - * On Medfield the platform device creation is handled by the MSIC - * MFD driver so we don't need to do it here. - */ - if (mrst_has_msic()) - return; - - pdev = platform_device_alloc(entry->name, 0); - if (pdev == NULL) { - pr_err("out of memory for SFI platform device '%s'.\n", - entry->name); - return; - } - install_irq_resource(pdev, entry->irq); - - pdev->dev.platform_data = pdata; - intel_scu_device_register(pdev); -} - -static void __init sfi_handle_spi_dev(struct spi_board_info *spi_info) -{ - const struct devs_id *dev = device_ids; - void *pdata = NULL; - - while (dev->name[0]) { - if (dev->type == SFI_DEV_TYPE_SPI && - !strncmp(dev->name, spi_info->modalias, SFI_NAME_LEN)) { - pdata = dev->get_platform_data(spi_info); - break; - } - dev++; - } - spi_info->platform_data = pdata; - if (dev->delay) - intel_scu_spi_device_register(spi_info); - else - spi_register_board_info(spi_info, 1); -} - -static void __init sfi_handle_i2c_dev(int bus, struct i2c_board_info *i2c_info) -{ - const struct devs_id *dev = device_ids; - void *pdata = NULL; - - while (dev->name[0]) { - if (dev->type == SFI_DEV_TYPE_I2C && - !strncmp(dev->name, i2c_info->type, SFI_NAME_LEN)) { - pdata = dev->get_platform_data(i2c_info); - break; - } - dev++; - } - i2c_info->platform_data = pdata; - - if (dev->delay) - intel_scu_i2c_device_register(bus, i2c_info); - else - i2c_register_board_info(bus, i2c_info, 1); - } - - -static int __init sfi_parse_devs(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - struct sfi_device_table_entry *pentry; - struct spi_board_info spi_info; - struct i2c_board_info i2c_info; - int num, i, bus; - int ioapic; - struct io_apic_irq_attr irq_attr; - - sb = (struct sfi_table_simple *)table; - num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry); - pentry = (struct sfi_device_table_entry *)sb->pentry; - - for (i = 0; i < num; i++, pentry++) { - int irq = pentry->irq; - - if (irq != (u8)0xff) { /* native RTE case */ - /* these SPI2 devices are not exposed to system as PCI - * devices, but they have separate RTE entry in IOAPIC - * so we have to enable them one by one here - */ - ioapic = mp_find_ioapic(irq); - irq_attr.ioapic = ioapic; - irq_attr.ioapic_pin = irq; - irq_attr.trigger = 1; - irq_attr.polarity = 1; - io_apic_set_pci_routing(NULL, irq, &irq_attr); - } else - irq = 0; /* No irq */ - - switch (pentry->type) { - case SFI_DEV_TYPE_IPC: - pr_debug("info[%2d]: IPC bus, name = %16.16s, " - "irq = 0x%2x\n", i, pentry->name, pentry->irq); - sfi_handle_ipc_dev(pentry); - break; - case SFI_DEV_TYPE_SPI: - memset(&spi_info, 0, sizeof(spi_info)); - strncpy(spi_info.modalias, pentry->name, SFI_NAME_LEN); - spi_info.irq = irq; - spi_info.bus_num = pentry->host_num; - spi_info.chip_select = pentry->addr; - spi_info.max_speed_hz = pentry->max_freq; - pr_debug("info[%2d]: SPI bus = %d, name = %16.16s, " - "irq = 0x%2x, max_freq = %d, cs = %d\n", i, - spi_info.bus_num, - spi_info.modalias, - spi_info.irq, - spi_info.max_speed_hz, - spi_info.chip_select); - sfi_handle_spi_dev(&spi_info); - break; - case SFI_DEV_TYPE_I2C: - memset(&i2c_info, 0, sizeof(i2c_info)); - bus = pentry->host_num; - strncpy(i2c_info.type, pentry->name, SFI_NAME_LEN); - i2c_info.irq = irq; - i2c_info.addr = pentry->addr; - pr_debug("info[%2d]: I2C bus = %d, name = %16.16s, " - "irq = 0x%2x, addr = 0x%x\n", i, bus, - i2c_info.type, - i2c_info.irq, - i2c_info.addr); - sfi_handle_i2c_dev(bus, &i2c_info); - break; - case SFI_DEV_TYPE_UART: - case SFI_DEV_TYPE_HSI: - default: - ; - } - } - return 0; -} - -static int __init mrst_platform_init(void) -{ - sfi_table_parse(SFI_SIG_GPIO, NULL, NULL, sfi_parse_gpio); - sfi_table_parse(SFI_SIG_DEVS, NULL, NULL, sfi_parse_devs); - return 0; -} -arch_initcall(mrst_platform_init); - -/* - * we will search these buttons in SFI GPIO table (by name) - * and register them dynamically. Please add all possible - * buttons here, we will shrink them if no GPIO found. - */ -static struct gpio_keys_button gpio_button[] = { - {KEY_POWER, -1, 1, "power_btn", EV_KEY, 0, 3000}, - {KEY_PROG1, -1, 1, "prog_btn1", EV_KEY, 0, 20}, - {KEY_PROG2, -1, 1, "prog_btn2", EV_KEY, 0, 20}, - {SW_LID, -1, 1, "lid_switch", EV_SW, 0, 20}, - {KEY_VOLUMEUP, -1, 1, "vol_up", EV_KEY, 0, 20}, - {KEY_VOLUMEDOWN, -1, 1, "vol_down", EV_KEY, 0, 20}, - {KEY_CAMERA, -1, 1, "camera_full", EV_KEY, 0, 20}, - {KEY_CAMERA_FOCUS, -1, 1, "camera_half", EV_KEY, 0, 20}, - {SW_KEYPAD_SLIDE, -1, 1, "MagSw1", EV_SW, 0, 20}, - {SW_KEYPAD_SLIDE, -1, 1, "MagSw2", EV_SW, 0, 20}, -}; - -static struct gpio_keys_platform_data mrst_gpio_keys = { - .buttons = gpio_button, - .rep = 1, - .nbuttons = -1, /* will fill it after search */ -}; - -static struct platform_device pb_device = { - .name = "gpio-keys", - .id = -1, - .dev = { - .platform_data = &mrst_gpio_keys, - }, -}; - -/* - * Shrink the non-existent buttons, register the gpio button - * device if there is some - */ -static int __init pb_keys_init(void) -{ - struct gpio_keys_button *gb = gpio_button; - int i, num, good = 0; - - num = sizeof(gpio_button) / sizeof(struct gpio_keys_button); - for (i = 0; i < num; i++) { - gb[i].gpio = get_gpio_by_name(gb[i].desc); - pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc, gb[i].gpio); - if (gb[i].gpio == -1) - continue; - - if (i != good) - gb[good] = gb[i]; - good++; - } - - if (good) { - mrst_gpio_keys.nbuttons = good; - return platform_device_register(&pb_device); - } - return 0; -} -late_initcall(pb_keys_init); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 1cf5b300305..424f4c97a44 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -25,10 +25,10 @@ #include <asm/cpu.h> #ifdef CONFIG_X86_32 -unsigned long saved_context_ebx; -unsigned long saved_context_esp, saved_context_ebp; -unsigned long saved_context_esi, saved_context_edi; -unsigned long saved_context_eflags; +__visible unsigned long saved_context_ebx; +__visible unsigned long saved_context_esp, saved_context_ebp; +__visible unsigned long saved_context_esi, saved_context_edi; +__visible unsigned long saved_context_eflags; #endif struct saved_context saved_context; diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index a0fde91c16c..304fca20d96 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -20,26 +20,26 @@ #include <asm/suspend.h> /* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; +extern __visible const void __nosave_begin, __nosave_end; /* Defined in hibernate_asm_64.S */ -extern int restore_image(void); +extern asmlinkage int restore_image(void); /* * Address to jump to in the last phase of restore in order to get to the image * kernel's text (this value is passed in the image header). */ -unsigned long restore_jump_address; +unsigned long restore_jump_address __visible; /* * Value of the cr3 register from before the hibernation (this value is passed * in the image header). */ -unsigned long restore_cr3; +unsigned long restore_cr3 __visible; -pgd_t *temp_level4_pgt; +pgd_t *temp_level4_pgt __visible; -void *relocated_restore_code; +void *relocated_restore_code __visible; static void *alloc_pgt_page(void *context) { diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index e6773dc8ac4..093a892026f 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -68,7 +68,7 @@ BEGIN { lprefix1_expr = "\\((66|!F3)\\)" lprefix2_expr = "\\(F3\\)" - lprefix3_expr = "\\((F2|!F3)\\)" + lprefix3_expr = "\\((F2|!F3|66\\&F2)\\)" lprefix_expr = "\\((66|F2|F3)\\)" max_lprefix = 4 @@ -83,6 +83,8 @@ BEGIN { prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" prefix_num["REPNE"] = "INAT_PFX_REPNE" prefix_num["REP/REPE"] = "INAT_PFX_REPE" + prefix_num["XACQUIRE"] = "INAT_PFX_REPNE" + prefix_num["XRELEASE"] = "INAT_PFX_REPE" prefix_num["LOCK"] = "INAT_PFX_LOCK" prefix_num["SEG=CS"] = "INAT_PFX_CS" prefix_num["SEG=DS"] = "INAT_PFX_DS" diff --git a/arch/x86/um/os-Linux/prctl.c b/arch/x86/um/os-Linux/prctl.c index 9d34eddb517..96eb2bd2883 100644 --- a/arch/x86/um/os-Linux/prctl.c +++ b/arch/x86/um/os-Linux/prctl.c @@ -4,7 +4,7 @@ */ #include <sys/ptrace.h> -#include <linux/ptrace.h> +#include <asm/ptrace.h> int os_arch_prctl(int pid, int code, unsigned long *addr) { diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index c74436e687b..72074d52840 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -85,15 +85,18 @@ static notrace cycle_t vread_pvclock(int *mode) cycle_t ret; u64 last; u32 version; - u32 migrate_count; u8 flags; unsigned cpu, cpu1; /* - * When looping to get a consistent (time-info, tsc) pair, we - * also need to deal with the possibility we can switch vcpus, - * so make sure we always re-fetch time-info for the current vcpu. + * Note: hypervisor must guarantee that: + * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. + * 2. that per-CPU pvclock time info is updated if the + * underlying CPU changes. + * 3. that version is increased whenever underlying CPU + * changes. + * */ do { cpu = __getcpu() & VGETCPU_CPU_MASK; @@ -104,8 +107,6 @@ static notrace cycle_t vread_pvclock(int *mode) pvti = get_pvti(cpu); - migrate_count = pvti->migrate_count; - version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); /* @@ -117,8 +118,7 @@ static notrace cycle_t vread_pvclock(int *mode) cpu1 = __getcpu() & VGETCPU_CPU_MASK; } while (unlikely(cpu != cpu1 || (pvti->pvti.version & 1) || - pvti->pvti.version != version || - pvti->migrate_count != migrate_count)); + pvti->pvti.version != version)); if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) *mode = VCLOCK_NONE; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 193097ef3d7..fa6ade76ef3 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -427,8 +427,7 @@ static void __init xen_init_cpuid_mask(void) if (!xen_initial_domain()) cpuid_leaf1_edx_mask &= - ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ - (1 << X86_FEATURE_ACPI)); /* disable ACPI */ + ~((1 << X86_FEATURE_ACPI)); /* disable ACPI */ cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32)); @@ -735,8 +734,7 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, addr = (unsigned long)xen_int3; else if (addr == (unsigned long)stack_segment) addr = (unsigned long)xen_stack_segment; - else if (addr == (unsigned long)double_fault || - addr == (unsigned long)nmi) { + else if (addr == (unsigned long)double_fault) { /* Don't need to handle these */ return 0; #ifdef CONFIG_X86_MCE @@ -747,7 +745,12 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, */ ; #endif - } else { + } else if (addr == (unsigned long)nmi) + /* + * Use the native version as well. + */ + ; + else { /* Some other trap using IST? */ if (WARN_ON(val->ist != 0)) return 0; @@ -1689,7 +1692,6 @@ static int xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action, case CPU_UP_PREPARE: xen_vcpu_setup(cpu); if (xen_have_vector_callback) { - xen_init_lock_cpu(cpu); if (xen_feature(XENFEAT_hvm_safe_pvclock)) xen_setup_timer(cpu); } @@ -1710,6 +1712,8 @@ static void __init xen_hvm_guest_init(void) xen_hvm_init_shared_info(); + xen_panic_handler_init(); + if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; xen_hvm_smp_init(); @@ -1720,15 +1724,12 @@ static void __init xen_hvm_guest_init(void) xen_hvm_init_mmu_ops(); } -static bool __init xen_hvm_platform(void) +static uint32_t __init xen_hvm_platform(void) { if (xen_pv_domain()) - return false; - - if (!xen_cpuid_base()) - return false; + return 0; - return true; + return xen_cpuid_base(); } bool xen_hvm_need_lapic(void) diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 01a4dc015ae..0da7f863056 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -47,23 +47,18 @@ static void xen_restore_fl(unsigned long flags) /* convert from IF type flag */ flags = !(flags & X86_EFLAGS_IF); - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ + /* See xen_irq_enable() for why preemption must be disabled. */ preempt_disable(); vcpu = this_cpu_read(xen_vcpu); vcpu->evtchn_upcall_mask = flags; - preempt_enable_no_resched(); - - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. */ if (flags == 0) { - preempt_check_resched(); barrier(); /* unmask then check (avoid races) */ if (unlikely(vcpu->evtchn_upcall_pending)) xen_force_evtchn_callback(); - } + preempt_enable(); + } else + preempt_enable_no_resched(); } PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl); @@ -82,10 +77,12 @@ static void xen_irq_enable(void) { struct vcpu_info *vcpu; - /* We don't need to worry about being preempted here, since - either a) interrupts are disabled, so no preemption, or b) - the caller is confused and is trying to re-enable interrupts - on an indeterminate processor. */ + /* + * We may be preempted as soon as vcpu->evtchn_upcall_mask is + * cleared, so disable preemption to ensure we check for + * events on the VCPU we are still running on. + */ + preempt_disable(); vcpu = this_cpu_read(xen_vcpu); vcpu->evtchn_upcall_mask = 0; @@ -96,6 +93,8 @@ static void xen_irq_enable(void) barrier(); /* unmask then check (avoid races) */ if (unlikely(vcpu->evtchn_upcall_pending)) xen_force_evtchn_callback(); + + preempt_enable(); } PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 95fb2aa5927..a61c7d5811b 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -161,6 +161,7 @@ #include <asm/xen/page.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> +#include <xen/balloon.h> #include <xen/grant_table.h> #include "multicalls.h" @@ -878,7 +879,6 @@ int m2p_add_override(unsigned long mfn, struct page *page, unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; - int ret = 0; pfn = page_to_pfn(page); if (!PageHighMem(page)) { @@ -925,8 +925,8 @@ int m2p_add_override(unsigned long mfn, struct page *page, * frontend pages while they are being shared with the backend, * because mfn_to_pfn (that ends up being called by GUPF) will * return the backend pfn rather than the frontend pfn. */ - ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); - if (ret == 0 && get_phys_to_machine(pfn) == mfn) + pfn = mfn_to_pfn_no_overrides(mfn); + if (get_phys_to_machine(pfn) == mfn) set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); return 0; @@ -941,7 +941,6 @@ int m2p_remove_override(struct page *page, unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; - int ret = 0; pfn = page_to_pfn(page); mfn = get_phys_to_machine(pfn); @@ -967,7 +966,10 @@ int m2p_remove_override(struct page *page, if (kmap_op != NULL) { if (!PageHighMem(page)) { struct multicall_space mcs; - struct gnttab_unmap_grant_ref *unmap_op; + struct gnttab_unmap_and_replace *unmap_op; + struct page *scratch_page = get_balloon_scratch_page(); + unsigned long scratch_page_address = (unsigned long) + __va(page_to_pfn(scratch_page) << PAGE_SHIFT); /* * It might be that we queued all the m2p grant table @@ -986,25 +988,31 @@ int m2p_remove_override(struct page *page, printk(KERN_WARNING "m2p_remove_override: " "pfn %lx mfn %lx, failed to modify kernel mappings", pfn, mfn); + put_balloon_scratch_page(); return -1; } - mcs = xen_mc_entry( - sizeof(struct gnttab_unmap_grant_ref)); + xen_mc_batch(); + + mcs = __xen_mc_entry( + sizeof(struct gnttab_unmap_and_replace)); unmap_op = mcs.args; unmap_op->host_addr = kmap_op->host_addr; + unmap_op->new_addr = scratch_page_address; unmap_op->handle = kmap_op->handle; - unmap_op->dev_bus_addr = 0; MULTI_grant_table_op(mcs.mc, - GNTTABOP_unmap_grant_ref, unmap_op, 1); + GNTTABOP_unmap_and_replace, unmap_op, 1); + + mcs = __xen_mc_entry(0); + MULTI_update_va_mapping(mcs.mc, scratch_page_address, + pfn_pte(page_to_pfn(scratch_page), + PAGE_KERNEL_RO), 0); xen_mc_issue(PARAVIRT_LAZY_MMU); - set_pte_at(&init_mm, address, ptep, - pfn_pte(pfn, PAGE_KERNEL)); - __flush_tlb_single(address); kmap_op->host_addr = 0; + put_balloon_scratch_page(); } } @@ -1019,8 +1027,8 @@ int m2p_remove_override(struct page *page, * the original pfn causes mfn_to_pfn(mfn) to return the frontend * pfn again. */ mfn &= ~FOREIGN_FRAME_BIT; - ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); - if (ret == 0 && get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) && + pfn = mfn_to_pfn_no_overrides(mfn); + if (get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) && m2p_find_override(mfn) == NULL) set_phys_to_machine(pfn, mfn); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 8f3eea6b80c..09f3059cb00 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -33,6 +33,9 @@ /* These are code, but not functions. Defined in entry.S */ extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; +#ifdef CONFIG_X86_64 +extern const char nmi[]; +#endif extern void xen_sysenter_target(void); extern void xen_syscall_target(void); extern void xen_syscall32_target(void); @@ -215,13 +218,19 @@ static void __init xen_set_identity_and_release_chunk( unsigned long pfn; /* - * If the PFNs are currently mapped, the VA mapping also needs - * to be updated to be 1:1. + * If the PFNs are currently mapped, clear the mappings + * (except for the ISA region which must be 1:1 mapped) to + * release the refcounts (in Xen) on the original frames. */ - for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) + for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) { + pte_t pte = __pte_ma(0); + + if (pfn < PFN_UP(ISA_END_ADDRESS)) + pte = mfn_pte(pfn, PAGE_KERNEL_IO); + (void)HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - mfn_pte(pfn, PAGE_KERNEL_IO), 0); + (unsigned long)__va(pfn << PAGE_SHIFT), pte, 0); + } if (start_pfn < nr_pages) *released += xen_release_chunk( @@ -547,7 +556,13 @@ void xen_enable_syscall(void) } #endif /* CONFIG_X86_64 */ } - +void __cpuinit xen_enable_nmi(void) +{ +#ifdef CONFIG_X86_64 + if (register_callback(CALLBACKTYPE_nmi, nmi)) + BUG(); +#endif +} void __init xen_arch_setup(void) { xen_panic_handler_init(); @@ -565,7 +580,7 @@ void __init xen_arch_setup(void) xen_enable_sysenter(); xen_enable_syscall(); - + xen_enable_nmi(); #ifdef CONFIG_ACPI if (!(xen_start_info->flags & SIF_INITDOMAIN)) { printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index b81c88e51da..31d04758b76 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -273,12 +273,30 @@ static void __init xen_smp_prepare_boot_cpu(void) BUG_ON(smp_processor_id() != 0); native_smp_prepare_boot_cpu(); - /* We've switched to the "real" per-cpu gdt, so make sure the - old memory can be recycled */ - make_lowmem_page_readwrite(xen_initial_gdt); + if (xen_pv_domain()) { + /* We've switched to the "real" per-cpu gdt, so make sure the + old memory can be recycled */ + make_lowmem_page_readwrite(xen_initial_gdt); - xen_filter_cpu_maps(); - xen_setup_vcpu_info_placement(); +#ifdef CONFIG_X86_32 + /* + * Xen starts us with XEN_FLAT_RING1_DS, but linux code + * expects __USER_DS + */ + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); +#endif + + xen_filter_cpu_maps(); + xen_setup_vcpu_info_placement(); + } + /* + * The alternative logic (which patches the unlock/lock) runs before + * the smp bootup up code is activated. Hence we need to set this up + * the core kernel is being patched. Otherwise we will have only + * modules patched but not core code. + */ + xen_init_spinlocks(); } static void __init xen_smp_prepare_cpus(unsigned int max_cpus) @@ -572,6 +590,12 @@ static inline int xen_map_vector(int vector) case IRQ_WORK_VECTOR: xen_vector = XEN_IRQ_WORK_VECTOR; break; +#ifdef CONFIG_X86_64 + case NMI_VECTOR: + case APIC_DM_NMI: /* Some use that instead of NMI_VECTOR */ + xen_vector = XEN_NMI_VECTOR; + break; +#endif default: xen_vector = -1; printk(KERN_ERR "xen: vector 0x%x is not implemented\n", @@ -680,7 +704,6 @@ void __init xen_smp_init(void) { smp_ops = xen_smp_ops; xen_fill_possible_map(); - xen_init_spinlocks(); } static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) @@ -703,6 +726,15 @@ static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) WARN_ON(rc); if (!rc) rc = native_cpu_up(cpu, tidle); + + /* + * We must initialize the slowpath CPU kicker _after_ the native + * path has executed. If we initialized it before none of the + * unlocker IPI kicks would reach the booting CPU as the booting + * CPU had not set itself 'online' in cpu_online_mask. That mask + * is checked when IPIs are sent (on HVM at least). + */ + xen_init_lock_cpu(cpu); return rc; } @@ -722,4 +754,5 @@ void __init xen_hvm_smp_init(void) smp_ops.cpu_die = xen_hvm_cpu_die; smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; + smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu; } diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index cf3caee356b..be6b8607895 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -17,45 +17,44 @@ #include "xen-ops.h" #include "debugfs.h" -#ifdef CONFIG_XEN_DEBUG_FS -static struct xen_spinlock_stats -{ - u64 taken; - u32 taken_slow; - u32 taken_slow_nested; - u32 taken_slow_pickup; - u32 taken_slow_spurious; - u32 taken_slow_irqenable; +enum xen_contention_stat { + TAKEN_SLOW, + TAKEN_SLOW_PICKUP, + TAKEN_SLOW_SPURIOUS, + RELEASED_SLOW, + RELEASED_SLOW_KICKED, + NR_CONTENTION_STATS +}; - u64 released; - u32 released_slow; - u32 released_slow_kicked; +#ifdef CONFIG_XEN_DEBUG_FS #define HISTO_BUCKETS 30 - u32 histo_spin_total[HISTO_BUCKETS+1]; - u32 histo_spin_spinning[HISTO_BUCKETS+1]; +static struct xen_spinlock_stats +{ + u32 contention_stats[NR_CONTENTION_STATS]; u32 histo_spin_blocked[HISTO_BUCKETS+1]; - - u64 time_total; - u64 time_spinning; u64 time_blocked; } spinlock_stats; static u8 zero_stats; -static unsigned lock_timeout = 1 << 10; -#define TIMEOUT lock_timeout - static inline void check_zero(void) { - if (unlikely(zero_stats)) { - memset(&spinlock_stats, 0, sizeof(spinlock_stats)); - zero_stats = 0; + u8 ret; + u8 old = ACCESS_ONCE(zero_stats); + if (unlikely(old)) { + ret = cmpxchg(&zero_stats, old, 0); + /* This ensures only one fellow resets the stat */ + if (ret == old) + memset(&spinlock_stats, 0, sizeof(spinlock_stats)); } } -#define ADD_STATS(elem, val) \ - do { check_zero(); spinlock_stats.elem += (val); } while(0) +static inline void add_stats(enum xen_contention_stat var, u32 val) +{ + check_zero(); + spinlock_stats.contention_stats[var] += val; +} static inline u64 spin_time_start(void) { @@ -74,22 +73,6 @@ static void __spin_time_accum(u64 delta, u32 *array) array[HISTO_BUCKETS]++; } -static inline void spin_time_accum_spinning(u64 start) -{ - u32 delta = xen_clocksource_read() - start; - - __spin_time_accum(delta, spinlock_stats.histo_spin_spinning); - spinlock_stats.time_spinning += delta; -} - -static inline void spin_time_accum_total(u64 start) -{ - u32 delta = xen_clocksource_read() - start; - - __spin_time_accum(delta, spinlock_stats.histo_spin_total); - spinlock_stats.time_total += delta; -} - static inline void spin_time_accum_blocked(u64 start) { u32 delta = xen_clocksource_read() - start; @@ -98,263 +81,138 @@ static inline void spin_time_accum_blocked(u64 start) spinlock_stats.time_blocked += delta; } #else /* !CONFIG_XEN_DEBUG_FS */ -#define TIMEOUT (1 << 10) -#define ADD_STATS(elem, val) do { (void)(val); } while(0) +static inline void add_stats(enum xen_contention_stat var, u32 val) +{ +} static inline u64 spin_time_start(void) { return 0; } -static inline void spin_time_accum_total(u64 start) -{ -} -static inline void spin_time_accum_spinning(u64 start) -{ -} static inline void spin_time_accum_blocked(u64 start) { } #endif /* CONFIG_XEN_DEBUG_FS */ -/* - * Size struct xen_spinlock so it's the same as arch_spinlock_t. - */ -#if NR_CPUS < 256 -typedef u8 xen_spinners_t; -# define inc_spinners(xl) \ - asm(LOCK_PREFIX " incb %0" : "+m" ((xl)->spinners) : : "memory"); -# define dec_spinners(xl) \ - asm(LOCK_PREFIX " decb %0" : "+m" ((xl)->spinners) : : "memory"); -#else -typedef u16 xen_spinners_t; -# define inc_spinners(xl) \ - asm(LOCK_PREFIX " incw %0" : "+m" ((xl)->spinners) : : "memory"); -# define dec_spinners(xl) \ - asm(LOCK_PREFIX " decw %0" : "+m" ((xl)->spinners) : : "memory"); -#endif - -struct xen_spinlock { - unsigned char lock; /* 0 -> free; 1 -> locked */ - xen_spinners_t spinners; /* count of waiting cpus */ +struct xen_lock_waiting { + struct arch_spinlock *lock; + __ticket_t want; }; -static int xen_spin_is_locked(struct arch_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - return xl->lock != 0; -} - -static int xen_spin_is_contended(struct arch_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - /* Not strictly true; this is only the count of contended - lock-takers entering the slow path. */ - return xl->spinners != 0; -} - -static int xen_spin_trylock(struct arch_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - u8 old = 1; - - asm("xchgb %b0,%1" - : "+q" (old), "+m" (xl->lock) : : "memory"); - - return old == 0; -} - -static DEFINE_PER_CPU(char *, irq_name); static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; -static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); - -/* - * Mark a cpu as interested in a lock. Returns the CPU's previous - * lock of interest, in case we got preempted by an interrupt. - */ -static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl) -{ - struct xen_spinlock *prev; - - prev = __this_cpu_read(lock_spinners); - __this_cpu_write(lock_spinners, xl); - - wmb(); /* set lock of interest before count */ - - inc_spinners(xl); - - return prev; -} - -/* - * Mark a cpu as no longer interested in a lock. Restores previous - * lock of interest (NULL for none). - */ -static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev) -{ - dec_spinners(xl); - wmb(); /* decrement count before restoring lock */ - __this_cpu_write(lock_spinners, prev); -} +static DEFINE_PER_CPU(char *, irq_name); +static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting); +static cpumask_t waiting_cpus; -static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable) +static bool xen_pvspin = true; +static void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) { - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - struct xen_spinlock *prev; int irq = __this_cpu_read(lock_kicker_irq); - int ret; + struct xen_lock_waiting *w = &__get_cpu_var(lock_waiting); + int cpu = smp_processor_id(); u64 start; + unsigned long flags; /* If kicker interrupts not initialized yet, just spin */ if (irq == -1) - return 0; + return; start = spin_time_start(); - /* announce we're spinning */ - prev = spinning_lock(xl); + /* + * Make sure an interrupt handler can't upset things in a + * partially setup state. + */ + local_irq_save(flags); + /* + * We don't really care if we're overwriting some other + * (lock,want) pair, as that would mean that we're currently + * in an interrupt context, and the outer context had + * interrupts enabled. That has already kicked the VCPU out + * of xen_poll_irq(), so it will just return spuriously and + * retry with newly setup (lock,want). + * + * The ordering protocol on this is that the "lock" pointer + * may only be set non-NULL if the "want" ticket is correct. + * If we're updating "want", we must first clear "lock". + */ + w->lock = NULL; + smp_wmb(); + w->want = want; + smp_wmb(); + w->lock = lock; - ADD_STATS(taken_slow, 1); - ADD_STATS(taken_slow_nested, prev != NULL); + /* This uses set_bit, which atomic and therefore a barrier */ + cpumask_set_cpu(cpu, &waiting_cpus); + add_stats(TAKEN_SLOW, 1); - do { - unsigned long flags; + /* clear pending */ + xen_clear_irq_pending(irq); - /* clear pending */ - xen_clear_irq_pending(irq); + /* Only check lock once pending cleared */ + barrier(); - /* check again make sure it didn't become free while - we weren't looking */ - ret = xen_spin_trylock(lock); - if (ret) { - ADD_STATS(taken_slow_pickup, 1); + /* + * Mark entry to slowpath before doing the pickup test to make + * sure we don't deadlock with an unlocker. + */ + __ticket_enter_slowpath(lock); - /* - * If we interrupted another spinlock while it - * was blocking, make sure it doesn't block - * without rechecking the lock. - */ - if (prev != NULL) - xen_set_irq_pending(irq); - goto out; - } + /* + * check again make sure it didn't become free while + * we weren't looking + */ + if (ACCESS_ONCE(lock->tickets.head) == want) { + add_stats(TAKEN_SLOW_PICKUP, 1); + goto out; + } - flags = arch_local_save_flags(); - if (irq_enable) { - ADD_STATS(taken_slow_irqenable, 1); - raw_local_irq_enable(); - } + /* Allow interrupts while blocked */ + local_irq_restore(flags); - /* - * Block until irq becomes pending. If we're - * interrupted at this point (after the trylock but - * before entering the block), then the nested lock - * handler guarantees that the irq will be left - * pending if there's any chance the lock became free; - * xen_poll_irq() returns immediately if the irq is - * pending. - */ - xen_poll_irq(irq); + /* + * If an interrupt happens here, it will leave the wakeup irq + * pending, which will cause xen_poll_irq() to return + * immediately. + */ - raw_local_irq_restore(flags); + /* Block until irq becomes pending (or perhaps a spurious wakeup) */ + xen_poll_irq(irq); + add_stats(TAKEN_SLOW_SPURIOUS, !xen_test_irq_pending(irq)); - ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); - } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ + local_irq_save(flags); kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); - out: - unspinning_lock(xl, prev); - spin_time_accum_blocked(start); - - return ret; -} - -static inline void __xen_spin_lock(struct arch_spinlock *lock, bool irq_enable) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - unsigned timeout; - u8 oldval; - u64 start_spin; - - ADD_STATS(taken, 1); - - start_spin = spin_time_start(); - - do { - u64 start_spin_fast = spin_time_start(); + cpumask_clear_cpu(cpu, &waiting_cpus); + w->lock = NULL; - timeout = TIMEOUT; + local_irq_restore(flags); - asm("1: xchgb %1,%0\n" - " testb %1,%1\n" - " jz 3f\n" - "2: rep;nop\n" - " cmpb $0,%0\n" - " je 1b\n" - " dec %2\n" - " jnz 2b\n" - "3:\n" - : "+m" (xl->lock), "=q" (oldval), "+r" (timeout) - : "1" (1) - : "memory"); - - spin_time_accum_spinning(start_spin_fast); - - } while (unlikely(oldval != 0 && - (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable)))); - - spin_time_accum_total(start_spin); -} - -static void xen_spin_lock(struct arch_spinlock *lock) -{ - __xen_spin_lock(lock, false); -} - -static void xen_spin_lock_flags(struct arch_spinlock *lock, unsigned long flags) -{ - __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags)); + spin_time_accum_blocked(start); } +PV_CALLEE_SAVE_REGS_THUNK(xen_lock_spinning); -static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) +static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next) { int cpu; - ADD_STATS(released_slow, 1); + add_stats(RELEASED_SLOW, 1); - for_each_online_cpu(cpu) { - /* XXX should mix up next cpu selection */ - if (per_cpu(lock_spinners, cpu) == xl) { - ADD_STATS(released_slow_kicked, 1); + for_each_cpu(cpu, &waiting_cpus) { + const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu); + + /* Make sure we read lock before want */ + if (ACCESS_ONCE(w->lock) == lock && + ACCESS_ONCE(w->want) == next) { + add_stats(RELEASED_SLOW_KICKED, 1); xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); + break; } } } -static void xen_spin_unlock(struct arch_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - ADD_STATS(released, 1); - - smp_wmb(); /* make sure no writes get moved after unlock */ - xl->lock = 0; /* release lock */ - - /* - * Make sure unlock happens before checking for waiting - * spinners. We need a strong barrier to enforce the - * write-read ordering to different memory locations, as the - * CPU makes no implied guarantees about their ordering. - */ - mb(); - - if (unlikely(xl->spinners)) - xen_spin_unlock_slow(xl); -} - static irqreturn_t dummy_handler(int irq, void *dev_id) { BUG(); @@ -366,16 +224,12 @@ void xen_init_lock_cpu(int cpu) int irq; char *name; + if (!xen_pvspin) + return; + WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n", cpu, per_cpu(lock_kicker_irq, cpu)); - /* - * See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23 - * (xen: disable PV spinlocks on HVM) - */ - if (xen_hvm_domain()) - return; - name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, cpu, @@ -395,11 +249,7 @@ void xen_init_lock_cpu(int cpu) void xen_uninit_lock_cpu(int cpu) { - /* - * See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23 - * (xen: disable PV spinlocks on HVM) - */ - if (xen_hvm_domain()) + if (!xen_pvspin) return; unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL); @@ -408,24 +258,49 @@ void xen_uninit_lock_cpu(int cpu) per_cpu(irq_name, cpu) = NULL; } + +/* + * Our init of PV spinlocks is split in two init functions due to us + * using paravirt patching and jump labels patching and having to do + * all of this before SMP code is invoked. + * + * The paravirt patching needs to be done _before_ the alternative asm code + * is started, otherwise we would not patch the core kernel code. + */ void __init xen_init_spinlocks(void) { - /* - * See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23 - * (xen: disable PV spinlocks on HVM) - */ - if (xen_hvm_domain()) + + if (!xen_pvspin) { + printk(KERN_DEBUG "xen: PV spinlocks disabled\n"); return; + } + + pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning); + pv_lock_ops.unlock_kick = xen_unlock_kick; +} + +/* + * While the jump_label init code needs to happend _after_ the jump labels are + * enabled and before SMP is started. Hence we use pre-SMP initcall level + * init. We cannot do it in xen_init_spinlocks as that is done before + * jump labels are activated. + */ +static __init int xen_init_spinlocks_jump(void) +{ + if (!xen_pvspin) + return 0; - BUILD_BUG_ON(sizeof(struct xen_spinlock) > sizeof(arch_spinlock_t)); + static_key_slow_inc(¶virt_ticketlocks_enabled); + return 0; +} +early_initcall(xen_init_spinlocks_jump); - pv_lock_ops.spin_is_locked = xen_spin_is_locked; - pv_lock_ops.spin_is_contended = xen_spin_is_contended; - pv_lock_ops.spin_lock = xen_spin_lock; - pv_lock_ops.spin_lock_flags = xen_spin_lock_flags; - pv_lock_ops.spin_trylock = xen_spin_trylock; - pv_lock_ops.spin_unlock = xen_spin_unlock; +static __init int xen_parse_nopvspin(char *arg) +{ + xen_pvspin = false; + return 0; } +early_param("xen_nopvspin", xen_parse_nopvspin); #ifdef CONFIG_XEN_DEBUG_FS @@ -438,41 +313,28 @@ static int __init xen_spinlock_debugfs(void) if (d_xen == NULL) return -ENOMEM; + if (!xen_pvspin) + return 0; + d_spin_debug = debugfs_create_dir("spinlocks", d_xen); debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); - debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout); - - debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken); debugfs_create_u32("taken_slow", 0444, d_spin_debug, - &spinlock_stats.taken_slow); - debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug, - &spinlock_stats.taken_slow_nested); + &spinlock_stats.contention_stats[TAKEN_SLOW]); debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, - &spinlock_stats.taken_slow_pickup); + &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]); debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug, - &spinlock_stats.taken_slow_spurious); - debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug, - &spinlock_stats.taken_slow_irqenable); + &spinlock_stats.contention_stats[TAKEN_SLOW_SPURIOUS]); - debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released); debugfs_create_u32("released_slow", 0444, d_spin_debug, - &spinlock_stats.released_slow); + &spinlock_stats.contention_stats[RELEASED_SLOW]); debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, - &spinlock_stats.released_slow_kicked); + &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]); - debugfs_create_u64("time_spinning", 0444, d_spin_debug, - &spinlock_stats.time_spinning); debugfs_create_u64("time_blocked", 0444, d_spin_debug, &spinlock_stats.time_blocked); - debugfs_create_u64("time_total", 0444, d_spin_debug, - &spinlock_stats.time_total); - debugfs_create_u32_array("histo_total", 0444, d_spin_debug, - spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1); - debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, - spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1); debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 86782c5d7e2..95f8c614232 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -105,9 +105,9 @@ static inline void __init xen_init_apic(void) /* Declare an asm function, along with symbols needed to make it inlineable */ #define DECL_ASM(ret, name, ...) \ - ret name(__VA_ARGS__); \ - extern char name##_end[]; \ - extern char name##_reloc[] \ + __visible ret name(__VA_ARGS__); \ + extern char name##_end[] __visible; \ + extern char name##_reloc[] __visible DECL_ASM(void, xen_irq_enable_direct, void); DECL_ASM(void, xen_irq_disable_direct, void); @@ -115,11 +115,11 @@ DECL_ASM(unsigned long, xen_save_fl_direct, void); DECL_ASM(void, xen_restore_fl_direct, unsigned long); /* These are not functions, and cannot be called normally */ -void xen_iret(void); -void xen_sysexit(void); -void xen_sysret32(void); -void xen_sysret64(void); -void xen_adjust_exception_frame(void); +__visible void xen_iret(void); +__visible void xen_sysexit(void); +__visible void xen_sysret32(void); +__visible void xen_sysret64(void); +__visible void xen_adjust_exception_frame(void); extern int xen_panic_handler_init(void); |