summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig14
-rw-r--r--arch/x86/Kconfig.debug9
-rw-r--r--arch/x86/Makefile5
-rw-r--r--arch/x86/boot/Makefile2
-rw-r--r--arch/x86/boot/compressed/eboot.c1017
-rw-r--r--arch/x86/boot/compressed/eboot.h60
-rw-r--r--arch/x86/boot/compressed/efi_stub_64.S29
-rw-r--r--arch/x86/boot/compressed/head_32.S52
-rw-r--r--arch/x86/boot/compressed/head_64.S111
-rw-r--r--arch/x86/boot/cpucheck.c20
-rw-r--r--arch/x86/boot/header.S25
-rw-r--r--arch/x86/boot/tools/build.c77
-rw-r--r--arch/x86/include/asm/Kbuild1
-rw-r--r--arch/x86/include/asm/apic.h14
-rw-r--r--arch/x86/include/asm/cpufeature.h5
-rw-r--r--arch/x86/include/asm/efi.h43
-rw-r--r--arch/x86/include/asm/msr.h2
-rw-r--r--arch/x86/include/asm/nmi.h3
-rw-r--r--arch/x86/include/asm/pgtable.h17
-rw-r--r--arch/x86/include/asm/pgtable_types.h4
-rw-r--r--arch/x86/include/asm/special_insns.h8
-rw-r--r--arch/x86/include/asm/topology.h9
-rw-r--r--arch/x86/include/asm/xsave.h16
-rw-r--r--arch/x86/include/uapi/asm/msr-index.h75
-rw-r--r--arch/x86/kernel/acpi/boot.c8
-rw-r--r--arch/x86/kernel/apic/apic.c13
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c4
-rw-r--r--arch/x86/kernel/apic/apic_noop.c3
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c2
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c3
-rw-r--r--arch/x86/kernel/apic/es7000_32.c12
-rw-r--r--arch/x86/kernel/apic/numaq_32.c3
-rw-r--r--arch/x86/kernel/apic/probe_32.c3
-rw-r--r--arch/x86/kernel/apic/summit_32.c3
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c2
-rw-r--r--arch/x86/kernel/cpu/amd.c50
-rw-r--r--arch/x86/kernel/cpu/intel.c53
-rw-r--r--arch/x86/kernel/cpu/perf_event.c47
-rw-r--r--arch/x86/kernel/cpu/perf_event.h8
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c544
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.h5
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c34
-rw-r--r--arch/x86/kernel/crash.c2
-rw-r--r--arch/x86/kernel/dumpstack_32.c2
-rw-r--r--arch/x86/kernel/nmi.c37
-rw-r--r--arch/x86/kernel/process.c5
-rw-r--r--arch/x86/kernel/setup.c6
-rw-r--r--arch/x86/kernel/smpboot.c16
-rw-r--r--arch/x86/lib/hash.c22
-rw-r--r--arch/x86/lib/msr.c89
-rw-r--r--arch/x86/mm/dump_pagetables.c84
-rw-r--r--arch/x86/mm/fault.c7
-rw-r--r--arch/x86/mm/pageattr.c70
-rw-r--r--arch/x86/platform/efi/Makefile1
-rw-r--r--arch/x86/platform/efi/efi.c415
-rw-r--r--arch/x86/platform/efi/efi_32.c7
-rw-r--r--arch/x86/platform/efi/efi_64.c368
-rw-r--r--arch/x86/platform/efi/efi_stub_64.S166
-rw-r--r--arch/x86/platform/efi/efi_thunk_64.S65
-rw-r--r--arch/x86/xen/mmu.c4
62 files changed, 3019 insertions, 766 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0af5250d914..8453fe1342e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1585,6 +1585,20 @@ config EFI_STUB
See Documentation/efi-stub.txt for more information.
+config EFI_MIXED
+ bool "EFI mixed-mode support"
+ depends on EFI_STUB && X86_64
+ ---help---
+ Enabling this feature allows a 64-bit kernel to be booted
+ on a 32-bit firmware, provided that your CPU supports 64-bit
+ mode.
+
+ Note that it is not possible to boot a mixed-mode enabled
+ kernel via the EFI boot stub - a bootloader that supports
+ the EFI handover protocol must be used.
+
+ If unsure, say N.
+
config SECCOMP
def_bool y
prompt "Enable seccomp to safely compute untrusted bytecode"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 321a52ccf63..61bd2ad9428 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -81,6 +81,15 @@ config X86_PTDUMP
kernel.
If in doubt, say "N"
+config EFI_PGT_DUMP
+ bool "Dump the EFI pagetable"
+ depends on EFI && X86_PTDUMP
+ ---help---
+ Enable this if you want to dump the EFI page table before
+ enabling virtual mode. This can be used to debug miscellaneous
+ issues with the mapping of the EFI runtime regions into that
+ table.
+
config DEBUG_RODATA
bool "Write protect kernel read-only data structures"
default y
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index eeda43abed6..3b9348a0c1a 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -82,8 +82,8 @@ else
KBUILD_AFLAGS += -m64
KBUILD_CFLAGS += -m64
- # Don't autogenerate MMX or SSE instructions
- KBUILD_CFLAGS += -mno-mmx -mno-sse
+ # Don't autogenerate traditional x87, MMX or SSE instructions
+ KBUILD_CFLAGS += -mno-mmx -mno-sse -mno-80387 -mno-fp-ret-in-387
# Use -mpreferred-stack-boundary=3 if supported.
KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3)
@@ -152,6 +152,7 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI
# does binutils support specific instructions?
asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
+asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1)
avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 878df7e88cd..abb9eba61b5 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -80,7 +80,7 @@ targets += voffset.h
$(obj)/voffset.h: vmlinux FORCE
$(call if_changed,voffset)
-sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(startup_32\|startup_64\|efi_pe_entry\|efi_stub_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p'
+sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p'
quiet_cmd_zoffset = ZOFFSET $@
cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index a7677babf94..1e6146137f8 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -19,10 +19,272 @@
static efi_system_table_t *sys_table;
+static struct efi_config *efi_early;
+
+#define efi_call_early(f, ...) \
+ efi_early->call(efi_early->f, __VA_ARGS__);
+
+#define BOOT_SERVICES(bits) \
+static void setup_boot_services##bits(struct efi_config *c) \
+{ \
+ efi_system_table_##bits##_t *table; \
+ efi_boot_services_##bits##_t *bt; \
+ \
+ table = (typeof(table))sys_table; \
+ \
+ c->text_output = table->con_out; \
+ \
+ bt = (typeof(bt))(unsigned long)(table->boottime); \
+ \
+ c->allocate_pool = bt->allocate_pool; \
+ c->allocate_pages = bt->allocate_pages; \
+ c->get_memory_map = bt->get_memory_map; \
+ c->free_pool = bt->free_pool; \
+ c->free_pages = bt->free_pages; \
+ c->locate_handle = bt->locate_handle; \
+ c->handle_protocol = bt->handle_protocol; \
+ c->exit_boot_services = bt->exit_boot_services; \
+}
+BOOT_SERVICES(32);
+BOOT_SERVICES(64);
-#include "../../../../drivers/firmware/efi/efi-stub-helper.c"
+static void efi_printk(efi_system_table_t *, char *);
+static void efi_char16_printk(efi_system_table_t *, efi_char16_t *);
+
+static efi_status_t
+__file_size32(void *__fh, efi_char16_t *filename_16,
+ void **handle, u64 *file_sz)
+{
+ efi_file_handle_32_t *h, *fh = __fh;
+ efi_file_info_t *info;
+ efi_status_t status;
+ efi_guid_t info_guid = EFI_FILE_INFO_ID;
+ u32 info_sz;
+
+ status = efi_early->call((unsigned long)fh->open, fh, &h, filename_16,
+ EFI_FILE_MODE_READ, (u64)0);
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "Failed to open file: ");
+ efi_char16_printk(sys_table, filename_16);
+ efi_printk(sys_table, "\n");
+ return status;
+ }
+
+ *handle = h;
+
+ info_sz = 0;
+ status = efi_early->call((unsigned long)h->get_info, h, &info_guid,
+ &info_sz, NULL);
+ if (status != EFI_BUFFER_TOO_SMALL) {
+ efi_printk(sys_table, "Failed to get file info size\n");
+ return status;
+ }
+
+grow:
+ status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
+ info_sz, (void **)&info);
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "Failed to alloc mem for file info\n");
+ return status;
+ }
+
+ status = efi_early->call((unsigned long)h->get_info, h, &info_guid,
+ &info_sz, info);
+ if (status == EFI_BUFFER_TOO_SMALL) {
+ efi_call_early(free_pool, info);
+ goto grow;
+ }
+
+ *file_sz = info->file_size;
+ efi_call_early(free_pool, info);
+
+ if (status != EFI_SUCCESS)
+ efi_printk(sys_table, "Failed to get initrd info\n");
+
+ return status;
+}
+
+static efi_status_t
+__file_size64(void *__fh, efi_char16_t *filename_16,
+ void **handle, u64 *file_sz)
+{
+ efi_file_handle_64_t *h, *fh = __fh;
+ efi_file_info_t *info;
+ efi_status_t status;
+ efi_guid_t info_guid = EFI_FILE_INFO_ID;
+ u32 info_sz;
+ status = efi_early->call((unsigned long)fh->open, fh, &h, filename_16,
+ EFI_FILE_MODE_READ, (u64)0);
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "Failed to open file: ");
+ efi_char16_printk(sys_table, filename_16);
+ efi_printk(sys_table, "\n");
+ return status;
+ }
+ *handle = h;
+
+ info_sz = 0;
+ status = efi_early->call((unsigned long)h->get_info, h, &info_guid,
+ &info_sz, NULL);
+ if (status != EFI_BUFFER_TOO_SMALL) {
+ efi_printk(sys_table, "Failed to get file info size\n");
+ return status;
+ }
+
+grow:
+ status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
+ info_sz, (void **)&info);
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "Failed to alloc mem for file info\n");
+ return status;
+ }
+
+ status = efi_early->call((unsigned long)h->get_info, h, &info_guid,
+ &info_sz, info);
+ if (status == EFI_BUFFER_TOO_SMALL) {
+ efi_call_early(free_pool, info);
+ goto grow;
+ }
+
+ *file_sz = info->file_size;
+ efi_call_early(free_pool, info);
+
+ if (status != EFI_SUCCESS)
+ efi_printk(sys_table, "Failed to get initrd info\n");
+
+ return status;
+}
+static efi_status_t
+efi_file_size(efi_system_table_t *sys_table, void *__fh,
+ efi_char16_t *filename_16, void **handle, u64 *file_sz)
+{
+ if (efi_early->is64)
+ return __file_size64(__fh, filename_16, handle, file_sz);
+
+ return __file_size32(__fh, filename_16, handle, file_sz);
+}
+
+static inline efi_status_t
+efi_file_read(void *__fh, void *handle, unsigned long *size, void *addr)
+{
+ unsigned long func;
+
+ if (efi_early->is64) {
+ efi_file_handle_64_t *fh = __fh;
+
+ func = (unsigned long)fh->read;
+ return efi_early->call(func, handle, size, addr);
+ } else {
+ efi_file_handle_32_t *fh = __fh;
+
+ func = (unsigned long)fh->read;
+ return efi_early->call(func, handle, size, addr);
+ }
+}
+
+static inline efi_status_t efi_file_close(void *__fh, void *handle)
+{
+ if (efi_early->is64) {
+ efi_file_handle_64_t *fh = __fh;
+
+ return efi_early->call((unsigned long)fh->close, handle);
+ } else {
+ efi_file_handle_32_t *fh = __fh;
+
+ return efi_early->call((unsigned long)fh->close, handle);
+ }
+}
+
+static inline efi_status_t __open_volume32(void *__image, void **__fh)
+{
+ efi_file_io_interface_t *io;
+ efi_loaded_image_32_t *image = __image;
+ efi_file_handle_32_t *fh;
+ efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID;
+ efi_status_t status;
+ void *handle = (void *)(unsigned long)image->device_handle;
+ unsigned long func;
+
+ status = efi_call_early(handle_protocol, handle,
+ &fs_proto, (void **)&io);
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "Failed to handle fs_proto\n");
+ return status;
+ }
+
+ func = (unsigned long)io->open_volume;
+ status = efi_early->call(func, io, &fh);
+ if (status != EFI_SUCCESS)
+ efi_printk(sys_table, "Failed to open volume\n");
+
+ *__fh = fh;
+ return status;
+}
+
+static inline efi_status_t __open_volume64(void *__image, void **__fh)
+{
+ efi_file_io_interface_t *io;
+ efi_loaded_image_64_t *image = __image;
+ efi_file_handle_64_t *fh;
+ efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID;
+ efi_status_t status;
+ void *handle = (void *)(unsigned long)image->device_handle;
+ unsigned long func;
+
+ status = efi_call_early(handle_protocol, handle,
+ &fs_proto, (void **)&io);
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "Failed to handle fs_proto\n");
+ return status;
+ }
+
+ func = (unsigned long)io->open_volume;
+ status = efi_early->call(func, io, &fh);
+ if (status != EFI_SUCCESS)
+ efi_printk(sys_table, "Failed to open volume\n");
+
+ *__fh = fh;
+ return status;
+}
+
+static inline efi_status_t
+efi_open_volume(efi_system_table_t *sys_table, void *__image, void **__fh)
+{
+ if (efi_early->is64)
+ return __open_volume64(__image, __fh);
+
+ return __open_volume32(__image, __fh);
+}
+
+static void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
+{
+ unsigned long output_string;
+ size_t offset;
+
+ if (efi_early->is64) {
+ struct efi_simple_text_output_protocol_64 *out;
+ u64 *func;
+
+ offset = offsetof(typeof(*out), output_string);
+ output_string = efi_early->text_output + offset;
+ func = (u64 *)output_string;
+
+ efi_early->call(*func, efi_early->text_output, str);
+ } else {
+ struct efi_simple_text_output_protocol_32 *out;
+ u32 *func;
+
+ offset = offsetof(typeof(*out), output_string);
+ output_string = efi_early->text_output + offset;
+ func = (u32 *)output_string;
+
+ efi_early->call(*func, efi_early->text_output, str);
+ }
+}
+
+#include "../../../../drivers/firmware/efi/efi-stub-helper.c"
static void find_bits(unsigned long mask, u8 *pos, u8 *size)
{
@@ -47,105 +309,97 @@ static void find_bits(unsigned long mask, u8 *pos, u8 *size)
*size = len;
}
-static efi_status_t setup_efi_pci(struct boot_params *params)
+static efi_status_t
+__setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom)
{
- efi_pci_io_protocol *pci;
+ struct pci_setup_rom *rom = NULL;
efi_status_t status;
- void **pci_handle;
- efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID;
- unsigned long nr_pci, size = 0;
- int i;
- struct setup_data *data;
+ unsigned long size;
+ uint64_t attributes;
- data = (struct setup_data *)(unsigned long)params->hdr.setup_data;
+ status = efi_early->call(pci->attributes, pci,
+ EfiPciIoAttributeOperationGet, 0, 0,
+ &attributes);
+ if (status != EFI_SUCCESS)
+ return status;
- while (data && data->next)
- data = (struct setup_data *)(unsigned long)data->next;
+ if (!pci->romimage || !pci->romsize)
+ return EFI_INVALID_PARAMETER;
- status = efi_call_phys5(sys_table->boottime->locate_handle,
- EFI_LOCATE_BY_PROTOCOL, &pci_proto,
- NULL, &size, pci_handle);
+ size = pci->romsize + sizeof(*rom);
- if (status == EFI_BUFFER_TOO_SMALL) {
- status = efi_call_phys3(sys_table->boottime->allocate_pool,
- EFI_LOADER_DATA, size, &pci_handle);
+ status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom);
+ if (status != EFI_SUCCESS)
+ return status;
- if (status != EFI_SUCCESS)
- return status;
+ memset(rom, 0, sizeof(*rom));
- status = efi_call_phys5(sys_table->boottime->locate_handle,
- EFI_LOCATE_BY_PROTOCOL, &pci_proto,
- NULL, &size, pci_handle);
- }
+ rom->data.type = SETUP_PCI;
+ rom->data.len = size - sizeof(struct setup_data);
+ rom->data.next = 0;
+ rom->pcilen = pci->romsize;
+ *__rom = rom;
- if (status != EFI_SUCCESS)
- goto free_handle;
-
- nr_pci = size / sizeof(void *);
- for (i = 0; i < nr_pci; i++) {
- void *h = pci_handle[i];
- uint64_t attributes;
- struct pci_setup_rom *rom;
+ status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
+ PCI_VENDOR_ID, 1, &(rom->vendor));
- status = efi_call_phys3(sys_table->boottime->handle_protocol,
- h, &pci_proto, &pci);
+ if (status != EFI_SUCCESS)
+ goto free_struct;
- if (status != EFI_SUCCESS)
- continue;
+ status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
+ PCI_DEVICE_ID, 1, &(rom->devid));
- if (!pci)
- continue;
+ if (status != EFI_SUCCESS)
+ goto free_struct;
-#ifdef CONFIG_X86_64
- status = efi_call_phys4(pci->attributes, pci,
- EfiPciIoAttributeOperationGet, 0,
- &attributes);
-#else
- status = efi_call_phys5(pci->attributes, pci,
- EfiPciIoAttributeOperationGet, 0, 0,
- &attributes);
-#endif
- if (status != EFI_SUCCESS)
- continue;
+ status = efi_early->call(pci->get_location, pci, &(rom->segment),
+ &(rom->bus), &(rom->device), &(rom->function));
- if (!pci->romimage || !pci->romsize)
- continue;
+ if (status != EFI_SUCCESS)
+ goto free_struct;
- size = pci->romsize + sizeof(*rom);
+ memcpy(rom->romdata, pci->romimage, pci->romsize);
+ return status;
- status = efi_call_phys3(sys_table->boottime->allocate_pool,
- EFI_LOADER_DATA, size, &rom);
+free_struct:
+ efi_call_early(free_pool, rom);
+ return status;
+}
- if (status != EFI_SUCCESS)
- continue;
+static efi_status_t
+setup_efi_pci32(struct boot_params *params, void **pci_handle,
+ unsigned long size)
+{
+ efi_pci_io_protocol_32 *pci = NULL;
+ efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID;
+ u32 *handles = (u32 *)(unsigned long)pci_handle;
+ efi_status_t status;
+ unsigned long nr_pci;
+ struct setup_data *data;
+ int i;
- rom->data.type = SETUP_PCI;
- rom->data.len = size - sizeof(struct setup_data);
- rom->data.next = 0;
- rom->pcilen = pci->romsize;
+ data = (struct setup_data *)(unsigned long)params->hdr.setup_data;
- status = efi_call_phys5(pci->pci.read, pci,
- EfiPciIoWidthUint16, PCI_VENDOR_ID,
- 1, &(rom->vendor));
+ while (data && data->next)
+ data = (struct setup_data *)(unsigned long)data->next;
- if (status != EFI_SUCCESS)
- goto free_struct;
+ nr_pci = size / sizeof(u32);
+ for (i = 0; i < nr_pci; i++) {
+ struct pci_setup_rom *rom = NULL;
+ u32 h = handles[i];
- status = efi_call_phys5(pci->pci.read, pci,
- EfiPciIoWidthUint16, PCI_DEVICE_ID,
- 1, &(rom->devid));
+ status = efi_call_early(handle_protocol, h,
+ &pci_proto, (void **)&pci);
if (status != EFI_SUCCESS)
- goto free_struct;
+ continue;
- status = efi_call_phys5(pci->get_location, pci,
- &(rom->segment), &(rom->bus),
- &(rom->device), &(rom->function));
+ if (!pci)
+ continue;
+ status = __setup_efi_pci32(pci, &rom);
if (status != EFI_SUCCESS)
- goto free_struct;
-
- memcpy(rom->romdata, pci->romimage, pci->romsize);
+ continue;
if (data)
data->next = (unsigned long)rom;
@@ -154,105 +408,155 @@ static efi_status_t setup_efi_pci(struct boot_params *params)
data = (struct setup_data *)rom;
- continue;
- free_struct:
- efi_call_phys1(sys_table->boottime->free_pool, rom);
}
-free_handle:
- efi_call_phys1(sys_table->boottime->free_pool, pci_handle);
return status;
}
-/*
- * See if we have Graphics Output Protocol
- */
-static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto,
- unsigned long size)
+static efi_status_t
+__setup_efi_pci64(efi_pci_io_protocol_64 *pci, struct pci_setup_rom **__rom)
{
- struct efi_graphics_output_protocol *gop, *first_gop;
- struct efi_pixel_bitmask pixel_info;
- unsigned long nr_gops;
+ struct pci_setup_rom *rom;
efi_status_t status;
- void **gop_handle;
- u16 width, height;
- u32 fb_base, fb_size;
- u32 pixels_per_scan_line;
- int pixel_format;
- int i;
+ unsigned long size;
+ uint64_t attributes;
- status = efi_call_phys3(sys_table->boottime->allocate_pool,
- EFI_LOADER_DATA, size, &gop_handle);
+ status = efi_early->call(pci->attributes, pci,
+ EfiPciIoAttributeOperationGet, 0,
+ &attributes);
if (status != EFI_SUCCESS)
return status;
- status = efi_call_phys5(sys_table->boottime->locate_handle,
- EFI_LOCATE_BY_PROTOCOL, proto,
- NULL, &size, gop_handle);
+ if (!pci->romimage || !pci->romsize)
+ return EFI_INVALID_PARAMETER;
+
+ size = pci->romsize + sizeof(*rom);
+
+ status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom);
if (status != EFI_SUCCESS)
- goto free_handle;
+ return status;
- first_gop = NULL;
+ rom->data.type = SETUP_PCI;
+ rom->data.len = size - sizeof(struct setup_data);
+ rom->data.next = 0;
+ rom->pcilen = pci->romsize;
+ *__rom = rom;
- nr_gops = size / sizeof(void *);
- for (i = 0; i < nr_gops; i++) {
- struct efi_graphics_output_mode_info *info;
- efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID;
- bool conout_found = false;
- void *dummy;
- void *h = gop_handle[i];
+ status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
+ PCI_VENDOR_ID, 1, &(rom->vendor));
+
+ if (status != EFI_SUCCESS)
+ goto free_struct;
+
+ status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
+ PCI_DEVICE_ID, 1, &(rom->devid));
+
+ if (status != EFI_SUCCESS)
+ goto free_struct;
+
+ status = efi_early->call(pci->get_location, pci, &(rom->segment),
+ &(rom->bus), &(rom->device), &(rom->function));
+
+ if (status != EFI_SUCCESS)
+ goto free_struct;
+
+ memcpy(rom->romdata, pci->romimage, pci->romsize);
+ return status;
+
+free_struct:
+ efi_call_early(free_pool, rom);
+ return status;
+
+}
+
+static efi_status_t
+setup_efi_pci64(struct boot_params *params, void **pci_handle,
+ unsigned long size)
+{
+ efi_pci_io_protocol_64 *pci = NULL;
+ efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID;
+ u64 *handles = (u64 *)(unsigned long)pci_handle;
+ efi_status_t status;
+ unsigned long nr_pci;
+ struct setup_data *data;
+ int i;
+
+ data = (struct setup_data *)(unsigned long)params->hdr.setup_data;
+
+ while (data && data->next)
+ data = (struct setup_data *)(unsigned long)data->next;
+
+ nr_pci = size / sizeof(u64);
+ for (i = 0; i < nr_pci; i++) {
+ struct pci_setup_rom *rom = NULL;
+ u64 h = handles[i];
+
+ status = efi_call_early(handle_protocol, h,
+ &pci_proto, (void **)&pci);
- status = efi_call_phys3(sys_table->boottime->handle_protocol,
- h, proto, &gop);
if (status != EFI_SUCCESS)
continue;
- status = efi_call_phys3(sys_table->boottime->handle_protocol,
- h, &conout_proto, &dummy);
+ if (!pci)
+ continue;
- if (status == EFI_SUCCESS)
- conout_found = true;
+ status = __setup_efi_pci64(pci, &rom);
+ if (status != EFI_SUCCESS)
+ continue;
- status = efi_call_phys4(gop->query_mode, gop,
- gop->mode->mode, &size, &info);
- if (status == EFI_SUCCESS && (!first_gop || conout_found)) {
- /*
- * Systems that use the UEFI Console Splitter may
- * provide multiple GOP devices, not all of which are
- * backed by real hardware. The workaround is to search
- * for a GOP implementing the ConOut protocol, and if
- * one isn't found, to just fall back to the first GOP.
- */
- width = info->horizontal_resolution;
- height = info->vertical_resolution;
- fb_base = gop->mode->frame_buffer_base;
- fb_size = gop->mode->frame_buffer_size;
- pixel_format = info->pixel_format;
- pixel_info = info->pixel_information;
- pixels_per_scan_line = info->pixels_per_scan_line;
+ if (data)
+ data->next = (unsigned long)rom;
+ else
+ params->hdr.setup_data = (unsigned long)rom;
+
+ data = (struct setup_data *)rom;
- /*
- * Once we've found a GOP supporting ConOut,
- * don't bother looking any further.
- */
- first_gop = gop;
- if (conout_found)
- break;
- }
}
- /* Did we find any GOPs? */
- if (!first_gop)
+ return status;
+}
+
+static efi_status_t setup_efi_pci(struct boot_params *params)
+{
+ efi_status_t status;
+ void **pci_handle = NULL;
+ efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID;
+ unsigned long size = 0;
+
+ status = efi_call_early(locate_handle,
+ EFI_LOCATE_BY_PROTOCOL,
+ &pci_proto, NULL, &size, pci_handle);
+
+ if (status == EFI_BUFFER_TOO_SMALL) {
+ status = efi_call_early(allocate_pool,
+ EFI_LOADER_DATA,
+ size, (void **)&pci_handle);
+
+ if (status != EFI_SUCCESS)
+ return status;
+
+ status = efi_call_early(locate_handle,
+ EFI_LOCATE_BY_PROTOCOL, &pci_proto,
+ NULL, &size, pci_handle);
+ }
+
+ if (status != EFI_SUCCESS)
goto free_handle;
- /* EFI framebuffer */
- si->orig_video_isVGA = VIDEO_TYPE_EFI;
+ if (efi_early->is64)
+ status = setup_efi_pci64(params, pci_handle, size);
+ else
+ status = setup_efi_pci32(params, pci_handle, size);
- si->lfb_width = width;
- si->lfb_height = height;
- si->lfb_base = fb_base;
- si->pages = 1;
+free_handle:
+ efi_call_early(free_pool, pci_handle);
+ return status;
+}
+static void
+setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line,
+ struct efi_pixel_bitmask pixel_info, int pixel_format)
+{
if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) {
si->lfb_depth = 32;
si->lfb_linelength = pixels_per_scan_line * 4;
@@ -297,62 +601,319 @@ static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto,
si->rsvd_size = 0;
si->rsvd_pos = 0;
}
+}
+
+static efi_status_t
+__gop_query32(struct efi_graphics_output_protocol_32 *gop32,
+ struct efi_graphics_output_mode_info **info,
+ unsigned long *size, u32 *fb_base)
+{
+ struct efi_graphics_output_protocol_mode_32 *mode;
+ efi_status_t status;
+ unsigned long m;
+
+ m = gop32->mode;
+ mode = (struct efi_graphics_output_protocol_mode_32 *)m;
+
+ status = efi_early->call(gop32->query_mode, gop32,
+ mode->mode, size, info);
+ if (status != EFI_SUCCESS)
+ return status;
+
+ *fb_base = mode->frame_buffer_base;
+ return status;
+}
+
+static efi_status_t
+setup_gop32(struct screen_info *si, efi_guid_t *proto,
+ unsigned long size, void **gop_handle)
+{
+ struct efi_graphics_output_protocol_32 *gop32, *first_gop;
+ unsigned long nr_gops;
+ u16 width, height;
+ u32 pixels_per_scan_line;
+ u32 fb_base;
+ struct efi_pixel_bitmask pixel_info;
+ int pixel_format;
+ efi_status_t status;
+ u32 *handles = (u32 *)(unsigned long)gop_handle;
+ int i;
+
+ first_gop = NULL;
+ gop32 = NULL;
+
+ nr_gops = size / sizeof(u32);
+ for (i = 0; i < nr_gops; i++) {
+ struct efi_graphics_output_mode_info *info = NULL;
+ efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID;
+ bool conout_found = false;
+ void *dummy = NULL;
+ u32 h = handles[i];
+
+ status = efi_call_early(handle_protocol, h,
+ proto, (void **)&gop32);
+ if (status != EFI_SUCCESS)
+ continue;
+
+ status = efi_call_early(handle_protocol, h,
+ &conout_proto, &dummy);
+ if (status == EFI_SUCCESS)
+ conout_found = true;
+
+ status = __gop_query32(gop32, &info, &size, &fb_base);
+ if (status == EFI_SUCCESS && (!first_gop || conout_found)) {
+ /*
+ * Systems that use the UEFI Console Splitter may
+ * provide multiple GOP devices, not all of which are
+ * backed by real hardware. The workaround is to search
+ * for a GOP implementing the ConOut protocol, and if
+ * one isn't found, to just fall back to the first GOP.
+ */
+ width = info->horizontal_resolution;
+ height = info->vertical_resolution;
+ pixel_format = info->pixel_format;
+ pixel_info = info->pixel_information;
+ pixels_per_scan_line = info->pixels_per_scan_line;
+
+ /*
+ * Once we've found a GOP supporting ConOut,
+ * don't bother looking any further.
+ */
+ first_gop = gop32;
+ if (conout_found)
+ break;
+ }
+ }
+
+ /* Did we find any GOPs? */
+ if (!first_gop)
+ goto out;
+
+ /* EFI framebuffer */
+ si->orig_video_isVGA = VIDEO_TYPE_EFI;
+
+ si->lfb_width = width;
+ si->lfb_height = height;
+ si->lfb_base = fb_base;
+ si->pages = 1;
+
+ setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format);
si->lfb_size = si->lfb_linelength * si->lfb_height;
si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS;
+out:
+ return status;
+}
-free_handle:
- efi_call_phys1(sys_table->boottime->free_pool, gop_handle);
+static efi_status_t
+__gop_query64(struct efi_graphics_output_protocol_64 *gop64,
+ struct efi_graphics_output_mode_info **info,
+ unsigned long *size, u32 *fb_base)
+{
+ struct efi_graphics_output_protocol_mode_64 *mode;
+ efi_status_t status;
+ unsigned long m;
+
+ m = gop64->mode;
+ mode = (struct efi_graphics_output_protocol_mode_64 *)m;
+
+ status = efi_early->call(gop64->query_mode, gop64,
+ mode->mode, size, info);
+ if (status != EFI_SUCCESS)
+ return status;
+
+ *fb_base = mode->frame_buffer_base;
+ return status;
+}
+
+static efi_status_t
+setup_gop64(struct screen_info *si, efi_guid_t *proto,
+ unsigned long size, void **gop_handle)
+{
+ struct efi_graphics_output_protocol_64 *gop64, *first_gop;
+ unsigned long nr_gops;
+ u16 width, height;
+ u32 pixels_per_scan_line;
+ u32 fb_base;
+ struct efi_pixel_bitmask pixel_info;
+ int pixel_format;
+ efi_status_t status;
+ u64 *handles = (u64 *)(unsigned long)gop_handle;
+ int i;
+
+ first_gop = NULL;
+ gop64 = NULL;
+
+ nr_gops = size / sizeof(u64);
+ for (i = 0; i < nr_gops; i++) {
+ struct efi_graphics_output_mode_info *info = NULL;
+ efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID;
+ bool conout_found = false;
+ void *dummy = NULL;
+ u64 h = handles[i];
+
+ status = efi_call_early(handle_protocol, h,
+ proto, (void **)&gop64);
+ if (status != EFI_SUCCESS)
+ continue;
+
+ status = efi_call_early(handle_protocol, h,
+ &conout_proto, &dummy);
+ if (status == EFI_SUCCESS)
+ conout_found = true;
+
+ status = __gop_query64(gop64, &info, &size, &fb_base);
+ if (status == EFI_SUCCESS && (!first_gop || conout_found)) {
+ /*
+ * Systems that use the UEFI Console Splitter may
+ * provide multiple GOP devices, not all of which are
+ * backed by real hardware. The workaround is to search
+ * for a GOP implementing the ConOut protocol, and if
+ * one isn't found, to just fall back to the first GOP.
+ */
+ width = info->horizontal_resolution;
+ height = info->vertical_resolution;
+ pixel_format = info->pixel_format;
+ pixel_info = info->pixel_information;
+ pixels_per_scan_line = info->pixels_per_scan_line;
+
+ /*
+ * Once we've found a GOP supporting ConOut,
+ * don't bother looking any further.
+ */
+ first_gop = gop64;
+ if (conout_found)
+ break;
+ }
+ }
+
+ /* Did we find any GOPs? */
+ if (!first_gop)
+ goto out;
+
+ /* EFI framebuffer */
+ si->orig_video_isVGA = VIDEO_TYPE_EFI;
+
+ si->lfb_width = width;
+ si->lfb_height = height;
+ si->lfb_base = fb_base;
+ si->pages = 1;
+
+ setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format);
+
+ si->lfb_size = si->lfb_linelength * si->lfb_height;
+
+ si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS;
+out:
return status;
}
/*
- * See if we have Universal Graphics Adapter (UGA) protocol
+ * See if we have Graphics Output Protocol
*/
-static efi_status_t setup_uga(struct screen_info *si, efi_guid_t *uga_proto,
+static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto,
unsigned long size)
{
- struct efi_uga_draw_protocol *uga, *first_uga;
- unsigned long nr_ugas;
efi_status_t status;
- u32 width, height;
- void **uga_handle = NULL;
- int i;
+ void **gop_handle = NULL;
- status = efi_call_phys3(sys_table->boottime->allocate_pool,
- EFI_LOADER_DATA, size, &uga_handle);
+ status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
+ size, (void **)&gop_handle);
if (status != EFI_SUCCESS)
return status;
- status = efi_call_phys5(sys_table->boottime->locate_handle,
- EFI_LOCATE_BY_PROTOCOL, uga_proto,
- NULL, &size, uga_handle);
+ status = efi_call_early(locate_handle,
+ EFI_LOCATE_BY_PROTOCOL,
+ proto, NULL, &size, gop_handle);
if (status != EFI_SUCCESS)
goto free_handle;
+ if (efi_early->is64)
+ status = setup_gop64(si, proto, size, gop_handle);
+ else
+ status = setup_gop32(si, proto, size, gop_handle);
+
+free_handle:
+ efi_call_early(free_pool, gop_handle);
+ return status;
+}
+
+static efi_status_t
+setup_uga32(void **uga_handle, unsigned long size, u32 *width, u32 *height)
+{
+ struct efi_uga_draw_protocol *uga = NULL, *first_uga;
+ efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID;
+ unsigned long nr_ugas;
+ u32 *handles = (u32 *)uga_handle;;
+ efi_status_t status;
+ int i;
+
first_uga = NULL;
+ nr_ugas = size / sizeof(u32);
+ for (i = 0; i < nr_ugas; i++) {
+ efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID;
+ u32 w, h, depth, refresh;
+ void *pciio;
+ u32 handle = handles[i];
+
+ status = efi_call_early(handle_protocol, handle,
+ &uga_proto, (void **)&uga);
+ if (status != EFI_SUCCESS)
+ continue;
+
+ efi_call_early(handle_protocol, handle, &pciio_proto, &pciio);
+
+ status = efi_early->call((unsigned long)uga->get_mode, uga,
+ &w, &h, &depth, &refresh);
+ if (status == EFI_SUCCESS && (!first_uga || pciio)) {
+ *width = w;
+ *height = h;
+
+ /*
+ * Once we've found a UGA supporting PCIIO,
+ * don't bother looking any further.
+ */
+ if (pciio)
+ break;
- nr_ugas = size / sizeof(void *);
+ first_uga = uga;
+ }
+ }
+
+ return status;
+}
+
+static efi_status_t
+setup_uga64(void **uga_handle, unsigned long size, u32 *width, u32 *height)
+{
+ struct efi_uga_draw_protocol *uga = NULL, *first_uga;
+ efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID;
+ unsigned long nr_ugas;
+ u64 *handles = (u64 *)uga_handle;;
+ efi_status_t status;
+ int i;
+
+ first_uga = NULL;
+ nr_ugas = size / sizeof(u64);
for (i = 0; i < nr_ugas; i++) {
efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID;
- void *handle = uga_handle[i];
u32 w, h, depth, refresh;
void *pciio;
+ u64 handle = handles[i];
- status = efi_call_phys3(sys_table->boottime->handle_protocol,
- handle, uga_proto, &uga);
+ status = efi_call_early(handle_protocol, handle,
+ &uga_proto, (void **)&uga);
if (status != EFI_SUCCESS)
continue;
- efi_call_phys3(sys_table->boottime->handle_protocol,
- handle, &pciio_proto, &pciio);
+ efi_call_early(handle_protocol, handle, &pciio_proto, &pciio);
- status = efi_call_phys5(uga->get_mode, uga, &w, &h,
- &depth, &refresh);
+ status = efi_early->call((unsigned long)uga->get_mode, uga,
+ &w, &h, &depth, &refresh);
if (status == EFI_SUCCESS && (!first_uga || pciio)) {
- width = w;
- height = h;
+ *width = w;
+ *height = h;
/*
* Once we've found a UGA supporting PCIIO,
@@ -365,7 +926,39 @@ static efi_status_t setup_uga(struct screen_info *si, efi_guid_t *uga_proto,
}
}
- if (!first_uga)
+ return status;
+}
+
+/*
+ * See if we have Universal Graphics Adapter (UGA) protocol
+ */
+static efi_status_t setup_uga(struct screen_info *si, efi_guid_t *uga_proto,
+ unsigned long size)
+{
+ efi_status_t status;
+ u32 width, height;
+ void **uga_handle = NULL;
+
+ status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
+ size, (void **)&uga_handle);
+ if (status != EFI_SUCCESS)
+ return status;
+
+ status = efi_call_early(locate_handle,
+ EFI_LOCATE_BY_PROTOCOL,
+ uga_proto, NULL, &size, uga_handle);
+ if (status != EFI_SUCCESS)
+ goto free_handle;
+
+ height = 0;
+ width = 0;
+
+ if (efi_early->is64)
+ status = setup_uga64(uga_handle, size, &width, &height);
+ else
+ status = setup_uga32(uga_handle, size, &width, &height);
+
+ if (!width && !height)
goto free_handle;
/* EFI framebuffer */
@@ -384,9 +977,8 @@ static efi_status_t setup_uga(struct screen_info *si, efi_guid_t *uga_proto,
si->rsvd_size = 8;
si->rsvd_pos = 24;
-
free_handle:
- efi_call_phys1(sys_table->boottime->free_pool, uga_handle);
+ efi_call_early(free_pool, uga_handle);
return status;
}
@@ -404,29 +996,28 @@ void setup_graphics(struct boot_params *boot_params)
memset(si, 0, sizeof(*si));
size = 0;
- status = efi_call_phys5(sys_table->boottime->locate_handle,
- EFI_LOCATE_BY_PROTOCOL, &graphics_proto,
- NULL, &size, gop_handle);
+ status = efi_call_early(locate_handle,
+ EFI_LOCATE_BY_PROTOCOL,
+ &graphics_proto, NULL, &size, gop_handle);
if (status == EFI_BUFFER_TOO_SMALL)
status = setup_gop(si, &graphics_proto, size);
if (status != EFI_SUCCESS) {
size = 0;
- status = efi_call_phys5(sys_table->boottime->locate_handle,
- EFI_LOCATE_BY_PROTOCOL, &uga_proto,
- NULL, &size, uga_handle);
+ status = efi_call_early(locate_handle,
+ EFI_LOCATE_BY_PROTOCOL,
+ &uga_proto, NULL, &size, uga_handle);
if (status == EFI_BUFFER_TOO_SMALL)
setup_uga(si, &uga_proto, size);
}
}
-
/*
* Because the x86 boot code expects to be passed a boot_params we
* need to create one ourselves (usually the bootloader would create
* one for us).
*/
-struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table)
+struct boot_params *make_boot_params(struct efi_config *c)
{
struct boot_params *boot_params;
struct sys_desc_table *sdt;
@@ -434,7 +1025,7 @@ struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table)
struct setup_header *hdr;
struct efi_info *efi;
efi_loaded_image_t *image;
- void *options;
+ void *options, *handle;
efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID;
int options_size = 0;
efi_status_t status;
@@ -445,14 +1036,21 @@ struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table)
unsigned long ramdisk_addr;
unsigned long ramdisk_size;
- sys_table = _table;
+ efi_early = c;
+ sys_table = (efi_system_table_t *)(unsigned long)efi_early->table;
+ handle = (void *)(unsigned long)efi_early->image_handle;
/* Check if we were booted by the EFI firmware */
if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
return NULL;
- status = efi_call_phys3(sys_table->boottime->handle_protocol,
- handle, &proto, (void *)&image);
+ if (efi_early->is64)
+ setup_boot_services64(efi_early);
+ else
+ setup_boot_services32(efi_early);
+
+ status = efi_call_early(handle_protocol, handle,
+ &proto, (void *)&image);
if (status != EFI_SUCCESS) {
efi_printk(sys_table, "Failed to get handle for LOADED_IMAGE_PROTOCOL\n");
return NULL;
@@ -641,14 +1239,13 @@ static efi_status_t alloc_e820ext(u32 nr_desc, struct setup_data **e820ext,
sizeof(struct e820entry) * nr_desc;
if (*e820ext) {
- efi_call_phys1(sys_table->boottime->free_pool, *e820ext);
+ efi_call_early(free_pool, *e820ext);
*e820ext = NULL;
*e820ext_size = 0;
}
- status = efi_call_phys3(sys_table->boottime->allocate_pool,
- EFI_LOADER_DATA, size, e820ext);
-
+ status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
+ size, (void **)e820ext);
if (status == EFI_SUCCESS)
*e820ext_size = size;
@@ -656,12 +1253,13 @@ static efi_status_t alloc_e820ext(u32 nr_desc, struct setup_data **e820ext,
}
static efi_status_t exit_boot(struct boot_params *boot_params,
- void *handle)
+ void *handle, bool is64)
{
struct efi_info *efi = &boot_params->efi_info;
unsigned long map_sz, key, desc_size;
efi_memory_desc_t *mem_map;
struct setup_data *e820ext;
+ const char *signature;
__u32 e820ext_size;
__u32 nr_desc, prev_nr_desc;
efi_status_t status;
@@ -691,11 +1289,13 @@ get_map:
if (status != EFI_SUCCESS)
goto free_mem_map;
- efi_call_phys1(sys_table->boottime->free_pool, mem_map);
+ efi_call_early(free_pool, mem_map);
goto get_map; /* Allocated memory, get map again */
}
- memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32));
+ signature = is64 ? EFI64_LOADER_SIGNATURE : EFI32_LOADER_SIGNATURE;
+ memcpy(&efi->efi_loader_signature, signature, sizeof(__u32));
+
efi->efi_systab = (unsigned long)sys_table;
efi->efi_memdesc_size = desc_size;
efi->efi_memdesc_version = desc_version;
@@ -708,8 +1308,7 @@ get_map:
#endif
/* Might as well exit boot services now */
- status = efi_call_phys2(sys_table->boottime->exit_boot_services,
- handle, key);
+ status = efi_call_early(exit_boot_services, handle, key);
if (status != EFI_SUCCESS) {
/*
* ExitBootServices() will fail if any of the event
@@ -722,7 +1321,7 @@ get_map:
goto free_mem_map;
called_exit = true;
- efi_call_phys1(sys_table->boottime->free_pool, mem_map);
+ efi_call_early(free_pool, mem_map);
goto get_map;
}
@@ -736,23 +1335,31 @@ get_map:
return EFI_SUCCESS;
free_mem_map:
- efi_call_phys1(sys_table->boottime->free_pool, mem_map);
+ efi_call_early(free_pool, mem_map);
return status;
}
-
/*
* On success we return a pointer to a boot_params structure, and NULL
* on failure.
*/
-struct boot_params *efi_main(void *handle, efi_system_table_t *_table,
+struct boot_params *efi_main(struct efi_config *c,
struct boot_params *boot_params)
{
- struct desc_ptr *gdt;
+ struct desc_ptr *gdt = NULL;
efi_loaded_image_t *image;
struct setup_header *hdr = &boot_params->hdr;
efi_status_t status;
struct desc_struct *desc;
+ void *handle;
+ efi_system_table_t *_table;
+ bool is64;
+
+ efi_early = c;
+
+ _table = (efi_system_table_t *)(unsigned long)efi_early->table;
+ handle = (void *)(unsigned long)efi_early->image_handle;
+ is64 = efi_early->is64;
sys_table = _table;
@@ -760,13 +1367,17 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table,
if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
goto fail;
+ if (is64)
+ setup_boot_services64(efi_early);
+ else
+ setup_boot_services32(efi_early);
+
setup_graphics(boot_params);
setup_efi_pci(boot_params);
- status = efi_call_phys3(sys_table->boottime->allocate_pool,
- EFI_LOADER_DATA, sizeof(*gdt),
- (void **)&gdt);
+ status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
+ sizeof(*gdt), (void **)&gdt);
if (status != EFI_SUCCESS) {
efi_printk(sys_table, "Failed to alloc mem for gdt structure\n");
goto fail;
@@ -797,7 +1408,7 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table,
hdr->code32_start = bzimage_addr;
}
- status = exit_boot(boot_params, handle);
+ status = exit_boot(boot_params, handle, is64);
if (status != EFI_SUCCESS)
goto fail;
diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h
index 81b6b652b46..c88c31ecad1 100644
--- a/arch/x86/boot/compressed/eboot.h
+++ b/arch/x86/boot/compressed/eboot.h
@@ -37,6 +37,24 @@ struct efi_graphics_output_mode_info {
u32 pixels_per_scan_line;
} __packed;
+struct efi_graphics_output_protocol_mode_32 {
+ u32 max_mode;
+ u32 mode;
+ u32 info;
+ u32 size_of_info;
+ u64 frame_buffer_base;
+ u32 frame_buffer_size;
+} __packed;
+
+struct efi_graphics_output_protocol_mode_64 {
+ u32 max_mode;
+ u32 mode;
+ u64 info;
+ u64 size_of_info;
+ u64 frame_buffer_base;
+ u64 frame_buffer_size;
+} __packed;
+
struct efi_graphics_output_protocol_mode {
u32 max_mode;
u32 mode;
@@ -46,6 +64,20 @@ struct efi_graphics_output_protocol_mode {
unsigned long frame_buffer_size;
} __packed;
+struct efi_graphics_output_protocol_32 {
+ u32 query_mode;
+ u32 set_mode;
+ u32 blt;
+ u32 mode;
+};
+
+struct efi_graphics_output_protocol_64 {
+ u64 query_mode;
+ u64 set_mode;
+ u64 blt;
+ u64 mode;
+};
+
struct efi_graphics_output_protocol {
void *query_mode;
unsigned long set_mode;
@@ -53,10 +85,38 @@ struct efi_graphics_output_protocol {
struct efi_graphics_output_protocol_mode *mode;
};
+struct efi_uga_draw_protocol_32 {
+ u32 get_mode;
+ u32 set_mode;
+ u32 blt;
+};
+
+struct efi_uga_draw_protocol_64 {
+ u64 get_mode;
+ u64 set_mode;
+ u64 blt;
+};
+
struct efi_uga_draw_protocol {
void *get_mode;
void *set_mode;
void *blt;
};
+struct efi_config {
+ u64 image_handle;
+ u64 table;
+ u64 allocate_pool;
+ u64 allocate_pages;
+ u64 get_memory_map;
+ u64 free_pool;
+ u64 free_pages;
+ u64 locate_handle;
+ u64 handle_protocol;
+ u64 exit_boot_services;
+ u64 text_output;
+ efi_status_t (*call)(unsigned long, ...);
+ bool is64;
+} __packed;
+
#endif /* BOOT_COMPRESSED_EBOOT_H */
diff --git a/arch/x86/boot/compressed/efi_stub_64.S b/arch/x86/boot/compressed/efi_stub_64.S
index cedc60de86e..7ff3632806b 100644
--- a/arch/x86/boot/compressed/efi_stub_64.S
+++ b/arch/x86/boot/compressed/efi_stub_64.S
@@ -1 +1,30 @@
+#include <asm/segment.h>
+#include <asm/msr.h>
+#include <asm/processor-flags.h>
+
#include "../../platform/efi/efi_stub_64.S"
+
+#ifdef CONFIG_EFI_MIXED
+ .code64
+ .text
+ENTRY(efi64_thunk)
+ push %rbp
+ push %rbx
+
+ subq $16, %rsp
+ leaq efi_exit32(%rip), %rax
+ movl %eax, 8(%rsp)
+ leaq efi_gdt64(%rip), %rax
+ movl %eax, 4(%rsp)
+ movl %eax, 2(%rax) /* Fixup the gdt base address */
+ leaq efi32_boot_gdt(%rip), %rax
+ movl %eax, (%rsp)
+
+ call __efi64_thunk
+
+ addq $16, %rsp
+ pop %rbx
+ pop %rbp
+ ret
+ENDPROC(efi64_thunk)
+#endif /* CONFIG_EFI_MIXED */
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 9116aac232c..de9d4200d30 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -42,26 +42,53 @@ ENTRY(startup_32)
ENTRY(efi_pe_entry)
add $0x4, %esp
+ call 1f
+1: popl %esi
+ subl $1b, %esi
+
+ popl %ecx
+ movl %ecx, efi32_config(%esi) /* Handle */
+ popl %ecx
+ movl %ecx, efi32_config+8(%esi) /* EFI System table pointer */
+
+ /* Relocate efi_config->call() */
+ leal efi32_config(%esi), %eax
+ add %esi, 88(%eax)
+ pushl %eax
+
call make_boot_params
cmpl $0, %eax
- je 1f
- movl 0x4(%esp), %esi
- movl (%esp), %ecx
+ je fail
+ popl %ecx
pushl %eax
- pushl %esi
pushl %ecx
- sub $0x4, %esp
+ jmp 2f /* Skip efi_config initialization */
-ENTRY(efi_stub_entry)
+ENTRY(efi32_stub_entry)
add $0x4, %esp
+ popl %ecx
+ popl %edx
+
+ call 1f
+1: popl %esi
+ subl $1b, %esi
+
+ movl %ecx, efi32_config(%esi) /* Handle */
+ movl %edx, efi32_config+8(%esi) /* EFI System table pointer */
+
+ /* Relocate efi_config->call() */
+ leal efi32_config(%esi), %eax
+ add %esi, 88(%eax)
+ pushl %eax
+2:
call efi_main
cmpl $0, %eax
movl %eax, %esi
jne 2f
-1:
+fail:
/* EFI init failed, so hang. */
hlt
- jmp 1b
+ jmp fail
2:
call 3f
3:
@@ -202,6 +229,15 @@ relocated:
xorl %ebx, %ebx
jmp *%eax
+#ifdef CONFIG_EFI_STUB
+ .data
+efi32_config:
+ .fill 11,8,0
+ .long efi_call_phys
+ .long 0
+ .byte 0
+#endif
+
/*
* Stack and heap for uncompression
*/
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index c5c1ae0997e..57e58a5fa21 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -113,7 +113,8 @@ ENTRY(startup_32)
lgdt gdt(%ebp)
/* Enable PAE mode */
- movl $(X86_CR4_PAE), %eax
+ movl %cr4, %eax
+ orl $X86_CR4_PAE, %eax
movl %eax, %cr4
/*
@@ -178,6 +179,13 @@ ENTRY(startup_32)
*/
pushl $__KERNEL_CS
leal startup_64(%ebp), %eax
+#ifdef CONFIG_EFI_MIXED
+ movl efi32_config(%ebp), %ebx
+ cmp $0, %ebx
+ jz 1f
+ leal handover_entry(%ebp), %eax
+1:
+#endif
pushl %eax
/* Enter paged protected Mode, activating Long Mode */
@@ -188,6 +196,30 @@ ENTRY(startup_32)
lret
ENDPROC(startup_32)
+#ifdef CONFIG_EFI_MIXED
+ .org 0x190
+ENTRY(efi32_stub_entry)
+ add $0x4, %esp /* Discard return address */
+ popl %ecx
+ popl %edx
+ popl %esi
+
+ leal (BP_scratch+4)(%esi), %esp
+ call 1f
+1: pop %ebp
+ subl $1b, %ebp
+
+ movl %ecx, efi32_config(%ebp)
+ movl %edx, efi32_config+8(%ebp)
+ sgdtl efi32_boot_gdt(%ebp)
+
+ leal efi32_config(%ebp), %eax
+ movl %eax, efi_config(%ebp)
+
+ jmp startup_32
+ENDPROC(efi32_stub_entry)
+#endif
+
.code64
.org 0x200
ENTRY(startup_64)
@@ -209,26 +241,48 @@ ENTRY(startup_64)
jmp preferred_addr
ENTRY(efi_pe_entry)
- mov %rcx, %rdi
- mov %rdx, %rsi
- pushq %rdi
- pushq %rsi
+ movq %rcx, efi64_config(%rip) /* Handle */
+ movq %rdx, efi64_config+8(%rip) /* EFI System table pointer */
+
+ leaq efi64_config(%rip), %rax
+ movq %rax, efi_config(%rip)
+
+ call 1f
+1: popq %rbp
+ subq $1b, %rbp
+
+ /*
+ * Relocate efi_config->call().
+ */
+ addq %rbp, efi64_config+88(%rip)
+
+ movq %rax, %rdi
call make_boot_params
cmpq $0,%rax
- je 1f
- mov %rax, %rdx
- popq %rsi
- popq %rdi
+ je fail
+ mov %rax, %rsi
+ jmp 2f /* Skip the relocation */
-ENTRY(efi_stub_entry)
+handover_entry:
+ call 1f
+1: popq %rbp
+ subq $1b, %rbp
+
+ /*
+ * Relocate efi_config->call().
+ */
+ movq efi_config(%rip), %rax
+ addq %rbp, 88(%rax)
+2:
+ movq efi_config(%rip), %rdi
call efi_main
movq %rax,%rsi
cmpq $0,%rax
jne 2f
-1:
+fail:
/* EFI init failed, so hang. */
hlt
- jmp 1b
+ jmp fail
2:
call 3f
3:
@@ -307,6 +361,20 @@ preferred_addr:
leaq relocated(%rbx), %rax
jmp *%rax
+#ifdef CONFIG_EFI_STUB
+ .org 0x390
+ENTRY(efi64_stub_entry)
+ movq %rdi, efi64_config(%rip) /* Handle */
+ movq %rsi, efi64_config+8(%rip) /* EFI System table pointer */
+
+ leaq efi64_config(%rip), %rax
+ movq %rax, efi_config(%rip)
+
+ movq %rdx, %rsi
+ jmp handover_entry
+ENDPROC(efi64_stub_entry)
+#endif
+
.text
relocated:
@@ -372,6 +440,25 @@ gdt:
.quad 0x0000000000000000 /* TS continued */
gdt_end:
+#ifdef CONFIG_EFI_STUB
+efi_config:
+ .quad 0
+
+#ifdef CONFIG_EFI_MIXED
+ .global efi32_config
+efi32_config:
+ .fill 11,8,0
+ .quad efi64_thunk
+ .byte 0
+#endif
+
+ .global efi64_config
+efi64_config:
+ .fill 11,8,0
+ .quad efi_call6
+ .byte 1
+#endif /* CONFIG_EFI_STUB */
+
/*
* Stack and heap for uncompression
*/
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index 100a9a10076..f0d0b20fe14 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -67,6 +67,13 @@ static int is_transmeta(void)
cpu_vendor[2] == A32('M', 'x', '8', '6');
}
+static int is_intel(void)
+{
+ return cpu_vendor[0] == A32('G', 'e', 'n', 'u') &&
+ cpu_vendor[1] == A32('i', 'n', 'e', 'I') &&
+ cpu_vendor[2] == A32('n', 't', 'e', 'l');
+}
+
/* Returns a bitmask of which words we have error bits in */
static int check_cpuflags(void)
{
@@ -153,6 +160,19 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
err = check_cpuflags();
+ } else if (err == 0x01 &&
+ !(err_flags[0] & ~(1 << X86_FEATURE_PAE)) &&
+ is_intel() && cpu.level == 6 &&
+ (cpu.model == 9 || cpu.model == 13)) {
+ /* PAE is disabled on this Pentium M but can be forced */
+ if (cmdline_find_option_bool("forcepae")) {
+ puts("WARNING: Forcing PAE in CPU flags\n");
+ set_bit(X86_FEATURE_PAE, cpu.flags);
+ err = check_cpuflags();
+ }
+ else {
+ puts("WARNING: PAE disabled. Use parameter 'forcepae' to enable at your own risk!\n");
+ }
}
if (err_flags_ptr)
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index ec3b8ba6809..0ca9a5c362b 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -283,7 +283,7 @@ _start:
# Part 2 of the header, from the old setup.S
.ascii "HdrS" # header signature
- .word 0x020c # header version number (>= 0x0105)
+ .word 0x020d # header version number (>= 0x0105)
# or else old loadlin-1.5 will fail)
.globl realmode_swtch
realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
@@ -350,7 +350,7 @@ cmd_line_ptr: .long 0 # (Header version 0x0202 or later)
# can be located anywhere in
# low memory 0x10000 or higher.
-ramdisk_max: .long 0x7fffffff
+initrd_addr_max: .long 0x7fffffff
# (Header version 0x0203 or later)
# The highest safe address for
# the contents of an initrd
@@ -375,7 +375,8 @@ xloadflags:
# define XLF0 0
#endif
-#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_X86_64)
+#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_X86_64) && \
+ !defined(CONFIG_EFI_MIXED)
/* kernel/boot_param/ramdisk could be loaded above 4g */
# define XLF1 XLF_CAN_BE_LOADED_ABOVE_4G
#else
@@ -383,10 +384,14 @@ xloadflags:
#endif
#ifdef CONFIG_EFI_STUB
-# ifdef CONFIG_X86_64
-# define XLF23 XLF_EFI_HANDOVER_64 /* 64-bit EFI handover ok */
+# ifdef CONFIG_EFI_MIXED
+# define XLF23 (XLF_EFI_HANDOVER_32|XLF_EFI_HANDOVER_64)
# else
-# define XLF23 XLF_EFI_HANDOVER_32 /* 32-bit EFI handover ok */
+# ifdef CONFIG_X86_64
+# define XLF23 XLF_EFI_HANDOVER_64 /* 64-bit EFI handover ok */
+# else
+# define XLF23 XLF_EFI_HANDOVER_32 /* 32-bit EFI handover ok */
+# endif
# endif
#else
# define XLF23 0
@@ -426,13 +431,7 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr
#define INIT_SIZE VO_INIT_SIZE
#endif
init_size: .long INIT_SIZE # kernel initialization size
-handover_offset:
-#ifdef CONFIG_EFI_STUB
- .long 0x30 # offset to the handover
- # protocol entry point
-#else
- .long 0
-#endif
+handover_offset: .long 0 # Filled in by build.c
# End of setup header #####################################################
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index 8e15b22391f..1a2f2121cad 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -53,7 +53,8 @@ int is_big_kernel;
#define PECOFF_RELOC_RESERVE 0x20
-unsigned long efi_stub_entry;
+unsigned long efi32_stub_entry;
+unsigned long efi64_stub_entry;
unsigned long efi_pe_entry;
unsigned long startup_64;
@@ -219,6 +220,52 @@ static void update_pecoff_text(unsigned int text_start, unsigned int file_sz)
update_pecoff_section_header(".text", text_start, text_sz);
}
+static int reserve_pecoff_reloc_section(int c)
+{
+ /* Reserve 0x20 bytes for .reloc section */
+ memset(buf+c, 0, PECOFF_RELOC_RESERVE);
+ return PECOFF_RELOC_RESERVE;
+}
+
+static void efi_stub_defaults(void)
+{
+ /* Defaults for old kernel */
+#ifdef CONFIG_X86_32
+ efi_pe_entry = 0x10;
+#else
+ efi_pe_entry = 0x210;
+ startup_64 = 0x200;
+#endif
+}
+
+static void efi_stub_entry_update(void)
+{
+ unsigned long addr = efi32_stub_entry;
+
+#ifdef CONFIG_X86_64
+ /* Yes, this is really how we defined it :( */
+ addr = efi64_stub_entry - 0x200;
+#endif
+
+#ifdef CONFIG_EFI_MIXED
+ if (efi32_stub_entry != addr)
+ die("32-bit and 64-bit EFI entry points do not match\n");
+#endif
+ put_unaligned_le32(addr, &buf[0x264]);
+}
+
+#else
+
+static inline void update_pecoff_setup_and_reloc(unsigned int size) {}
+static inline void update_pecoff_text(unsigned int text_start,
+ unsigned int file_sz) {}
+static inline void efi_stub_defaults(void) {}
+static inline void efi_stub_entry_update(void) {}
+
+static inline int reserve_pecoff_reloc_section(int c)
+{
+ return 0;
+}
#endif /* CONFIG_EFI_STUB */
@@ -250,7 +297,8 @@ static void parse_zoffset(char *fname)
p = (char *)buf;
while (p && *p) {
- PARSE_ZOFS(p, efi_stub_entry);
+ PARSE_ZOFS(p, efi32_stub_entry);
+ PARSE_ZOFS(p, efi64_stub_entry);
PARSE_ZOFS(p, efi_pe_entry);
PARSE_ZOFS(p, startup_64);
@@ -271,15 +319,7 @@ int main(int argc, char ** argv)
void *kernel;
u32 crc = 0xffffffffUL;
- /* Defaults for old kernel */
-#ifdef CONFIG_X86_32
- efi_pe_entry = 0x10;
- efi_stub_entry = 0x30;
-#else
- efi_pe_entry = 0x210;
- efi_stub_entry = 0x230;
- startup_64 = 0x200;
-#endif
+ efi_stub_defaults();
if (argc != 5)
usage();
@@ -302,11 +342,7 @@ int main(int argc, char ** argv)
die("Boot block hasn't got boot flag (0xAA55)");
fclose(file);
-#ifdef CONFIG_EFI_STUB
- /* Reserve 0x20 bytes for .reloc section */
- memset(buf+c, 0, PECOFF_RELOC_RESERVE);
- c += PECOFF_RELOC_RESERVE;
-#endif
+ c += reserve_pecoff_reloc_section(c);
/* Pad unused space with zeros */
setup_sectors = (c + 511) / 512;
@@ -315,9 +351,7 @@ int main(int argc, char ** argv)
i = setup_sectors*512;
memset(buf+c, 0, i-c);
-#ifdef CONFIG_EFI_STUB
update_pecoff_setup_and_reloc(i);
-#endif
/* Set the default root device */
put_unaligned_le16(DEFAULT_ROOT_DEV, &buf[508]);
@@ -342,14 +376,9 @@ int main(int argc, char ** argv)
buf[0x1f1] = setup_sectors-1;
put_unaligned_le32(sys_size, &buf[0x1f4]);
-#ifdef CONFIG_EFI_STUB
update_pecoff_text(setup_sectors * 512, sz + i + ((sys_size * 16) - sz));
-#ifdef CONFIG_X86_64 /* Yes, this is really how we defined it :( */
- efi_stub_entry -= 0x200;
-#endif
- put_unaligned_le32(efi_stub_entry, &buf[0x264]);
-#endif
+ efi_stub_entry_update();
crc = partial_crc32(buf, i, crc);
if (fwrite(buf, 1, i, dest) != i)
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 7f669853317..a8fee078b92 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -5,3 +5,4 @@ genhdr-y += unistd_64.h
genhdr-y += unistd_x32.h
generic-y += clkdev.h
+generic-y += mcs_spinlock.h
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 1d2091a226b..19b0ebafcd3 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -93,9 +93,6 @@ static inline int is_vsmp_box(void)
return 0;
}
#endif
-extern void xapic_wait_icr_idle(void);
-extern u32 safe_xapic_wait_icr_idle(void);
-extern void xapic_icr_write(u32, u32);
extern int setup_profiling_timer(unsigned int);
static inline void native_apic_mem_write(u32 reg, u32 v)
@@ -184,7 +181,6 @@ extern int x2apic_phys;
extern int x2apic_preenabled;
extern void check_x2apic(void);
extern void enable_x2apic(void);
-extern void x2apic_icr_write(u32 low, u32 id);
static inline int x2apic_enabled(void)
{
u64 msr;
@@ -221,7 +217,6 @@ static inline void x2apic_force_phys(void)
{
}
-#define nox2apic 0
#define x2apic_preenabled 0
#define x2apic_supported() 0
#endif
@@ -351,7 +346,7 @@ struct apic {
int trampoline_phys_low;
int trampoline_phys_high;
- void (*wait_for_init_deassert)(atomic_t *deassert);
+ bool wait_for_init_deassert;
void (*smp_callin_clear_local_apic)(void);
void (*inquire_remote_apic)(int apicid);
@@ -517,13 +512,6 @@ extern int default_cpu_present_to_apicid(int mps_cpu);
extern int default_check_phys_apicid_present(int phys_apicid);
#endif
-static inline void default_wait_for_init_deassert(atomic_t *deassert)
-{
- while (!atomic_read(deassert))
- cpu_relax();
- return;
-}
-
extern void generic_bigsmp_probe(void);
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index e099f9502ac..bc507d7640f 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -217,9 +217,14 @@
#define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */
#define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */
#define X86_FEATURE_MPX (9*32+14) /* Memory Protection Extension */
+#define X86_FEATURE_AVX512F (9*32+16) /* AVX-512 Foundation */
#define X86_FEATURE_RDSEED (9*32+18) /* The RDSEED instruction */
#define X86_FEATURE_ADX (9*32+19) /* The ADCX and ADOX instructions */
#define X86_FEATURE_SMAP (9*32+20) /* Supervisor Mode Access Prevention */
+#define X86_FEATURE_CLFLUSHOPT (9*32+23) /* CLFLUSHOPT instruction */
+#define X86_FEATURE_AVX512PF (9*32+26) /* AVX-512 Prefetch */
+#define X86_FEATURE_AVX512ER (9*32+27) /* AVX-512 Exponential and Reciprocal */
+#define X86_FEATURE_AVX512CD (9*32+28) /* AVX-512 Conflict Detection */
/*
* BUG word(s)
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index acd86c85041..0869434eaf7 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -19,9 +19,11 @@
*/
#define EFI_OLD_MEMMAP EFI_ARCH_1
+#define EFI32_LOADER_SIGNATURE "EL32"
+#define EFI64_LOADER_SIGNATURE "EL64"
+
#ifdef CONFIG_X86_32
-#define EFI_LOADER_SIGNATURE "EL32"
extern unsigned long asmlinkage efi_call_phys(void *, ...);
@@ -57,8 +59,6 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
#else /* !CONFIG_X86_32 */
-#define EFI_LOADER_SIGNATURE "EL64"
-
extern u64 efi_call0(void *fp);
extern u64 efi_call1(void *fp, u64 arg1);
extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
@@ -119,7 +119,6 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
#endif /* CONFIG_X86_32 */
extern int add_efi_memmap;
-extern unsigned long x86_efi_facility;
extern struct efi_scratch efi_scratch;
extern void efi_set_executable(efi_memory_desc_t *md, bool executable);
extern int efi_memblock_x86_reserve_range(void);
@@ -130,10 +129,12 @@ extern void efi_memory_uc(u64 addr, unsigned long size);
extern void __init efi_map_region(efi_memory_desc_t *md);
extern void __init efi_map_region_fixed(efi_memory_desc_t *md);
extern void efi_sync_low_kernel_mappings(void);
-extern void efi_setup_page_tables(void);
+extern int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages);
+extern void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages);
extern void __init old_map_region(efi_memory_desc_t *md);
extern void __init runtime_code_page_mkexec(void);
extern void __init efi_runtime_mkexec(void);
+extern void __init efi_dump_pagetable(void);
extern void __init efi_apply_memmap_quirks(void);
struct efi_setup_data {
@@ -153,8 +154,40 @@ static inline bool efi_is_native(void)
return IS_ENABLED(CONFIG_X86_64) == efi_enabled(EFI_64BIT);
}
+static inline bool efi_runtime_supported(void)
+{
+ if (efi_is_native())
+ return true;
+
+ if (IS_ENABLED(CONFIG_EFI_MIXED) && !efi_enabled(EFI_OLD_MEMMAP))
+ return true;
+
+ return false;
+}
+
extern struct console early_efi_console;
extern void parse_efi_setup(u64 phys_addr, u32 data_len);
+
+#ifdef CONFIG_EFI_MIXED
+extern void efi_thunk_runtime_setup(void);
+extern efi_status_t efi_thunk_set_virtual_address_map(
+ void *phys_set_virtual_address_map,
+ unsigned long memory_map_size,
+ unsigned long descriptor_size,
+ u32 descriptor_version,
+ efi_memory_desc_t *virtual_map);
+#else
+static inline void efi_thunk_runtime_setup(void) {}
+static inline efi_status_t efi_thunk_set_virtual_address_map(
+ void *phys_set_virtual_address_map,
+ unsigned long memory_map_size,
+ unsigned long descriptor_size,
+ u32 descriptor_version,
+ efi_memory_desc_t *virtual_map)
+{
+ return EFI_SUCCESS;
+}
+#endif /* CONFIG_EFI_MIXED */
#else
/*
* IF EFI is not configured, have the EFI calls return -ENOSYS.
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index e139b13f2a3..de36f22eb0b 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -214,6 +214,8 @@ do { \
struct msr *msrs_alloc(void);
void msrs_free(struct msr *msrs);
+int msr_set_bit(u32 msr, u8 bit);
+int msr_clear_bit(u32 msr, u8 bit);
#ifdef CONFIG_SMP
int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 86f9301903c..5f2fc4441b1 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -1,6 +1,7 @@
#ifndef _ASM_X86_NMI_H
#define _ASM_X86_NMI_H
+#include <linux/irq_work.h>
#include <linux/pm.h>
#include <asm/irq.h>
#include <asm/io.h>
@@ -38,6 +39,8 @@ typedef int (*nmi_handler_t)(unsigned int, struct pt_regs *);
struct nmiaction {
struct list_head list;
nmi_handler_t handler;
+ u64 max_duration;
+ struct irq_work irq_work;
unsigned long flags;
const char *name;
};
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5ad38ad0789..b459ddf27d6 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -15,9 +15,10 @@
: (prot))
#ifndef __ASSEMBLY__
-
#include <asm/x86_init.h>
+void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
+
/*
* ZERO_PAGE is a global shared page that is always zero: used
* for zero-mapped memory areas etc..
@@ -445,20 +446,10 @@ static inline int pte_same(pte_t a, pte_t b)
return a.pte == b.pte;
}
-static inline int pteval_present(pteval_t pteval)
-{
- /*
- * Yes Linus, _PAGE_PROTNONE == _PAGE_NUMA. Expressing it this
- * way clearly states that the intent is that protnone and numa
- * hinting ptes are considered present for the purposes of
- * pagetable operations like zapping, protection changes, gup etc.
- */
- return pteval & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_NUMA);
-}
-
static inline int pte_present(pte_t a)
{
- return pteval_present(pte_flags(a));
+ return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE |
+ _PAGE_NUMA);
}
#define pte_accessible pte_accessible
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 1aa9ccd4322..708f19fb4fc 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -382,9 +382,13 @@ static inline void update_page_count(int level, unsigned long pages) { }
* as a pte too.
*/
extern pte_t *lookup_address(unsigned long address, unsigned int *level);
+extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
+ unsigned int *level);
extern phys_addr_t slow_virt_to_phys(void *__address);
extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
unsigned numpages, unsigned long page_flags);
+void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address,
+ unsigned numpages);
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_DEFS_H */
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 645cad2c95f..e820c080a4e 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -191,6 +191,14 @@ static inline void clflush(volatile void *__p)
asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
}
+static inline void clflushopt(volatile void *__p)
+{
+ alternative_io(".byte " __stringify(NOP_DS_PREFIX) "; clflush %P0",
+ ".byte 0x66; clflush %P0",
+ X86_FEATURE_CLFLUSHOPT,
+ "+m" (*(volatile char __force *)__p));
+}
+
#define nop() asm volatile ("nop")
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index d35f24e231c..b28097e4c8c 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -119,9 +119,10 @@ static inline void setup_node_to_cpumask_map(void) { }
extern const struct cpumask *cpu_coregroup_mask(int cpu);
-#ifdef ENABLE_TOPO_DEFINES
#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id)
#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
+
+#ifdef ENABLE_TOPO_DEFINES
#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
#define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu))
#endif
@@ -133,12 +134,6 @@ static inline void arch_fix_phys_package_id(int num, u32 slot)
struct pci_bus;
void x86_pci_root_bus_resources(int bus, struct list_head *resources);
-#ifdef CONFIG_SMP
-#define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \
- (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids))
-#define smt_capable() (smp_num_siblings > 1)
-#endif
-
#ifdef CONFIG_NUMA
extern int get_mp_bus_to_node(int busnum);
extern void set_mp_bus_to_node(int busnum, int node);
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 554738963b2..6c1d7411eb0 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -6,11 +6,14 @@
#define XSTATE_CPUID 0x0000000d
-#define XSTATE_FP 0x1
-#define XSTATE_SSE 0x2
-#define XSTATE_YMM 0x4
-#define XSTATE_BNDREGS 0x8
-#define XSTATE_BNDCSR 0x10
+#define XSTATE_FP 0x1
+#define XSTATE_SSE 0x2
+#define XSTATE_YMM 0x4
+#define XSTATE_BNDREGS 0x8
+#define XSTATE_BNDCSR 0x10
+#define XSTATE_OPMASK 0x20
+#define XSTATE_ZMM_Hi256 0x40
+#define XSTATE_Hi16_ZMM 0x80
#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE)
@@ -23,7 +26,8 @@
#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
/* Supported features which support lazy state saving */
-#define XSTATE_LAZY (XSTATE_FP | XSTATE_SSE | XSTATE_YMM)
+#define XSTATE_LAZY (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \
+ | XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
/* Supported features which require eager state saving */
#define XSTATE_EAGER (XSTATE_BNDREGS | XSTATE_BNDCSR)
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index c19fc60ff06..4924f4be2b9 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -368,33 +368,58 @@
#define THERM_LOG_THRESHOLD1 (1 << 9)
/* MISC_ENABLE bits: architectural */
-#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0)
-#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1)
-#define MSR_IA32_MISC_ENABLE_EMON (1ULL << 7)
-#define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL (1ULL << 11)
-#define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL (1ULL << 12)
-#define MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP (1ULL << 16)
-#define MSR_IA32_MISC_ENABLE_MWAIT (1ULL << 18)
-#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1ULL << 22)
-#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1ULL << 23)
-#define MSR_IA32_MISC_ENABLE_XD_DISABLE (1ULL << 34)
+#define MSR_IA32_MISC_ENABLE_FAST_STRING_BIT 0
+#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << MSR_IA32_MISC_ENABLE_FAST_STRING_BIT)
+#define MSR_IA32_MISC_ENABLE_TCC_BIT 1
+#define MSR_IA32_MISC_ENABLE_TCC (1ULL << MSR_IA32_MISC_ENABLE_TCC_BIT)
+#define MSR_IA32_MISC_ENABLE_EMON_BIT 7
+#define MSR_IA32_MISC_ENABLE_EMON (1ULL << MSR_IA32_MISC_ENABLE_EMON_BIT)
+#define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL_BIT 11
+#define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL (1ULL << MSR_IA32_MISC_ENABLE_BTS_UNAVAIL_BIT)
+#define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL_BIT 12
+#define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL (1ULL << MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL_BIT)
+#define MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP_BIT 16
+#define MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP (1ULL << MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP_BIT)
+#define MSR_IA32_MISC_ENABLE_MWAIT_BIT 18
+#define MSR_IA32_MISC_ENABLE_MWAIT (1ULL << MSR_IA32_MISC_ENABLE_MWAIT_BIT)
+#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT 22
+#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1ULL << MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT);
+#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT 23
+#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_XD_DISABLE_BIT 34
+#define MSR_IA32_MISC_ENABLE_XD_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_XD_DISABLE_BIT)
/* MISC_ENABLE bits: model-specific, meaning may vary from core to core */
-#define MSR_IA32_MISC_ENABLE_X87_COMPAT (1ULL << 2)
-#define MSR_IA32_MISC_ENABLE_TM1 (1ULL << 3)
-#define MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE (1ULL << 4)
-#define MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE (1ULL << 6)
-#define MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK (1ULL << 8)
-#define MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE (1ULL << 9)
-#define MSR_IA32_MISC_ENABLE_FERR (1ULL << 10)
-#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX (1ULL << 10)
-#define MSR_IA32_MISC_ENABLE_TM2 (1ULL << 13)
-#define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE (1ULL << 19)
-#define MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK (1ULL << 20)
-#define MSR_IA32_MISC_ENABLE_L1D_CONTEXT (1ULL << 24)
-#define MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE (1ULL << 37)
-#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << 38)
-#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << 39)
+#define MSR_IA32_MISC_ENABLE_X87_COMPAT_BIT 2
+#define MSR_IA32_MISC_ENABLE_X87_COMPAT (1ULL << MSR_IA32_MISC_ENABLE_X87_COMPAT_BIT)
+#define MSR_IA32_MISC_ENABLE_TM1_BIT 3
+#define MSR_IA32_MISC_ENABLE_TM1 (1ULL << MSR_IA32_MISC_ENABLE_TM1_BIT)
+#define MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE_BIT 4
+#define MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE_BIT 6
+#define MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK_BIT 8
+#define MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK (1ULL << MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK_BIT)
+#define MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT 9
+#define MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_FERR_BIT 10
+#define MSR_IA32_MISC_ENABLE_FERR (1ULL << MSR_IA32_MISC_ENABLE_FERR_BIT)
+#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT 10
+#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX (1ULL << MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT)
+#define MSR_IA32_MISC_ENABLE_TM2_BIT 13
+#define MSR_IA32_MISC_ENABLE_TM2 (1ULL << MSR_IA32_MISC_ENABLE_TM2_BIT)
+#define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE_BIT 19
+#define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK_BIT 20
+#define MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK (1ULL << MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK_BIT)
+#define MSR_IA32_MISC_ENABLE_L1D_CONTEXT_BIT 24
+#define MSR_IA32_MISC_ENABLE_L1D_CONTEXT (1ULL << MSR_IA32_MISC_ENABLE_L1D_CONTEXT_BIT)
+#define MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE_BIT 37
+#define MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE_BIT 38
+#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_TURBO_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT 39
+#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT)
#define MSR_IA32_TSC_DEADLINE 0x000006E0
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 1dac94265b5..9f46f2b1cfc 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -613,10 +613,10 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
int nid;
nid = acpi_get_node(handle);
- if (nid == -1 || !node_online(nid))
- return;
- set_apicid_to_node(physid, nid);
- numa_set_node(cpu, nid);
+ if (nid != -1) {
+ set_apicid_to_node(physid, nid);
+ numa_set_node(cpu, nid);
+ }
#endif
}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 7f26c9a70a9..53e20531470 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -133,6 +133,10 @@ static inline void imcr_apic_to_pic(void)
* +1=force-enable
*/
static int force_enable_local_apic __initdata;
+
+/* Control whether x2APIC mode is enabled or not */
+static bool nox2apic __initdata;
+
/*
* APIC command line parameters
*/
@@ -162,8 +166,7 @@ int x2apic_mode;
/* x2apic enabled before OS handover */
int x2apic_preenabled;
static int x2apic_disabled;
-static int nox2apic;
-static __init int setup_nox2apic(char *str)
+static int __init setup_nox2apic(char *str)
{
if (x2apic_enabled()) {
int apicid = native_apic_msr_read(APIC_ID);
@@ -178,7 +181,7 @@ static __init int setup_nox2apic(char *str)
} else
setup_clear_cpu_cap(X86_FEATURE_X2APIC);
- nox2apic = 1;
+ nox2apic = true;
return 0;
}
@@ -283,8 +286,12 @@ u32 native_safe_apic_wait_icr_idle(void)
void native_apic_icr_write(u32 low, u32 id)
{
+ unsigned long flags;
+
+ local_irq_save(flags);
apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
apic_write(APIC_ICR, low);
+ local_irq_restore(flags);
}
u64 native_apic_icr_read(void)
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 2c621a6b901..7c1b2947951 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -198,7 +198,7 @@ static struct apic apic_flat = {
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
+ .wait_for_init_deassert = false,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = default_inquire_remote_apic,
@@ -314,7 +314,7 @@ static struct apic apic_physflat = {
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
+ .wait_for_init_deassert = false,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = default_inquire_remote_apic,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 191ce75c0e5..8c7c98249c2 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -172,8 +172,7 @@ struct apic apic_noop = {
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
-
+ .wait_for_init_deassert = false,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 3e67f9e3d7e..a5b45df8bc8 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -248,7 +248,7 @@ static const struct apic apic_numachip __refconst = {
.wakeup_secondary_cpu = numachip_wakeup_secondary,
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
+ .wait_for_init_deassert = false,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL, /* REMRD not supported */
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index d50e3640d5a..e4840aa7a25 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -199,8 +199,7 @@ static struct apic apic_bigsmp = {
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = default_wait_for_init_deassert,
-
+ .wait_for_init_deassert = true,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = default_inquire_remote_apic,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index c55224731b2..6f8f8b348a3 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -394,12 +394,6 @@ static void es7000_enable_apic_mode(void)
WARN(1, "Command failed, status = %x\n", mip_status);
}
-static void es7000_wait_for_init_deassert(atomic_t *deassert)
-{
- while (!atomic_read(deassert))
- cpu_relax();
-}
-
static unsigned int es7000_get_apic_id(unsigned long x)
{
return (x >> 24) & 0xFF;
@@ -658,8 +652,7 @@ static struct apic __refdata apic_es7000_cluster = {
.trampoline_phys_low = 0x467,
.trampoline_phys_high = 0x469,
- .wait_for_init_deassert = NULL,
-
+ .wait_for_init_deassert = false,
/* Nothing to do for most platforms, since cleared by the INIT cycle: */
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = default_inquire_remote_apic,
@@ -722,8 +715,7 @@ static struct apic __refdata apic_es7000 = {
.trampoline_phys_low = 0x467,
.trampoline_phys_high = 0x469,
- .wait_for_init_deassert = es7000_wait_for_init_deassert,
-
+ .wait_for_init_deassert = true,
/* Nothing to do for most platforms, since cleared by the INIT cycle: */
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = default_inquire_remote_apic,
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 1e42e8f305e..030ea1c04f7 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -505,8 +505,7 @@ static struct apic __refdata apic_numaq = {
.trampoline_phys_high = NUMAQ_TRAMPOLINE_PHYS_HIGH,
/* We don't do anything here because we use NMI's to boot instead */
- .wait_for_init_deassert = NULL,
-
+ .wait_for_init_deassert = false,
.smp_callin_clear_local_apic = numaq_smp_callin_clear_local_apic,
.inquire_remote_apic = NULL,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index eb35ef9ee63..cceb352c968 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -119,8 +119,7 @@ static struct apic apic_default = {
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = default_wait_for_init_deassert,
-
+ .wait_for_init_deassert = true,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = default_inquire_remote_apic,
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 00146f9b025..b656128611c 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -532,8 +532,7 @@ static struct apic apic_summit = {
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = default_wait_for_init_deassert,
-
+ .wait_for_init_deassert = true,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = default_inquire_remote_apic,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index cac85ee6913..e66766bf164 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -279,7 +279,7 @@ static struct apic apic_x2apic_cluster = {
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
+ .wait_for_init_deassert = false,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index de231e328ca..6d600ebf6c1 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -133,7 +133,7 @@ static struct apic apic_x2apic_phys = {
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
+ .wait_for_init_deassert = false,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index d263b1307de..7834389ba5b 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -396,7 +396,7 @@ static struct apic __refdata apic_x2apic_uv_x = {
.wakeup_secondary_cpu = uv_wakeup_secondary,
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
+ .wait_for_init_deassert = false,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL,
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index c67ffa68606..ce8b8ff0e0e 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -218,7 +218,7 @@ static void amd_k7_smp_check(struct cpuinfo_x86 *c)
*/
WARN_ONCE(1, "WARNING: This combination of AMD"
" processors is not suitable for SMP.\n");
- add_taint(TAINT_UNSAFE_SMP, LOCKDEP_NOW_UNRELIABLE);
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
}
static void init_amd_k7(struct cpuinfo_x86 *c)
@@ -233,9 +233,7 @@ static void init_amd_k7(struct cpuinfo_x86 *c)
if (c->x86_model >= 6 && c->x86_model <= 10) {
if (!cpu_has(c, X86_FEATURE_XMM)) {
printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
- rdmsr(MSR_K7_HWCR, l, h);
- l &= ~0x00008000;
- wrmsr(MSR_K7_HWCR, l, h);
+ msr_clear_bit(MSR_K7_HWCR, 15);
set_cpu_cap(c, X86_FEATURE_XMM);
}
}
@@ -509,14 +507,8 @@ static void early_init_amd(struct cpuinfo_x86 *c)
#endif
/* F16h erratum 793, CVE-2013-6885 */
- if (c->x86 == 0x16 && c->x86_model <= 0xf) {
- u64 val;
-
- rdmsrl(MSR_AMD64_LS_CFG, val);
- if (!(val & BIT(15)))
- wrmsrl(MSR_AMD64_LS_CFG, val | BIT(15));
- }
-
+ if (c->x86 == 0x16 && c->x86_model <= 0xf)
+ msr_set_bit(MSR_AMD64_LS_CFG, 15);
}
static const int amd_erratum_383[];
@@ -536,11 +528,8 @@ static void init_amd(struct cpuinfo_x86 *c)
* Errata 63 for SH-B3 steppings
* Errata 122 for all steppings (F+ have it disabled by default)
*/
- if (c->x86 == 0xf) {
- rdmsrl(MSR_K7_HWCR, value);
- value |= 1 << 6;
- wrmsrl(MSR_K7_HWCR, value);
- }
+ if (c->x86 == 0xf)
+ msr_set_bit(MSR_K7_HWCR, 6);
#endif
early_init_amd(c);
@@ -623,14 +612,11 @@ static void init_amd(struct cpuinfo_x86 *c)
(c->x86_model >= 0x10) && (c->x86_model <= 0x1f) &&
!cpu_has(c, X86_FEATURE_TOPOEXT)) {
- if (!rdmsrl_safe(0xc0011005, &value)) {
- value |= 1ULL << 54;
- wrmsrl_safe(0xc0011005, value);
+ if (msr_set_bit(0xc0011005, 54) > 0) {
rdmsrl(0xc0011005, value);
- if (value & (1ULL << 54)) {
+ if (value & BIT_64(54)) {
set_cpu_cap(c, X86_FEATURE_TOPOEXT);
- printk(KERN_INFO FW_INFO "CPU: Re-enabling "
- "disabled Topology Extensions Support\n");
+ pr_info(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
}
}
}
@@ -709,19 +695,12 @@ static void init_amd(struct cpuinfo_x86 *c)
* Disable GART TLB Walk Errors on Fam10h. We do this here
* because this is always needed when GART is enabled, even in a
* kernel which has no MCE support built in.
- * BIOS should disable GartTlbWlk Errors themself. If
- * it doesn't do it here as suggested by the BKDG.
+ * BIOS should disable GartTlbWlk Errors already. If
+ * it doesn't, do it here as suggested by the BKDG.
*
* Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
*/
- u64 mask;
- int err;
-
- err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask);
- if (err == 0) {
- mask |= (1 << 10);
- wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask);
- }
+ msr_set_bit(MSR_AMD64_MCx_MASK(4), 10);
/*
* On family 10h BIOS may not have properly enabled WC+ support,
@@ -733,10 +712,7 @@ static void init_amd(struct cpuinfo_x86 *c)
* NOTE: we want to use the _safe accessors so as not to #GP kvm
* guests on older kvm hosts.
*/
-
- rdmsrl_safe(MSR_AMD64_BU_CFG2, &value);
- value &= ~(1ULL << 24);
- wrmsrl_safe(MSR_AMD64_BU_CFG2, value);
+ msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
if (cpu_has_amd_erratum(c, amd_erratum_383))
set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 5cd9bfabd64..897d6201ef1 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -31,11 +31,8 @@ static void early_init_intel(struct cpuinfo_x86 *c)
/* Unmask CPUID levels if masked: */
if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
- rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
-
- if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
- misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
- wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+ if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) {
c->cpuid_level = cpuid_eax(0);
get_cpu_cap(c);
}
@@ -129,16 +126,10 @@ static void early_init_intel(struct cpuinfo_x86 *c)
* Ingo Molnar reported a Pentium D (model 6) and a Xeon
* (model 2) with the same problem.
*/
- if (c->x86 == 15) {
- rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
-
- if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
- printk(KERN_INFO "kmemcheck: Disabling fast string operations\n");
-
- misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
- wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
- }
- }
+ if (c->x86 == 15)
+ if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0)
+ pr_info("kmemcheck: Disabling fast string operations\n");
#endif
/*
@@ -195,10 +186,16 @@ static void intel_smp_check(struct cpuinfo_x86 *c)
}
}
-static void intel_workarounds(struct cpuinfo_x86 *c)
+static int forcepae;
+static int __init forcepae_setup(char *__unused)
{
- unsigned long lo, hi;
+ forcepae = 1;
+ return 1;
+}
+__setup("forcepae", forcepae_setup);
+static void intel_workarounds(struct cpuinfo_x86 *c)
+{
#ifdef CONFIG_X86_F00F_BUG
/*
* All current models of Pentium and Pentium with MMX technology CPUs
@@ -225,16 +222,26 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
clear_cpu_cap(c, X86_FEATURE_SEP);
/*
+ * PAE CPUID issue: many Pentium M report no PAE but may have a
+ * functionally usable PAE implementation.
+ * Forcefully enable PAE if kernel parameter "forcepae" is present.
+ */
+ if (forcepae) {
+ printk(KERN_WARNING "PAE forced!\n");
+ set_cpu_cap(c, X86_FEATURE_PAE);
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
+ }
+
+ /*
* P4 Xeon errata 037 workaround.
* Hardware prefetcher may cause stale data to be loaded into the cache.
*/
if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
- rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
- if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) {
- printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
- printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
- lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE;
- wrmsr(MSR_IA32_MISC_ENABLE, lo, hi);
+ if (msr_set_bit(MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT)
+ > 0) {
+ pr_info("CPU: C0 stepping P4 Xeon detected.\n");
+ pr_info("CPU: Disabling hardware prefetching (Errata 037)\n");
}
}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 79f9f848bee..ae407f7226c 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -892,7 +892,6 @@ static void x86_pmu_enable(struct pmu *pmu)
* hw_perf_group_sched_in() or x86_pmu_enable()
*
* step1: save events moving to new counters
- * step2: reprogram moved events into new counters
*/
for (i = 0; i < n_running; i++) {
event = cpuc->event_list[i];
@@ -918,6 +917,9 @@ static void x86_pmu_enable(struct pmu *pmu)
x86_pmu_stop(event, PERF_EF_UPDATE);
}
+ /*
+ * step2: reprogram moved events into new counters
+ */
for (i = 0; i < cpuc->n_events; i++) {
event = cpuc->event_list[i];
hwc = &event->hw;
@@ -1043,7 +1045,7 @@ static int x86_pmu_add(struct perf_event *event, int flags)
/*
* If group events scheduling transaction was started,
* skip the schedulability test here, it will be performed
- * at commit time (->commit_txn) as a whole
+ * at commit time (->commit_txn) as a whole.
*/
if (cpuc->group_flag & PERF_EVENT_TXN)
goto done_collect;
@@ -1058,6 +1060,10 @@ static int x86_pmu_add(struct perf_event *event, int flags)
memcpy(cpuc->assign, assign, n*sizeof(int));
done_collect:
+ /*
+ * Commit the collect_events() state. See x86_pmu_del() and
+ * x86_pmu_*_txn().
+ */
cpuc->n_events = n;
cpuc->n_added += n - n0;
cpuc->n_txn += n - n0;
@@ -1183,28 +1189,38 @@ static void x86_pmu_del(struct perf_event *event, int flags)
* If we're called during a txn, we don't need to do anything.
* The events never got scheduled and ->cancel_txn will truncate
* the event_list.
+ *
+ * XXX assumes any ->del() called during a TXN will only be on
+ * an event added during that same TXN.
*/
if (cpuc->group_flag & PERF_EVENT_TXN)
return;
+ /*
+ * Not a TXN, therefore cleanup properly.
+ */
x86_pmu_stop(event, PERF_EF_UPDATE);
for (i = 0; i < cpuc->n_events; i++) {
- if (event == cpuc->event_list[i]) {
+ if (event == cpuc->event_list[i])
+ break;
+ }
- if (i >= cpuc->n_events - cpuc->n_added)
- --cpuc->n_added;
+ if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
+ return;
- if (x86_pmu.put_event_constraints)
- x86_pmu.put_event_constraints(cpuc, event);
+ /* If we have a newly added event; make sure to decrease n_added. */
+ if (i >= cpuc->n_events - cpuc->n_added)
+ --cpuc->n_added;
- while (++i < cpuc->n_events)
- cpuc->event_list[i-1] = cpuc->event_list[i];
+ if (x86_pmu.put_event_constraints)
+ x86_pmu.put_event_constraints(cpuc, event);
+
+ /* Delete the array entry. */
+ while (++i < cpuc->n_events)
+ cpuc->event_list[i-1] = cpuc->event_list[i];
+ --cpuc->n_events;
- --cpuc->n_events;
- break;
- }
- }
perf_event_update_userpage(event);
}
@@ -1598,7 +1614,8 @@ static void x86_pmu_cancel_txn(struct pmu *pmu)
{
__this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
/*
- * Truncate the collected events.
+ * Truncate collected array by the number of events added in this
+ * transaction. See x86_pmu_add() and x86_pmu_*_txn().
*/
__this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
__this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
@@ -1609,6 +1626,8 @@ static void x86_pmu_cancel_txn(struct pmu *pmu)
* Commit group events scheduling transaction
* Perform the group schedulability test as a whole
* Return 0 if success
+ *
+ * Does not cancel the transaction on failure; expects the caller to do this.
*/
static int x86_pmu_commit_txn(struct pmu *pmu)
{
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 4972c244d0b..3b2f9bdd974 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -130,9 +130,11 @@ struct cpu_hw_events {
unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
int enabled;
- int n_events;
- int n_added;
- int n_txn;
+ int n_events; /* the # of events in the below arrays */
+ int n_added; /* the # last events in the below arrays;
+ they've never been enabled yet */
+ int n_txn; /* the # last events in the below arrays;
+ added in the current transaction */
int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
u64 tags[X86_PMC_IDX_MAX];
struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 047f540cf3f..bd2253d40cf 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -66,6 +66,47 @@ DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4");
DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31");
DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63");
+static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
+static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
+static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event);
+static void uncore_pmu_event_read(struct perf_event *event);
+
+static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
+{
+ return container_of(event->pmu, struct intel_uncore_pmu, pmu);
+}
+
+static struct intel_uncore_box *
+uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
+{
+ struct intel_uncore_box *box;
+
+ box = *per_cpu_ptr(pmu->box, cpu);
+ if (box)
+ return box;
+
+ raw_spin_lock(&uncore_box_lock);
+ list_for_each_entry(box, &pmu->box_list, list) {
+ if (box->phys_id == topology_physical_package_id(cpu)) {
+ atomic_inc(&box->refcnt);
+ *per_cpu_ptr(pmu->box, cpu) = box;
+ break;
+ }
+ }
+ raw_spin_unlock(&uncore_box_lock);
+
+ return *per_cpu_ptr(pmu->box, cpu);
+}
+
+static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
+{
+ /*
+ * perf core schedules event on the basis of cpu, uncore events are
+ * collected by one of the cpus inside a physical package.
+ */
+ return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id());
+}
+
static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
{
u64 count;
@@ -1639,6 +1680,349 @@ static struct intel_uncore_type *snb_msr_uncores[] = {
&snb_uncore_cbox,
NULL,
};
+
+enum {
+ SNB_PCI_UNCORE_IMC,
+};
+
+static struct uncore_event_desc snb_uncore_imc_events[] = {
+ INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"),
+ INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"),
+
+ INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"),
+ INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"),
+
+ { /* end: all zeroes */ },
+};
+
+#define SNB_UNCORE_PCI_IMC_EVENT_MASK 0xff
+#define SNB_UNCORE_PCI_IMC_BAR_OFFSET 0x48
+
+/* page size multiple covering all config regs */
+#define SNB_UNCORE_PCI_IMC_MAP_SIZE 0x6000
+
+#define SNB_UNCORE_PCI_IMC_DATA_READS 0x1
+#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE 0x5050
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES 0x2
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054
+#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE
+
+static struct attribute *snb_uncore_imc_formats_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group snb_uncore_imc_format_group = {
+ .name = "format",
+ .attrs = snb_uncore_imc_formats_attr,
+};
+
+static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
+ resource_size_t addr;
+ u32 pci_dword;
+
+ pci_read_config_dword(pdev, where, &pci_dword);
+ addr = pci_dword;
+
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ pci_read_config_dword(pdev, where + 4, &pci_dword);
+ addr |= ((resource_size_t)pci_dword << 32);
+#endif
+
+ addr &= ~(PAGE_SIZE - 1);
+
+ box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
+ box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
+}
+
+static void snb_uncore_imc_enable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_disable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ return (u64)*(unsigned int *)(box->io_addr + hwc->event_base);
+}
+
+/*
+ * custom event_init() function because we define our own fixed, free
+ * running counters, so we do not want to conflict with generic uncore
+ * logic. Also simplifies processing
+ */
+static int snb_uncore_imc_event_init(struct perf_event *event)
+{
+ struct intel_uncore_pmu *pmu;
+ struct intel_uncore_box *box;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK;
+ int idx, base;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ pmu = uncore_event_to_pmu(event);
+ /* no device found for this pmu */
+ if (pmu->func_id < 0)
+ return -ENOENT;
+
+ /* Sampling not supported yet */
+ if (hwc->sample_period)
+ return -EINVAL;
+
+ /* unsupported modes and filters */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ /*
+ * Place all uncore events for a particular physical package
+ * onto a single cpu
+ */
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ /* check only supported bits are set */
+ if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK)
+ return -EINVAL;
+
+ box = uncore_pmu_to_box(pmu, event->cpu);
+ if (!box || box->cpu < 0)
+ return -EINVAL;
+
+ event->cpu = box->cpu;
+
+ event->hw.idx = -1;
+ event->hw.last_tag = ~0ULL;
+ event->hw.extra_reg.idx = EXTRA_REG_NONE;
+ event->hw.branch_reg.idx = EXTRA_REG_NONE;
+ /*
+ * check event is known (whitelist, determines counter)
+ */
+ switch (cfg) {
+ case SNB_UNCORE_PCI_IMC_DATA_READS:
+ base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE;
+ idx = UNCORE_PMC_IDX_FIXED;
+ break;
+ case SNB_UNCORE_PCI_IMC_DATA_WRITES:
+ base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
+ idx = UNCORE_PMC_IDX_FIXED + 1;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* must be done before validate_group */
+ event->hw.event_base = base;
+ event->hw.config = cfg;
+ event->hw.idx = idx;
+
+ /* no group validation needed, we have free running counters */
+
+ return 0;
+}
+
+static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return 0;
+}
+
+static void snb_uncore_imc_event_start(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ u64 count;
+
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ event->hw.state = 0;
+ box->n_active++;
+
+ list_add_tail(&event->active_entry, &box->active_list);
+
+ count = snb_uncore_imc_read_counter(box, event);
+ local64_set(&event->hw.prev_count, count);
+
+ if (box->n_active == 1)
+ uncore_pmu_start_hrtimer(box);
+}
+
+static void snb_uncore_imc_event_stop(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!(hwc->state & PERF_HES_STOPPED)) {
+ box->n_active--;
+
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+
+ list_del(&event->active_entry);
+
+ if (box->n_active == 0)
+ uncore_pmu_cancel_hrtimer(box);
+ }
+
+ if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of a event
+ * that we are disabling:
+ */
+ uncore_perf_event_update(box, event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+}
+
+static int snb_uncore_imc_event_add(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!box)
+ return -ENODEV;
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+ if (!(flags & PERF_EF_START))
+ hwc->state |= PERF_HES_ARCH;
+
+ snb_uncore_imc_event_start(event, 0);
+
+ box->n_events++;
+
+ return 0;
+}
+
+static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ int i;
+
+ snb_uncore_imc_event_stop(event, PERF_EF_UPDATE);
+
+ for (i = 0; i < box->n_events; i++) {
+ if (event == box->event_list[i]) {
+ --box->n_events;
+ break;
+ }
+ }
+}
+
+static int snb_pci2phy_map_init(int devid)
+{
+ struct pci_dev *dev = NULL;
+ int bus;
+
+ dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev);
+ if (!dev)
+ return -ENOTTY;
+
+ bus = dev->bus->number;
+
+ pcibus_to_physid[bus] = 0;
+
+ pci_dev_put(dev);
+
+ return 0;
+}
+
+static struct pmu snb_uncore_imc_pmu = {
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = snb_uncore_imc_event_init,
+ .add = snb_uncore_imc_event_add,
+ .del = snb_uncore_imc_event_del,
+ .start = snb_uncore_imc_event_start,
+ .stop = snb_uncore_imc_event_stop,
+ .read = uncore_pmu_event_read,
+};
+
+static struct intel_uncore_ops snb_uncore_imc_ops = {
+ .init_box = snb_uncore_imc_init_box,
+ .enable_box = snb_uncore_imc_enable_box,
+ .disable_box = snb_uncore_imc_disable_box,
+ .disable_event = snb_uncore_imc_disable_event,
+ .enable_event = snb_uncore_imc_enable_event,
+ .hw_config = snb_uncore_imc_hw_config,
+ .read_counter = snb_uncore_imc_read_counter,
+};
+
+static struct intel_uncore_type snb_uncore_imc = {
+ .name = "imc",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .fixed_ctr_bits = 32,
+ .fixed_ctr = SNB_UNCORE_PCI_IMC_CTR_BASE,
+ .event_descs = snb_uncore_imc_events,
+ .format_group = &snb_uncore_imc_format_group,
+ .perf_ctr = SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
+ .event_mask = SNB_UNCORE_PCI_IMC_EVENT_MASK,
+ .ops = &snb_uncore_imc_ops,
+ .pmu = &snb_uncore_imc_pmu,
+};
+
+static struct intel_uncore_type *snb_pci_uncores[] = {
+ [SNB_PCI_UNCORE_IMC] = &snb_uncore_imc,
+ NULL,
+};
+
+static DEFINE_PCI_DEVICE_TABLE(snb_uncore_pci_ids) = {
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* end: all zeroes */ },
+};
+
+static DEFINE_PCI_DEVICE_TABLE(ivb_uncore_pci_ids) = {
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* end: all zeroes */ },
+};
+
+static DEFINE_PCI_DEVICE_TABLE(hsw_uncore_pci_ids) = {
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* end: all zeroes */ },
+};
+
+static struct pci_driver snb_uncore_pci_driver = {
+ .name = "snb_uncore",
+ .id_table = snb_uncore_pci_ids,
+};
+
+static struct pci_driver ivb_uncore_pci_driver = {
+ .name = "ivb_uncore",
+ .id_table = ivb_uncore_pci_ids,
+};
+
+static struct pci_driver hsw_uncore_pci_driver = {
+ .name = "hsw_uncore",
+ .id_table = hsw_uncore_pci_ids,
+};
+
/* end of Sandy Bridge uncore support */
/* Nehalem uncore support */
@@ -2789,6 +3173,7 @@ again:
static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
{
struct intel_uncore_box *box;
+ struct perf_event *event;
unsigned long flags;
int bit;
@@ -2801,19 +3186,27 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
*/
local_irq_save(flags);
+ /*
+ * handle boxes with an active event list as opposed to active
+ * counters
+ */
+ list_for_each_entry(event, &box->active_list, active_entry) {
+ uncore_perf_event_update(box, event);
+ }
+
for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)
uncore_perf_event_update(box, box->events[bit]);
local_irq_restore(flags);
- hrtimer_forward_now(hrtimer, ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL));
+ hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration));
return HRTIMER_RESTART;
}
static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
{
__hrtimer_start_range_ns(&box->hrtimer,
- ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL), 0,
+ ns_to_ktime(box->hrtimer_duration), 0,
HRTIMER_MODE_REL_PINNED, 0);
}
@@ -2847,43 +3240,12 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,
box->cpu = -1;
box->phys_id = -1;
- return box;
-}
+ /* set default hrtimer timeout */
+ box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL;
-static struct intel_uncore_box *
-uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
-{
- struct intel_uncore_box *box;
+ INIT_LIST_HEAD(&box->active_list);
- box = *per_cpu_ptr(pmu->box, cpu);
- if (box)
- return box;
-
- raw_spin_lock(&uncore_box_lock);
- list_for_each_entry(box, &pmu->box_list, list) {
- if (box->phys_id == topology_physical_package_id(cpu)) {
- atomic_inc(&box->refcnt);
- *per_cpu_ptr(pmu->box, cpu) = box;
- break;
- }
- }
- raw_spin_unlock(&uncore_box_lock);
-
- return *per_cpu_ptr(pmu->box, cpu);
-}
-
-static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
-{
- return container_of(event->pmu, struct intel_uncore_pmu, pmu);
-}
-
-static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
-{
- /*
- * perf core schedules event on the basis of cpu, uncore events are
- * collected by one of the cpus inside a physical package.
- */
- return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id());
+ return box;
}
static int
@@ -3279,16 +3641,21 @@ static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu)
{
int ret;
- pmu->pmu = (struct pmu) {
- .attr_groups = pmu->type->attr_groups,
- .task_ctx_nr = perf_invalid_context,
- .event_init = uncore_pmu_event_init,
- .add = uncore_pmu_event_add,
- .del = uncore_pmu_event_del,
- .start = uncore_pmu_event_start,
- .stop = uncore_pmu_event_stop,
- .read = uncore_pmu_event_read,
- };
+ if (!pmu->type->pmu) {
+ pmu->pmu = (struct pmu) {
+ .attr_groups = pmu->type->attr_groups,
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = uncore_pmu_event_init,
+ .add = uncore_pmu_event_add,
+ .del = uncore_pmu_event_del,
+ .start = uncore_pmu_event_start,
+ .stop = uncore_pmu_event_stop,
+ .read = uncore_pmu_event_read,
+ };
+ } else {
+ pmu->pmu = *pmu->type->pmu;
+ pmu->pmu.attr_groups = pmu->type->attr_groups;
+ }
if (pmu->type->num_boxes == 1) {
if (strlen(pmu->type->name) > 0)
@@ -3502,6 +3869,28 @@ static int __init uncore_pci_init(void)
pci_uncores = ivt_pci_uncores;
uncore_pci_driver = &ivt_uncore_pci_driver;
break;
+ case 42: /* Sandy Bridge */
+ ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_SNB_IMC);
+ if (ret)
+ return ret;
+ pci_uncores = snb_pci_uncores;
+ uncore_pci_driver = &snb_uncore_pci_driver;
+ break;
+ case 58: /* Ivy Bridge */
+ ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_IVB_IMC);
+ if (ret)
+ return ret;
+ pci_uncores = snb_pci_uncores;
+ uncore_pci_driver = &ivb_uncore_pci_driver;
+ break;
+ case 60: /* Haswell */
+ case 69: /* Haswell Celeron */
+ ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_HSW_IMC);
+ if (ret)
+ return ret;
+ pci_uncores = snb_pci_uncores;
+ uncore_pci_driver = &hsw_uncore_pci_driver;
+ break;
default:
return 0;
}
@@ -3773,7 +4162,7 @@ static void __init uncore_cpu_setup(void *dummy)
static int __init uncore_cpu_init(void)
{
- int ret, cpu, max_cores;
+ int ret, max_cores;
max_cores = boot_cpu_data.x86_max_cores;
switch (boot_cpu_data.x86_model) {
@@ -3817,29 +4206,6 @@ static int __init uncore_cpu_init(void)
if (ret)
return ret;
- get_online_cpus();
-
- for_each_online_cpu(cpu) {
- int i, phys_id = topology_physical_package_id(cpu);
-
- for_each_cpu(i, &uncore_cpu_mask) {
- if (phys_id == topology_physical_package_id(i)) {
- phys_id = -1;
- break;
- }
- }
- if (phys_id < 0)
- continue;
-
- uncore_cpu_prepare(cpu, phys_id);
- uncore_event_init_cpu(cpu);
- }
- on_each_cpu(uncore_cpu_setup, NULL, 1);
-
- register_cpu_notifier(&uncore_cpu_nb);
-
- put_online_cpus();
-
return 0;
}
@@ -3868,6 +4234,41 @@ static int __init uncore_pmus_register(void)
return 0;
}
+static void __init uncore_cpumask_init(void)
+{
+ int cpu;
+
+ /*
+ * ony invoke once from msr or pci init code
+ */
+ if (!cpumask_empty(&uncore_cpu_mask))
+ return;
+
+ get_online_cpus();
+
+ for_each_online_cpu(cpu) {
+ int i, phys_id = topology_physical_package_id(cpu);
+
+ for_each_cpu(i, &uncore_cpu_mask) {
+ if (phys_id == topology_physical_package_id(i)) {
+ phys_id = -1;
+ break;
+ }
+ }
+ if (phys_id < 0)
+ continue;
+
+ uncore_cpu_prepare(cpu, phys_id);
+ uncore_event_init_cpu(cpu);
+ }
+ on_each_cpu(uncore_cpu_setup, NULL, 1);
+
+ register_cpu_notifier(&uncore_cpu_nb);
+
+ put_online_cpus();
+}
+
+
static int __init intel_uncore_init(void)
{
int ret;
@@ -3886,6 +4287,7 @@ static int __init intel_uncore_init(void)
uncore_pci_exit();
goto fail;
}
+ uncore_cpumask_init();
uncore_pmus_register();
return 0;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index a80ab71a883..90236f0c94a 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -6,6 +6,7 @@
#define UNCORE_PMU_NAME_LEN 32
#define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC)
+#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC)
#define UNCORE_FIXED_EVENT 0xff
#define UNCORE_PMC_IDX_MAX_GENERIC 8
@@ -440,6 +441,7 @@ struct intel_uncore_type {
struct intel_uncore_ops *ops;
struct uncore_event_desc *event_descs;
const struct attribute_group *attr_groups[4];
+ struct pmu *pmu; /* for custom pmu ops */
};
#define pmu_group attr_groups[0]
@@ -488,8 +490,11 @@ struct intel_uncore_box {
u64 tags[UNCORE_PMC_IDX_MAX];
struct pci_dev *pci_dev;
struct intel_uncore_pmu *pmu;
+ u64 hrtimer_duration; /* hrtimer timeout for this box */
struct hrtimer hrtimer;
struct list_head list;
+ struct list_head active_list;
+ void *io_addr;
struct intel_uncore_extra_reg shared_regs[0];
};
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 3486e666035..5d466b7d860 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -1257,7 +1257,24 @@ again:
pass++;
goto again;
}
-
+ /*
+ * Perf does test runs to see if a whole group can be assigned
+ * together succesfully. There can be multiple rounds of this.
+ * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config
+ * bits, such that the next round of group assignments will
+ * cause the above p4_should_swap_ts to pass instead of fail.
+ * This leads to counters exclusive to thread0 being used by
+ * thread1.
+ *
+ * Solve this with a cheap hack, reset the idx back to -1 to
+ * force a new lookup (p4_next_cntr) to get the right counter
+ * for the right thread.
+ *
+ * This probably doesn't comply with the general spirit of how
+ * perf wants to work, but P4 is special. :-(
+ */
+ if (p4_should_swap_ts(hwc->config, cpu))
+ hwc->idx = -1;
p4_pmu_swap_config_ts(hwc, cpu);
if (assign)
assign[i] = cntr_idx;
@@ -1322,6 +1339,7 @@ static __initconst const struct x86_pmu p4_pmu = {
__init int p4_pmu_init(void)
{
unsigned int low, high;
+ int i, reg;
/* If we get stripped -- indexing fails */
BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC);
@@ -1340,5 +1358,19 @@ __init int p4_pmu_init(void)
x86_pmu = p4_pmu;
+ /*
+ * Even though the counters are configured to interrupt a particular
+ * logical processor when an overflow happens, testing has shown that
+ * on kdump kernels (which uses a single cpu), thread1's counter
+ * continues to run and will report an NMI on thread0. Due to the
+ * overflow bug, this leads to a stream of unknown NMIs.
+ *
+ * Solve this by zero'ing out the registers to mimic a reset.
+ */
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ reg = x86_pmu_config_addr(i);
+ wrmsrl_safe(reg, 0ULL);
+ }
+
return 0;
}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index a57902efe2d..507de806659 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -57,9 +57,7 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
{
#ifdef CONFIG_X86_32
struct pt_regs fixed_regs;
-#endif
-#ifdef CONFIG_X86_32
if (!user_mode_vm(regs)) {
crash_fixup_ss_esp(&fixed_regs, regs);
regs = &fixed_regs;
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index f2a1770ca17..a21d49c071d 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -30,7 +30,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long dummy;
stack = &dummy;
- if (task && task != current)
+ if (task != current)
stack = (unsigned long *)task->thread.sp;
}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 6fcb49ce50a..b4872b999a7 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -87,6 +87,7 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic);
#define nmi_to_desc(type) (&nmi_desc[type])
static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC;
+
static int __init nmi_warning_debugfs(void)
{
debugfs_create_u64("nmi_longest_ns", 0644,
@@ -95,6 +96,20 @@ static int __init nmi_warning_debugfs(void)
}
fs_initcall(nmi_warning_debugfs);
+static void nmi_max_handler(struct irq_work *w)
+{
+ struct nmiaction *a = container_of(w, struct nmiaction, irq_work);
+ int remainder_ns, decimal_msecs;
+ u64 whole_msecs = ACCESS_ONCE(a->max_duration);
+
+ remainder_ns = do_div(whole_msecs, (1000 * 1000));
+ decimal_msecs = remainder_ns / 1000;
+
+ printk_ratelimited(KERN_INFO
+ "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
+ a->handler, whole_msecs, decimal_msecs);
+}
+
static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
{
struct nmi_desc *desc = nmi_to_desc(type);
@@ -110,26 +125,20 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2
* to handle those situations.
*/
list_for_each_entry_rcu(a, &desc->head, list) {
- u64 before, delta, whole_msecs;
- int remainder_ns, decimal_msecs, thishandled;
+ int thishandled;
+ u64 delta;
- before = sched_clock();
+ delta = sched_clock();
thishandled = a->handler(type, regs);
handled += thishandled;
- delta = sched_clock() - before;
+ delta = sched_clock() - delta;
trace_nmi_handler(a->handler, (int)delta, thishandled);
- if (delta < nmi_longest_ns)
+ if (delta < nmi_longest_ns || delta < a->max_duration)
continue;
- nmi_longest_ns = delta;
- whole_msecs = delta;
- remainder_ns = do_div(whole_msecs, (1000 * 1000));
- decimal_msecs = remainder_ns / 1000;
- printk_ratelimited(KERN_INFO
- "INFO: NMI handler (%ps) took too long to run: "
- "%lld.%03d msecs\n", a->handler, whole_msecs,
- decimal_msecs);
+ a->max_duration = delta;
+ irq_work_queue(&a->irq_work);
}
rcu_read_unlock();
@@ -146,6 +155,8 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
if (!action->handler)
return -EINVAL;
+ init_irq_work(&action->irq_work, nmi_max_handler);
+
spin_lock_irqsave(&desc->lock, flags);
/*
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3fb8d95ab8b..4505e2a950d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -298,10 +298,7 @@ void arch_cpu_idle_dead(void)
*/
void arch_cpu_idle(void)
{
- if (cpuidle_idle_call())
- x86_idle();
- else
- local_irq_enable();
+ x86_idle();
}
/*
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ce72964b2f4..fa511acff7e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -926,11 +926,11 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_EFI
if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
"EL32", 4)) {
- set_bit(EFI_BOOT, &x86_efi_facility);
+ set_bit(EFI_BOOT, &efi.flags);
} else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
"EL64", 4)) {
- set_bit(EFI_BOOT, &x86_efi_facility);
- set_bit(EFI_64BIT, &x86_efi_facility);
+ set_bit(EFI_BOOT, &efi.flags);
+ set_bit(EFI_64BIT, &efi.flags);
}
if (efi_enabled(EFI_BOOT))
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index a32da804252..60179ec39d4 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -122,8 +122,9 @@ static void smp_callin(void)
* Since CPU0 is not wakened up by INIT, it doesn't wait for the IPI.
*/
cpuid = smp_processor_id();
- if (apic->wait_for_init_deassert && cpuid != 0)
- apic->wait_for_init_deassert(&init_deasserted);
+ if (apic->wait_for_init_deassert && cpuid)
+ while (!atomic_read(&init_deasserted))
+ cpu_relax();
/*
* (This works even if the APIC is not enabled.)
@@ -701,11 +702,15 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid,
int id;
int boot_error;
+ preempt_disable();
+
/*
* Wake up AP by INIT, INIT, STARTUP sequence.
*/
- if (cpu)
- return wakeup_secondary_cpu_via_init(apicid, start_ip);
+ if (cpu) {
+ boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
+ goto out;
+ }
/*
* Wake up BSP by nmi.
@@ -725,6 +730,9 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid,
boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip);
}
+out:
+ preempt_enable();
+
return boot_error;
}
diff --git a/arch/x86/lib/hash.c b/arch/x86/lib/hash.c
index 3056702e81f..ff4fa51a5b1 100644
--- a/arch/x86/lib/hash.c
+++ b/arch/x86/lib/hash.c
@@ -32,6 +32,7 @@
*/
#include <linux/hash.h>
+#include <linux/init.h>
#include <asm/processor.h>
#include <asm/cpufeature.h>
@@ -39,7 +40,11 @@
static inline u32 crc32_u32(u32 crc, u32 val)
{
+#ifdef CONFIG_AS_CRC32
asm ("crc32l %1,%0\n" : "+r" (crc) : "rm" (val));
+#else
+ asm (".byte 0xf2, 0x0f, 0x38, 0xf1, 0xc1" : "+a" (crc) : "c" (val));
+#endif
return crc;
}
@@ -49,19 +54,18 @@ static u32 intel_crc4_2_hash(const void *data, u32 len, u32 seed)
u32 i, tmp = 0;
for (i = 0; i < len / 4; i++)
- seed = crc32_u32(*p32++, seed);
+ seed = crc32_u32(seed, *p32++);
- switch (3 - (len & 0x03)) {
- case 0:
+ switch (len & 3) {
+ case 3:
tmp |= *((const u8 *) p32 + 2) << 16;
/* fallthrough */
- case 1:
+ case 2:
tmp |= *((const u8 *) p32 + 1) << 8;
/* fallthrough */
- case 2:
+ case 1:
tmp |= *((const u8 *) p32);
- seed = crc32_u32(tmp, seed);
- default:
+ seed = crc32_u32(seed, tmp);
break;
}
@@ -74,12 +78,12 @@ static u32 intel_crc4_2_hash2(const u32 *data, u32 len, u32 seed)
u32 i;
for (i = 0; i < len; i++)
- seed = crc32_u32(*p32++, seed);
+ seed = crc32_u32(seed, *p32++);
return seed;
}
-void setup_arch_fast_hash(struct fast_hash_ops *ops)
+void __init setup_arch_fast_hash(struct fast_hash_ops *ops)
{
if (cpu_has_xmm4_2) {
ops->hash = intel_crc4_2_hash;
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 8f8eebdca7d..db9db446b71 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -8,7 +8,7 @@ struct msr *msrs_alloc(void)
msrs = alloc_percpu(struct msr);
if (!msrs) {
- pr_warning("%s: error allocating msrs\n", __func__);
+ pr_warn("%s: error allocating msrs\n", __func__);
return NULL;
}
@@ -21,3 +21,90 @@ void msrs_free(struct msr *msrs)
free_percpu(msrs);
}
EXPORT_SYMBOL(msrs_free);
+
+/**
+ * Read an MSR with error handling
+ *
+ * @msr: MSR to read
+ * @m: value to read into
+ *
+ * It returns read data only on success, otherwise it doesn't change the output
+ * argument @m.
+ *
+ */
+int msr_read(u32 msr, struct msr *m)
+{
+ int err;
+ u64 val;
+
+ err = rdmsrl_safe(msr, &val);
+ if (!err)
+ m->q = val;
+
+ return err;
+}
+
+/**
+ * Write an MSR with error handling
+ *
+ * @msr: MSR to write
+ * @m: value to write
+ */
+int msr_write(u32 msr, struct msr *m)
+{
+ return wrmsrl_safe(msr, m->q);
+}
+
+static inline int __flip_bit(u32 msr, u8 bit, bool set)
+{
+ struct msr m, m1;
+ int err = -EINVAL;
+
+ if (bit > 63)
+ return err;
+
+ err = msr_read(msr, &m);
+ if (err)
+ return err;
+
+ m1 = m;
+ if (set)
+ m1.q |= BIT_64(bit);
+ else
+ m1.q &= ~BIT_64(bit);
+
+ if (m1.q == m.q)
+ return 0;
+
+ err = msr_write(msr, &m);
+ if (err)
+ return err;
+
+ return 1;
+}
+
+/**
+ * Set @bit in a MSR @msr.
+ *
+ * Retval:
+ * < 0: An error was encountered.
+ * = 0: Bit was already set.
+ * > 0: Hardware accepted the MSR write.
+ */
+int msr_set_bit(u32 msr, u8 bit)
+{
+ return __flip_bit(msr, bit, true);
+}
+
+/**
+ * Clear @bit in a MSR @msr.
+ *
+ * Retval:
+ * < 0: An error was encountered.
+ * = 0: Bit was already cleared.
+ * > 0: Hardware accepted the MSR write.
+ */
+int msr_clear_bit(u32 msr, u8 bit)
+{
+ return __flip_bit(msr, bit, false);
+}
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 0002a3a3308..20621d753d5 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -30,6 +30,7 @@ struct pg_state {
unsigned long start_address;
unsigned long current_address;
const struct addr_marker *marker;
+ bool to_dmesg;
};
struct addr_marker {
@@ -88,10 +89,28 @@ static struct addr_marker address_markers[] = {
#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+#define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \
+({ \
+ if (to_dmesg) \
+ printk(KERN_INFO fmt, ##args); \
+ else \
+ if (m) \
+ seq_printf(m, fmt, ##args); \
+})
+
+#define pt_dump_cont_printf(m, to_dmesg, fmt, args...) \
+({ \
+ if (to_dmesg) \
+ printk(KERN_CONT fmt, ##args); \
+ else \
+ if (m) \
+ seq_printf(m, fmt, ##args); \
+})
+
/*
* Print a readable form of a pgprot_t to the seq_file
*/
-static void printk_prot(struct seq_file *m, pgprot_t prot, int level)
+static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
{
pgprotval_t pr = pgprot_val(prot);
static const char * const level_name[] =
@@ -99,47 +118,47 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level)
if (!pgprot_val(prot)) {
/* Not present */
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
} else {
if (pr & _PAGE_USER)
- seq_printf(m, "USR ");
+ pt_dump_cont_printf(m, dmsg, "USR ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
if (pr & _PAGE_RW)
- seq_printf(m, "RW ");
+ pt_dump_cont_printf(m, dmsg, "RW ");
else
- seq_printf(m, "ro ");
+ pt_dump_cont_printf(m, dmsg, "ro ");
if (pr & _PAGE_PWT)
- seq_printf(m, "PWT ");
+ pt_dump_cont_printf(m, dmsg, "PWT ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
if (pr & _PAGE_PCD)
- seq_printf(m, "PCD ");
+ pt_dump_cont_printf(m, dmsg, "PCD ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
/* Bit 9 has a different meaning on level 3 vs 4 */
if (level <= 3) {
if (pr & _PAGE_PSE)
- seq_printf(m, "PSE ");
+ pt_dump_cont_printf(m, dmsg, "PSE ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
} else {
if (pr & _PAGE_PAT)
- seq_printf(m, "pat ");
+ pt_dump_cont_printf(m, dmsg, "pat ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
}
if (pr & _PAGE_GLOBAL)
- seq_printf(m, "GLB ");
+ pt_dump_cont_printf(m, dmsg, "GLB ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
if (pr & _PAGE_NX)
- seq_printf(m, "NX ");
+ pt_dump_cont_printf(m, dmsg, "NX ");
else
- seq_printf(m, "x ");
+ pt_dump_cont_printf(m, dmsg, "x ");
}
- seq_printf(m, "%s\n", level_name[level]);
+ pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
}
/*
@@ -178,7 +197,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
st->current_prot = new_prot;
st->level = level;
st->marker = address_markers;
- seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
+ st->marker->name);
} else if (prot != cur || level != st->level ||
st->current_address >= st->marker[1].start_address) {
const char *unit = units;
@@ -188,17 +208,17 @@ static void note_page(struct seq_file *m, struct pg_state *st,
/*
* Now print the actual finished series
*/
- seq_printf(m, "0x%0*lx-0x%0*lx ",
- width, st->start_address,
- width, st->current_address);
+ pt_dump_seq_printf(m, st->to_dmesg, "0x%0*lx-0x%0*lx ",
+ width, st->start_address,
+ width, st->current_address);
delta = (st->current_address - st->start_address) >> 10;
while (!(delta & 1023) && unit[1]) {
delta >>= 10;
unit++;
}
- seq_printf(m, "%9lu%c ", delta, *unit);
- printk_prot(m, st->current_prot, st->level);
+ pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ", delta, *unit);
+ printk_prot(m, st->current_prot, st->level, st->to_dmesg);
/*
* We print markers for special areas of address space,
@@ -207,7 +227,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
*/
if (st->current_address >= st->marker[1].start_address) {
st->marker++;
- seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
+ st->marker->name);
}
st->start_address = st->current_address;
@@ -296,7 +317,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
#define pgd_none(a) pud_none(__pud(pgd_val(a)))
#endif
-static void walk_pgd_level(struct seq_file *m)
+void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
{
#ifdef CONFIG_X86_64
pgd_t *start = (pgd_t *) &init_level4_pgt;
@@ -304,9 +325,12 @@ static void walk_pgd_level(struct seq_file *m)
pgd_t *start = swapper_pg_dir;
#endif
int i;
- struct pg_state st;
+ struct pg_state st = {};
- memset(&st, 0, sizeof(st));
+ if (pgd) {
+ start = pgd;
+ st.to_dmesg = true;
+ }
for (i = 0; i < PTRS_PER_PGD; i++) {
st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
@@ -331,7 +355,7 @@ static void walk_pgd_level(struct seq_file *m)
static int ptdump_show(struct seq_file *m, void *v)
{
- walk_pgd_level(m);
+ ptdump_walk_pgd_level(m, NULL);
return 0;
}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a10c8c79216..8e572299267 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -584,8 +584,13 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
if (error_code & PF_INSTR) {
unsigned int level;
+ pgd_t *pgd;
+ pte_t *pte;
- pte_t *pte = lookup_address(address, &level);
+ pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+ pgd += pgd_index(address);
+
+ pte = lookup_address_in_pgd(pgd, address, &level);
if (pte && pte_present(*pte) && !pte_exec(*pte))
printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index b3b19f46c01..ae242a7c11c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -126,8 +126,8 @@ within(unsigned long addr, unsigned long start, unsigned long end)
* @vaddr: virtual start address
* @size: number of bytes to flush
*
- * clflush is an unordered instruction which needs fencing with mfence
- * to avoid ordering issues.
+ * clflushopt is an unordered instruction which needs fencing with mfence or
+ * sfence to avoid ordering issues.
*/
void clflush_cache_range(void *vaddr, unsigned int size)
{
@@ -136,11 +136,11 @@ void clflush_cache_range(void *vaddr, unsigned int size)
mb();
for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
- clflush(vaddr);
+ clflushopt(vaddr);
/*
* Flush any possible final partial cacheline:
*/
- clflush(vend);
+ clflushopt(vend);
mb();
}
@@ -323,8 +323,12 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
return prot;
}
-static pte_t *__lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
- unsigned int *level)
+/*
+ * Lookup the page table entry for a virtual address in a specific pgd.
+ * Return a pointer to the entry and the level of the mapping.
+ */
+pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
+ unsigned int *level)
{
pud_t *pud;
pmd_t *pmd;
@@ -365,7 +369,7 @@ static pte_t *__lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
*/
pte_t *lookup_address(unsigned long address, unsigned int *level)
{
- return __lookup_address_in_pgd(pgd_offset_k(address), address, level);
+ return lookup_address_in_pgd(pgd_offset_k(address), address, level);
}
EXPORT_SYMBOL_GPL(lookup_address);
@@ -373,7 +377,7 @@ static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
unsigned int *level)
{
if (cpa->pgd)
- return __lookup_address_in_pgd(cpa->pgd + pgd_index(address),
+ return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
address, level);
return lookup_address(address, level);
@@ -692,6 +696,18 @@ static bool try_to_free_pmd_page(pmd_t *pmd)
return true;
}
+static bool try_to_free_pud_page(pud_t *pud)
+{
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++)
+ if (!pud_none(pud[i]))
+ return false;
+
+ free_page((unsigned long)pud);
+ return true;
+}
+
static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
{
pte_t *pte = pte_offset_kernel(pmd, start);
@@ -805,6 +821,16 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
*/
}
+static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
+{
+ pgd_t *pgd_entry = root + pgd_index(addr);
+
+ unmap_pud_range(pgd_entry, addr, end);
+
+ if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry)))
+ pgd_clear(pgd_entry);
+}
+
static int alloc_pte_page(pmd_t *pmd)
{
pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
@@ -999,9 +1025,8 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
{
pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
- bool allocd_pgd = false;
- pgd_t *pgd_entry;
pud_t *pud = NULL; /* shut up gcc */
+ pgd_t *pgd_entry;
int ret;
pgd_entry = cpa->pgd + pgd_index(addr);
@@ -1015,7 +1040,6 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
return -1;
set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
- allocd_pgd = true;
}
pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
@@ -1023,19 +1047,11 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
ret = populate_pud(cpa, addr, pgd_entry, pgprot);
if (ret < 0) {
- unmap_pud_range(pgd_entry, addr,
+ unmap_pgd_range(cpa->pgd, addr,
addr + (cpa->numpages << PAGE_SHIFT));
-
- if (allocd_pgd) {
- /*
- * If I allocated this PUD page, I can just as well
- * free it in this error path.
- */
- pgd_clear(pgd_entry);
- free_page((unsigned long)pud);
- }
return ret;
}
+
cpa->numpages = ret;
return 0;
}
@@ -1377,10 +1393,10 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
cache = cache_attr(mask_set);
/*
- * On success we use clflush, when the CPU supports it to
- * avoid the wbindv. If the CPU does not support it and in the
+ * On success we use CLFLUSH, when the CPU supports it to
+ * avoid the WBINVD. If the CPU does not support it and in the
* error case we fall back to cpa_flush_all (which uses
- * wbindv):
+ * WBINVD):
*/
if (!ret && cpu_has_clflush) {
if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
@@ -1861,6 +1877,12 @@ out:
return retval;
}
+void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address,
+ unsigned numpages)
+{
+ unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT));
+}
+
/*
* The testcases use internal knowledge of the implementation that shouldn't
* be exposed to the rest of the kernel. Include these directly here.
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile
index b7b0b35c198..d51045afcaa 100644
--- a/arch/x86/platform/efi/Makefile
+++ b/arch/x86/platform/efi/Makefile
@@ -1,3 +1,4 @@
obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o
obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o
+obj-$(CONFIG_EFI_MIXED) += efi_thunk_$(BITS).o
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index b97acecf3fd..3781dd39e8b 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -68,9 +68,7 @@ struct efi_memory_map memmap;
static struct efi efi_phys __initdata;
static efi_system_table_t efi_systab __initdata;
-unsigned long x86_efi_facility;
-
-static __initdata efi_config_table_type_t arch_tables[] = {
+static efi_config_table_type_t arch_tables[] __initdata = {
#ifdef CONFIG_X86_UV
{UV_SYSTEM_TABLE_GUID, "UVsystab", &efi.uv_systab},
#endif
@@ -79,16 +77,7 @@ static __initdata efi_config_table_type_t arch_tables[] = {
u64 efi_setup; /* efi setup_data physical address */
-/*
- * Returns 1 if 'facility' is enabled, 0 otherwise.
- */
-int efi_enabled(int facility)
-{
- return test_bit(facility, &x86_efi_facility) != 0;
-}
-EXPORT_SYMBOL(efi_enabled);
-
-static bool __initdata disable_runtime = false;
+static bool disable_runtime __initdata = false;
static int __init setup_noefi(char *arg)
{
disable_runtime = true;
@@ -257,27 +246,12 @@ static efi_status_t __init phys_efi_set_virtual_address_map(
return status;
}
-static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
- efi_time_cap_t *tc)
-{
- unsigned long flags;
- efi_status_t status;
-
- spin_lock_irqsave(&rtc_lock, flags);
- efi_call_phys_prelog();
- status = efi_call_phys2(efi_phys.get_time, virt_to_phys(tm),
- virt_to_phys(tc));
- efi_call_phys_epilog();
- spin_unlock_irqrestore(&rtc_lock, flags);
- return status;
-}
-
int efi_set_rtc_mmss(const struct timespec *now)
{
unsigned long nowtime = now->tv_sec;
- efi_status_t status;
- efi_time_t eft;
- efi_time_cap_t cap;
+ efi_status_t status;
+ efi_time_t eft;
+ efi_time_cap_t cap;
struct rtc_time tm;
status = efi.get_time(&eft, &cap);
@@ -295,9 +269,8 @@ int efi_set_rtc_mmss(const struct timespec *now)
eft.second = tm.tm_sec;
eft.nanosecond = 0;
} else {
- printk(KERN_ERR
- "%s: Invalid EFI RTC value: write of %lx to EFI RTC failed\n",
- __FUNCTION__, nowtime);
+ pr_err("%s: Invalid EFI RTC value: write of %lx to EFI RTC failed\n",
+ __func__, nowtime);
return -1;
}
@@ -413,8 +386,7 @@ static void __init print_efi_memmap(void)
p < memmap.map_end;
p += memmap.desc_size, i++) {
md = p;
- pr_info("mem%02u: type=%u, attr=0x%llx, "
- "range=[0x%016llx-0x%016llx) (%lluMB)\n",
+ pr_info("mem%02u: type=%u, attr=0x%llx, range=[0x%016llx-0x%016llx) (%lluMB)\n",
i, md->type, md->attribute, md->phys_addr,
md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
(md->num_pages >> (20 - EFI_PAGE_SHIFT)));
@@ -446,9 +418,8 @@ void __init efi_reserve_boot_services(void)
memblock_is_region_reserved(start, size)) {
/* Could not reserve, skip it */
md->num_pages = 0;
- memblock_dbg("Could not reserve boot range "
- "[0x%010llx-0x%010llx]\n",
- start, start+size-1);
+ memblock_dbg("Could not reserve boot range [0x%010llx-0x%010llx]\n",
+ start, start+size-1);
} else
memblock_reserve(start, size);
}
@@ -456,7 +427,7 @@ void __init efi_reserve_boot_services(void)
void __init efi_unmap_memmap(void)
{
- clear_bit(EFI_MEMMAP, &x86_efi_facility);
+ clear_bit(EFI_MEMMAP, &efi.flags);
if (memmap.map) {
early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
memmap.map = NULL;
@@ -467,9 +438,6 @@ void __init efi_free_boot_services(void)
{
void *p;
- if (!efi_is_native())
- return;
-
for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
efi_memory_desc_t *md = p;
unsigned long long start = md->phys_addr;
@@ -584,45 +552,82 @@ static int __init efi_systab_init(void *phys)
return -EINVAL;
}
if ((efi.systab->hdr.revision >> 16) == 0)
- pr_err("Warning: System table version "
- "%d.%02d, expected 1.00 or greater!\n",
+ pr_err("Warning: System table version %d.%02d, expected 1.00 or greater!\n",
efi.systab->hdr.revision >> 16,
efi.systab->hdr.revision & 0xffff);
+ set_bit(EFI_SYSTEM_TABLES, &efi.flags);
+
return 0;
}
-static int __init efi_runtime_init(void)
+static int __init efi_runtime_init32(void)
{
- efi_runtime_services_t *runtime;
+ efi_runtime_services_32_t *runtime;
+
+ runtime = early_ioremap((unsigned long)efi.systab->runtime,
+ sizeof(efi_runtime_services_32_t));
+ if (!runtime) {
+ pr_err("Could not map the runtime service table!\n");
+ return -ENOMEM;
+ }
/*
- * Check out the runtime services table. We need to map
- * the runtime services table so that we can grab the physical
- * address of several of the EFI runtime functions, needed to
- * set the firmware into virtual mode.
+ * We will only need *early* access to the following two
+ * EFI runtime services before set_virtual_address_map
+ * is invoked.
*/
+ efi_phys.set_virtual_address_map =
+ (efi_set_virtual_address_map_t *)
+ (unsigned long)runtime->set_virtual_address_map;
+ early_iounmap(runtime, sizeof(efi_runtime_services_32_t));
+
+ return 0;
+}
+
+static int __init efi_runtime_init64(void)
+{
+ efi_runtime_services_64_t *runtime;
+
runtime = early_ioremap((unsigned long)efi.systab->runtime,
- sizeof(efi_runtime_services_t));
+ sizeof(efi_runtime_services_64_t));
if (!runtime) {
pr_err("Could not map the runtime service table!\n");
return -ENOMEM;
}
+
/*
- * We will only need *early* access to the following
- * two EFI runtime services before set_virtual_address_map
+ * We will only need *early* access to the following two
+ * EFI runtime services before set_virtual_address_map
* is invoked.
*/
- efi_phys.get_time = (efi_get_time_t *)runtime->get_time;
efi_phys.set_virtual_address_map =
- (efi_set_virtual_address_map_t *)
- runtime->set_virtual_address_map;
+ (efi_set_virtual_address_map_t *)
+ (unsigned long)runtime->set_virtual_address_map;
+ early_iounmap(runtime, sizeof(efi_runtime_services_64_t));
+
+ return 0;
+}
+
+static int __init efi_runtime_init(void)
+{
+ int rv;
+
/*
- * Make efi_get_time can be called before entering
- * virtual mode.
+ * Check out the runtime services table. We need to map
+ * the runtime services table so that we can grab the physical
+ * address of several of the EFI runtime functions, needed to
+ * set the firmware into virtual mode.
*/
- efi.get_time = phys_efi_get_time;
- early_iounmap(runtime, sizeof(efi_runtime_services_t));
+ if (efi_enabled(EFI_64BIT))
+ rv = efi_runtime_init64();
+ else
+ rv = efi_runtime_init32();
+
+ if (rv)
+ return rv;
+
+ set_bit(EFI_RUNTIME_SERVICES, &efi.flags);
return 0;
}
@@ -641,6 +646,8 @@ static int __init efi_memmap_init(void)
if (add_efi_memmap)
do_add_efi_memmap();
+ set_bit(EFI_MEMMAP, &efi.flags);
+
return 0;
}
@@ -723,7 +730,7 @@ void __init efi_init(void)
if (efi_systab_init(efi_phys.systab))
return;
- set_bit(EFI_SYSTEM_TABLES, &x86_efi_facility);
+ set_bit(EFI_SYSTEM_TABLES, &efi.flags);
efi.config_table = (unsigned long)efi.systab->tables;
efi.fw_vendor = (unsigned long)efi.systab->fw_vendor;
@@ -751,24 +758,21 @@ void __init efi_init(void)
if (efi_config_init(arch_tables))
return;
- set_bit(EFI_CONFIG_TABLES, &x86_efi_facility);
-
/*
* Note: We currently don't support runtime services on an EFI
* that doesn't match the kernel 32/64-bit mode.
*/
- if (!efi_is_native())
+ if (!efi_runtime_supported())
pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n");
else {
if (disable_runtime || efi_runtime_init())
return;
- set_bit(EFI_RUNTIME_SERVICES, &x86_efi_facility);
}
if (efi_memmap_init())
return;
- set_bit(EFI_MEMMAP, &x86_efi_facility);
+ set_bit(EFI_MEMMAP, &efi.flags);
print_efi_memmap();
}
@@ -845,6 +849,22 @@ void __init old_map_region(efi_memory_desc_t *md)
(unsigned long long)md->phys_addr);
}
+static void native_runtime_setup(void)
+{
+ efi.get_time = virt_efi_get_time;
+ efi.set_time = virt_efi_set_time;
+ efi.get_wakeup_time = virt_efi_get_wakeup_time;
+ efi.set_wakeup_time = virt_efi_set_wakeup_time;
+ efi.get_variable = virt_efi_get_variable;
+ efi.get_next_variable = virt_efi_get_next_variable;
+ efi.set_variable = virt_efi_set_variable;
+ efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
+ efi.reset_system = virt_efi_reset_system;
+ efi.query_variable_info = virt_efi_query_variable_info;
+ efi.update_capsule = virt_efi_update_capsule;
+ efi.query_capsule_caps = virt_efi_query_capsule_caps;
+}
+
/* Merge contiguous regions of the same type and attribute */
static void __init efi_merge_regions(void)
{
@@ -892,8 +912,9 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md)
}
}
-static int __init save_runtime_map(void)
+static void __init save_runtime_map(void)
{
+#ifdef CONFIG_KEXEC
efi_memory_desc_t *md;
void *tmp, *p, *q = NULL;
int count = 0;
@@ -915,38 +936,44 @@ static int __init save_runtime_map(void)
}
efi_runtime_map_setup(q, count, memmap.desc_size);
+ return;
- return 0;
out:
kfree(q);
- return -ENOMEM;
+ pr_err("Error saving runtime map, efi runtime on kexec non-functional!!\n");
+#endif
}
-/*
- * Map efi regions which were passed via setup_data. The virt_addr is a fixed
- * addr which was used in first kernel of a kexec boot.
- */
-static void __init efi_map_regions_fixed(void)
+static void *realloc_pages(void *old_memmap, int old_shift)
{
- void *p;
- efi_memory_desc_t *md;
+ void *ret;
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- md = p;
- efi_map_region_fixed(md); /* FIXME: add error handling */
- get_systab_virt_addr(md);
- }
+ ret = (void *)__get_free_pages(GFP_KERNEL, old_shift + 1);
+ if (!ret)
+ goto out;
+
+ /*
+ * A first-time allocation doesn't have anything to copy.
+ */
+ if (!old_memmap)
+ return ret;
+ memcpy(ret, old_memmap, PAGE_SIZE << old_shift);
+
+out:
+ free_pages((unsigned long)old_memmap, old_shift);
+ return ret;
}
/*
- * Map efi memory ranges for runtime serivce and update new_memmap with virtual
- * addresses.
+ * Map the efi memory ranges of the runtime services and update new_mmap with
+ * virtual addresses.
*/
-static void * __init efi_map_regions(int *count)
+static void * __init efi_map_regions(int *count, int *pg_shift)
{
+ void *p, *new_memmap = NULL;
+ unsigned long left = 0;
efi_memory_desc_t *md;
- void *p, *tmp, *new_memmap = NULL;
for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
md = p;
@@ -961,20 +988,80 @@ static void * __init efi_map_regions(int *count)
efi_map_region(md);
get_systab_virt_addr(md);
- tmp = krealloc(new_memmap, (*count + 1) * memmap.desc_size,
- GFP_KERNEL);
- if (!tmp)
- goto out;
- new_memmap = tmp;
+ if (left < memmap.desc_size) {
+ new_memmap = realloc_pages(new_memmap, *pg_shift);
+ if (!new_memmap)
+ return NULL;
+
+ left += PAGE_SIZE << *pg_shift;
+ (*pg_shift)++;
+ }
+
memcpy(new_memmap + (*count * memmap.desc_size), md,
memmap.desc_size);
+
+ left -= memmap.desc_size;
(*count)++;
}
return new_memmap;
-out:
- kfree(new_memmap);
- return NULL;
+}
+
+static void __init kexec_enter_virtual_mode(void)
+{
+#ifdef CONFIG_KEXEC
+ efi_memory_desc_t *md;
+ void *p;
+
+ efi.systab = NULL;
+
+ /*
+ * We don't do virtual mode, since we don't do runtime services, on
+ * non-native EFI
+ */
+ if (!efi_is_native()) {
+ efi_unmap_memmap();
+ return;
+ }
+
+ /*
+ * Map efi regions which were passed via setup_data. The virt_addr is a
+ * fixed addr which was used in first kernel of a kexec boot.
+ */
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ md = p;
+ efi_map_region_fixed(md); /* FIXME: add error handling */
+ get_systab_virt_addr(md);
+ }
+
+ save_runtime_map();
+
+ BUG_ON(!efi.systab);
+
+ efi_sync_low_kernel_mappings();
+
+ /*
+ * Now that EFI is in virtual mode, update the function
+ * pointers in the runtime service table to the new virtual addresses.
+ *
+ * Call EFI services through wrapper functions.
+ */
+ efi.runtime_version = efi_systab.hdr.revision;
+
+ native_runtime_setup();
+
+ efi.set_virtual_address_map = NULL;
+
+ if (efi_enabled(EFI_OLD_MEMMAP) && (__supported_pte_mask & _PAGE_NX))
+ runtime_code_page_mkexec();
+
+ /* clean DUMMY object */
+ efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
+ EFI_VARIABLE_NON_VOLATILE |
+ EFI_VARIABLE_BOOTSERVICE_ACCESS |
+ EFI_VARIABLE_RUNTIME_ACCESS,
+ 0, NULL);
+#endif
}
/*
@@ -996,57 +1083,53 @@ out:
*
* Specially for kexec boot, efi runtime maps in previous kernel should
* be passed in via setup_data. In that case runtime ranges will be mapped
- * to the same virtual addresses as the first kernel.
+ * to the same virtual addresses as the first kernel, see
+ * kexec_enter_virtual_mode().
*/
-void __init efi_enter_virtual_mode(void)
+static void __init __efi_enter_virtual_mode(void)
{
- efi_status_t status;
+ int count = 0, pg_shift = 0;
void *new_memmap = NULL;
- int err, count = 0;
+ efi_status_t status;
efi.systab = NULL;
- /*
- * We don't do virtual mode, since we don't do runtime services, on
- * non-native EFI
- */
- if (!efi_is_native()) {
- efi_unmap_memmap();
+ efi_merge_regions();
+ new_memmap = efi_map_regions(&count, &pg_shift);
+ if (!new_memmap) {
+ pr_err("Error reallocating memory, EFI runtime non-functional!\n");
return;
}
- if (efi_setup) {
- efi_map_regions_fixed();
- } else {
- efi_merge_regions();
- new_memmap = efi_map_regions(&count);
- if (!new_memmap) {
- pr_err("Error reallocating memory, EFI runtime non-functional!\n");
- return;
- }
- }
-
- err = save_runtime_map();
- if (err)
- pr_err("Error saving runtime map, efi runtime on kexec non-functional!!\n");
+ save_runtime_map();
BUG_ON(!efi.systab);
- efi_setup_page_tables();
+ if (efi_setup_page_tables(__pa(new_memmap), 1 << pg_shift))
+ return;
+
efi_sync_low_kernel_mappings();
+ efi_dump_pagetable();
- if (!efi_setup) {
+ if (efi_is_native()) {
status = phys_efi_set_virtual_address_map(
- memmap.desc_size * count,
- memmap.desc_size,
- memmap.desc_version,
- (efi_memory_desc_t *)__pa(new_memmap));
-
- if (status != EFI_SUCCESS) {
- pr_alert("Unable to switch EFI into virtual mode (status=%lx)!\n",
- status);
- panic("EFI call to SetVirtualAddressMap() failed!");
- }
+ memmap.desc_size * count,
+ memmap.desc_size,
+ memmap.desc_version,
+ (efi_memory_desc_t *)__pa(new_memmap));
+ } else {
+ status = efi_thunk_set_virtual_address_map(
+ efi_phys.set_virtual_address_map,
+ memmap.desc_size * count,
+ memmap.desc_size,
+ memmap.desc_version,
+ (efi_memory_desc_t *)__pa(new_memmap));
+ }
+
+ if (status != EFI_SUCCESS) {
+ pr_alert("Unable to switch EFI into virtual mode (status=%lx)!\n",
+ status);
+ panic("EFI call to SetVirtualAddressMap() failed!");
}
/*
@@ -1056,23 +1139,43 @@ void __init efi_enter_virtual_mode(void)
* Call EFI services through wrapper functions.
*/
efi.runtime_version = efi_systab.hdr.revision;
- efi.get_time = virt_efi_get_time;
- efi.set_time = virt_efi_set_time;
- efi.get_wakeup_time = virt_efi_get_wakeup_time;
- efi.set_wakeup_time = virt_efi_set_wakeup_time;
- efi.get_variable = virt_efi_get_variable;
- efi.get_next_variable = virt_efi_get_next_variable;
- efi.set_variable = virt_efi_set_variable;
- efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
- efi.reset_system = virt_efi_reset_system;
+
+ if (efi_is_native())
+ native_runtime_setup();
+ else
+ efi_thunk_runtime_setup();
+
efi.set_virtual_address_map = NULL;
- efi.query_variable_info = virt_efi_query_variable_info;
- efi.update_capsule = virt_efi_update_capsule;
- efi.query_capsule_caps = virt_efi_query_capsule_caps;
efi_runtime_mkexec();
- kfree(new_memmap);
+ /*
+ * We mapped the descriptor array into the EFI pagetable above but we're
+ * not unmapping it here. Here's why:
+ *
+ * We're copying select PGDs from the kernel page table to the EFI page
+ * table and when we do so and make changes to those PGDs like unmapping
+ * stuff from them, those changes appear in the kernel page table and we
+ * go boom.
+ *
+ * From setup_real_mode():
+ *
+ * ...
+ * trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd;
+ *
+ * In this particular case, our allocation is in PGD 0 of the EFI page
+ * table but we've copied that PGD from PGD[272] of the EFI page table:
+ *
+ * pgd_index(__PAGE_OFFSET = 0xffff880000000000) = 272
+ *
+ * where the direct memory mapping in kernel space is.
+ *
+ * new_memmap's VA comes from that direct mapping and thus clearing it,
+ * it would get cleared in the kernel page table too.
+ *
+ * efi_cleanup_page_tables(__pa(new_memmap), 1 << pg_shift);
+ */
+ free_pages((unsigned long)new_memmap, pg_shift);
/* clean DUMMY object */
efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
@@ -1082,6 +1185,14 @@ void __init efi_enter_virtual_mode(void)
0, NULL);
}
+void __init efi_enter_virtual_mode(void)
+{
+ if (efi_setup)
+ kexec_enter_virtual_mode();
+ else
+ __efi_enter_virtual_mode();
+}
+
/*
* Convenience functions to obtain memory types and attributes
*/
@@ -1119,9 +1230,8 @@ u64 efi_mem_attributes(unsigned long phys_addr)
}
/*
- * Some firmware has serious problems when using more than 50% of the EFI
- * variable store, i.e. it triggers bugs that can brick machines. Ensure that
- * we never use more than this safe limit.
+ * Some firmware implementations refuse to boot if there's insufficient space
+ * in the variable store. Ensure that we never use more than a safe limit.
*
* Return EFI_SUCCESS if it is safe to write 'size' bytes to the variable
* store.
@@ -1140,10 +1250,9 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size)
return status;
/*
- * Some firmware implementations refuse to boot if there's insufficient
- * space in the variable store. We account for that by refusing the
- * write if permitting it would reduce the available space to under
- * 5KB. This figure was provided by Samsung, so should be safe.
+ * We account for that by refusing the write if permitting it would
+ * reduce the available space to under 5KB. This figure was provided by
+ * Samsung, so should be safe.
*/
if ((remaining_size - size < EFI_MIN_RESERVE) &&
!efi_no_storage_paranoia) {
@@ -1206,7 +1315,7 @@ static int __init parse_efi_cmdline(char *str)
str++;
if (!strncmp(str, "old_map", 7))
- set_bit(EFI_OLD_MEMMAP, &x86_efi_facility);
+ set_bit(EFI_OLD_MEMMAP, &efi.flags);
return 0;
}
@@ -1219,7 +1328,7 @@ void __init efi_apply_memmap_quirks(void)
* firmware/kernel architectures since there is no support for runtime
* services.
*/
- if (!efi_is_native()) {
+ if (!efi_runtime_supported()) {
pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n");
efi_unmap_memmap();
}
@@ -1228,5 +1337,5 @@ void __init efi_apply_memmap_quirks(void)
* UV doesn't support the new EFI pagetable mapping yet.
*/
if (is_uv_system())
- set_bit(EFI_OLD_MEMMAP, &x86_efi_facility);
+ set_bit(EFI_OLD_MEMMAP, &efi.flags);
}
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index 0b74cdf7f81..9ee3491e31f 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -40,7 +40,12 @@
static unsigned long efi_rt_eflags;
void efi_sync_low_kernel_mappings(void) {}
-void efi_setup_page_tables(void) {}
+void __init efi_dump_pagetable(void) {}
+int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
+{
+ return 0;
+}
+void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) {}
void __init efi_map_region(efi_memory_desc_t *md)
{
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 0c2a234fef1..290d397e1dd 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -39,6 +39,7 @@
#include <asm/cacheflush.h>
#include <asm/fixmap.h>
#include <asm/realmode.h>
+#include <asm/time.h>
static pgd_t *save_pgd __initdata;
static unsigned long efi_flags __initdata;
@@ -58,7 +59,8 @@ struct efi_scratch {
u64 prev_cr3;
pgd_t *efi_pgt;
bool use_pgd;
-};
+ u64 phys_stack;
+} __packed;
static void __init early_code_mapping_set_exec(int executable)
{
@@ -137,12 +139,64 @@ void efi_sync_low_kernel_mappings(void)
sizeof(pgd_t) * num_pgds);
}
-void efi_setup_page_tables(void)
+int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
{
+ unsigned long text;
+ struct page *page;
+ unsigned npages;
+ pgd_t *pgd;
+
+ if (efi_enabled(EFI_OLD_MEMMAP))
+ return 0;
+
efi_scratch.efi_pgt = (pgd_t *)(unsigned long)real_mode_header->trampoline_pgd;
+ pgd = __va(efi_scratch.efi_pgt);
- if (!efi_enabled(EFI_OLD_MEMMAP))
- efi_scratch.use_pgd = true;
+ /*
+ * It can happen that the physical address of new_memmap lands in memory
+ * which is not mapped in the EFI page table. Therefore we need to go
+ * and ident-map those pages containing the map before calling
+ * phys_efi_set_virtual_address_map().
+ */
+ if (kernel_map_pages_in_pgd(pgd, pa_memmap, pa_memmap, num_pages, _PAGE_NX)) {
+ pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap);
+ return 1;
+ }
+
+ efi_scratch.use_pgd = true;
+
+ /*
+ * When making calls to the firmware everything needs to be 1:1
+ * mapped and addressable with 32-bit pointers. Map the kernel
+ * text and allocate a new stack because we can't rely on the
+ * stack pointer being < 4GB.
+ */
+ if (!IS_ENABLED(CONFIG_EFI_MIXED))
+ return 0;
+
+ page = alloc_page(GFP_KERNEL|__GFP_DMA32);
+ if (!page)
+ panic("Unable to allocate EFI runtime stack < 4GB\n");
+
+ efi_scratch.phys_stack = virt_to_phys(page_address(page));
+ efi_scratch.phys_stack += PAGE_SIZE; /* stack grows down */
+
+ npages = (_end - _text) >> PAGE_SHIFT;
+ text = __pa(_text);
+
+ if (kernel_map_pages_in_pgd(pgd, text >> PAGE_SHIFT, text, npages, 0)) {
+ pr_err("Failed to map kernel text 1:1\n");
+ return 1;
+ }
+
+ return 0;
+}
+
+void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages)
+{
+ pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
+
+ kernel_unmap_pages_in_pgd(pgd, pa_memmap, num_pages);
}
static void __init __map_region(efi_memory_desc_t *md, u64 va)
@@ -173,6 +227,16 @@ void __init efi_map_region(efi_memory_desc_t *md)
*/
__map_region(md, md->phys_addr);
+ /*
+ * Enforce the 1:1 mapping as the default virtual address when
+ * booting in EFI mixed mode, because even though we may be
+ * running a 64-bit kernel, the firmware may only be 32-bit.
+ */
+ if (!efi_is_native () && IS_ENABLED(CONFIG_EFI_MIXED)) {
+ md->virt_addr = md->phys_addr;
+ return;
+ }
+
efi_va -= size;
/* Is PA 2M-aligned? */
@@ -242,3 +306,299 @@ void __init efi_runtime_mkexec(void)
if (__supported_pte_mask & _PAGE_NX)
runtime_code_page_mkexec();
}
+
+void __init efi_dump_pagetable(void)
+{
+#ifdef CONFIG_EFI_PGT_DUMP
+ pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
+
+ ptdump_walk_pgd_level(NULL, pgd);
+#endif
+}
+
+#ifdef CONFIG_EFI_MIXED
+extern efi_status_t efi64_thunk(u32, ...);
+
+#define runtime_service32(func) \
+({ \
+ u32 table = (u32)(unsigned long)efi.systab; \
+ u32 *rt, *___f; \
+ \
+ rt = (u32 *)(table + offsetof(efi_system_table_32_t, runtime)); \
+ ___f = (u32 *)(*rt + offsetof(efi_runtime_services_32_t, func)); \
+ *___f; \
+})
+
+/*
+ * Switch to the EFI page tables early so that we can access the 1:1
+ * runtime services mappings which are not mapped in any other page
+ * tables. This function must be called before runtime_service32().
+ *
+ * Also, disable interrupts because the IDT points to 64-bit handlers,
+ * which aren't going to function correctly when we switch to 32-bit.
+ */
+#define efi_thunk(f, ...) \
+({ \
+ efi_status_t __s; \
+ unsigned long flags; \
+ u32 func; \
+ \
+ efi_sync_low_kernel_mappings(); \
+ local_irq_save(flags); \
+ \
+ efi_scratch.prev_cr3 = read_cr3(); \
+ write_cr3((unsigned long)efi_scratch.efi_pgt); \
+ __flush_tlb_all(); \
+ \
+ func = runtime_service32(f); \
+ __s = efi64_thunk(func, __VA_ARGS__); \
+ \
+ write_cr3(efi_scratch.prev_cr3); \
+ __flush_tlb_all(); \
+ local_irq_restore(flags); \
+ \
+ __s; \
+})
+
+efi_status_t efi_thunk_set_virtual_address_map(
+ void *phys_set_virtual_address_map,
+ unsigned long memory_map_size,
+ unsigned long descriptor_size,
+ u32 descriptor_version,
+ efi_memory_desc_t *virtual_map)
+{
+ efi_status_t status;
+ unsigned long flags;
+ u32 func;
+
+ efi_sync_low_kernel_mappings();
+ local_irq_save(flags);
+
+ efi_scratch.prev_cr3 = read_cr3();
+ write_cr3((unsigned long)efi_scratch.efi_pgt);
+ __flush_tlb_all();
+
+ func = (u32)(unsigned long)phys_set_virtual_address_map;
+ status = efi64_thunk(func, memory_map_size, descriptor_size,
+ descriptor_version, virtual_map);
+
+ write_cr3(efi_scratch.prev_cr3);
+ __flush_tlb_all();
+ local_irq_restore(flags);
+
+ return status;
+}
+
+static efi_status_t efi_thunk_get_time(efi_time_t *tm, efi_time_cap_t *tc)
+{
+ efi_status_t status;
+ u32 phys_tm, phys_tc;
+
+ spin_lock(&rtc_lock);
+
+ phys_tm = virt_to_phys(tm);
+ phys_tc = virt_to_phys(tc);
+
+ status = efi_thunk(get_time, phys_tm, phys_tc);
+
+ spin_unlock(&rtc_lock);
+
+ return status;
+}
+
+static efi_status_t efi_thunk_set_time(efi_time_t *tm)
+{
+ efi_status_t status;
+ u32 phys_tm;
+
+ spin_lock(&rtc_lock);
+
+ phys_tm = virt_to_phys(tm);
+
+ status = efi_thunk(set_time, phys_tm);
+
+ spin_unlock(&rtc_lock);
+
+ return status;
+}
+
+static efi_status_t
+efi_thunk_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending,
+ efi_time_t *tm)
+{
+ efi_status_t status;
+ u32 phys_enabled, phys_pending, phys_tm;
+
+ spin_lock(&rtc_lock);
+
+ phys_enabled = virt_to_phys(enabled);
+ phys_pending = virt_to_phys(pending);
+ phys_tm = virt_to_phys(tm);
+
+ status = efi_thunk(get_wakeup_time, phys_enabled,
+ phys_pending, phys_tm);
+
+ spin_unlock(&rtc_lock);
+
+ return status;
+}
+
+static efi_status_t
+efi_thunk_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
+{
+ efi_status_t status;
+ u32 phys_tm;
+
+ spin_lock(&rtc_lock);
+
+ phys_tm = virt_to_phys(tm);
+
+ status = efi_thunk(set_wakeup_time, enabled, phys_tm);
+
+ spin_unlock(&rtc_lock);
+
+ return status;
+}
+
+
+static efi_status_t
+efi_thunk_get_variable(efi_char16_t *name, efi_guid_t *vendor,
+ u32 *attr, unsigned long *data_size, void *data)
+{
+ efi_status_t status;
+ u32 phys_name, phys_vendor, phys_attr;
+ u32 phys_data_size, phys_data;
+
+ phys_data_size = virt_to_phys(data_size);
+ phys_vendor = virt_to_phys(vendor);
+ phys_name = virt_to_phys(name);
+ phys_attr = virt_to_phys(attr);
+ phys_data = virt_to_phys(data);
+
+ status = efi_thunk(get_variable, phys_name, phys_vendor,
+ phys_attr, phys_data_size, phys_data);
+
+ return status;
+}
+
+static efi_status_t
+efi_thunk_set_variable(efi_char16_t *name, efi_guid_t *vendor,
+ u32 attr, unsigned long data_size, void *data)
+{
+ u32 phys_name, phys_vendor, phys_data;
+ efi_status_t status;
+
+ phys_name = virt_to_phys(name);
+ phys_vendor = virt_to_phys(vendor);
+ phys_data = virt_to_phys(data);
+
+ /* If data_size is > sizeof(u32) we've got problems */
+ status = efi_thunk(set_variable, phys_name, phys_vendor,
+ attr, data_size, phys_data);
+
+ return status;
+}
+
+static efi_status_t
+efi_thunk_get_next_variable(unsigned long *name_size,
+ efi_char16_t *name,
+ efi_guid_t *vendor)
+{
+ efi_status_t status;
+ u32 phys_name_size, phys_name, phys_vendor;
+
+ phys_name_size = virt_to_phys(name_size);
+ phys_vendor = virt_to_phys(vendor);
+ phys_name = virt_to_phys(name);
+
+ status = efi_thunk(get_next_variable, phys_name_size,
+ phys_name, phys_vendor);
+
+ return status;
+}
+
+static efi_status_t
+efi_thunk_get_next_high_mono_count(u32 *count)
+{
+ efi_status_t status;
+ u32 phys_count;
+
+ phys_count = virt_to_phys(count);
+ status = efi_thunk(get_next_high_mono_count, phys_count);
+
+ return status;
+}
+
+static void
+efi_thunk_reset_system(int reset_type, efi_status_t status,
+ unsigned long data_size, efi_char16_t *data)
+{
+ u32 phys_data;
+
+ phys_data = virt_to_phys(data);
+
+ efi_thunk(reset_system, reset_type, status, data_size, phys_data);
+}
+
+static efi_status_t
+efi_thunk_update_capsule(efi_capsule_header_t **capsules,
+ unsigned long count, unsigned long sg_list)
+{
+ /*
+ * To properly support this function we would need to repackage
+ * 'capsules' because the firmware doesn't understand 64-bit
+ * pointers.
+ */
+ return EFI_UNSUPPORTED;
+}
+
+static efi_status_t
+efi_thunk_query_variable_info(u32 attr, u64 *storage_space,
+ u64 *remaining_space,
+ u64 *max_variable_size)
+{
+ efi_status_t status;
+ u32 phys_storage, phys_remaining, phys_max;
+
+ if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+ return EFI_UNSUPPORTED;
+
+ phys_storage = virt_to_phys(storage_space);
+ phys_remaining = virt_to_phys(remaining_space);
+ phys_max = virt_to_phys(max_variable_size);
+
+ status = efi_thunk(query_variable_info, attr, phys_storage,
+ phys_remaining, phys_max);
+
+ return status;
+}
+
+static efi_status_t
+efi_thunk_query_capsule_caps(efi_capsule_header_t **capsules,
+ unsigned long count, u64 *max_size,
+ int *reset_type)
+{
+ /*
+ * To properly support this function we would need to repackage
+ * 'capsules' because the firmware doesn't understand 64-bit
+ * pointers.
+ */
+ return EFI_UNSUPPORTED;
+}
+
+void efi_thunk_runtime_setup(void)
+{
+ efi.get_time = efi_thunk_get_time;
+ efi.set_time = efi_thunk_set_time;
+ efi.get_wakeup_time = efi_thunk_get_wakeup_time;
+ efi.set_wakeup_time = efi_thunk_set_wakeup_time;
+ efi.get_variable = efi_thunk_get_variable;
+ efi.get_next_variable = efi_thunk_get_next_variable;
+ efi.set_variable = efi_thunk_set_variable;
+ efi.get_next_high_mono_count = efi_thunk_get_next_high_mono_count;
+ efi.reset_system = efi_thunk_reset_system;
+ efi.query_variable_info = efi_thunk_query_variable_info;
+ efi.update_capsule = efi_thunk_update_capsule;
+ efi.query_capsule_caps = efi_thunk_query_capsule_caps;
+}
+#endif /* CONFIG_EFI_MIXED */
diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S
index 88073b14029..e0984ef0374 100644
--- a/arch/x86/platform/efi/efi_stub_64.S
+++ b/arch/x86/platform/efi/efi_stub_64.S
@@ -7,6 +7,10 @@
*/
#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/msr.h>
+#include <asm/processor-flags.h>
+#include <asm/page_types.h>
#define SAVE_XMM \
mov %rsp, %rax; \
@@ -164,7 +168,169 @@ ENTRY(efi_call6)
ret
ENDPROC(efi_call6)
+#ifdef CONFIG_EFI_MIXED
+
+/*
+ * We run this function from the 1:1 mapping.
+ *
+ * This function must be invoked with a 1:1 mapped stack.
+ */
+ENTRY(__efi64_thunk)
+ movl %ds, %eax
+ push %rax
+ movl %es, %eax
+ push %rax
+ movl %ss, %eax
+ push %rax
+
+ subq $32, %rsp
+ movl %esi, 0x0(%rsp)
+ movl %edx, 0x4(%rsp)
+ movl %ecx, 0x8(%rsp)
+ movq %r8, %rsi
+ movl %esi, 0xc(%rsp)
+ movq %r9, %rsi
+ movl %esi, 0x10(%rsp)
+
+ sgdt save_gdt(%rip)
+
+ leaq 1f(%rip), %rbx
+ movq %rbx, func_rt_ptr(%rip)
+
+ /* Switch to gdt with 32-bit segments */
+ movl 64(%rsp), %eax
+ lgdt (%rax)
+
+ leaq efi_enter32(%rip), %rax
+ pushq $__KERNEL_CS
+ pushq %rax
+ lretq
+
+1: addq $32, %rsp
+
+ lgdt save_gdt(%rip)
+
+ pop %rbx
+ movl %ebx, %ss
+ pop %rbx
+ movl %ebx, %es
+ pop %rbx
+ movl %ebx, %ds
+
+ /*
+ * Convert 32-bit status code into 64-bit.
+ */
+ test %rax, %rax
+ jz 1f
+ movl %eax, %ecx
+ andl $0x0fffffff, %ecx
+ andl $0xf0000000, %eax
+ shl $32, %rax
+ or %rcx, %rax
+1:
+ ret
+ENDPROC(__efi64_thunk)
+
+ENTRY(efi_exit32)
+ movq func_rt_ptr(%rip), %rax
+ push %rax
+ mov %rdi, %rax
+ ret
+ENDPROC(efi_exit32)
+
+ .code32
+/*
+ * EFI service pointer must be in %edi.
+ *
+ * The stack should represent the 32-bit calling convention.
+ */
+ENTRY(efi_enter32)
+ movl $__KERNEL_DS, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %ss
+
+ /* Reload pgtables */
+ movl %cr3, %eax
+ movl %eax, %cr3
+
+ /* Disable paging */
+ movl %cr0, %eax
+ btrl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ /* Disable long mode via EFER */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btrl $_EFER_LME, %eax
+ wrmsr
+
+ call *%edi
+
+ /* We must preserve return value */
+ movl %eax, %edi
+
+ /*
+ * Some firmware will return with interrupts enabled. Be sure to
+ * disable them before we switch GDTs.
+ */
+ cli
+
+ movl 68(%esp), %eax
+ movl %eax, 2(%eax)
+ lgdtl (%eax)
+
+ movl %cr4, %eax
+ btsl $(X86_CR4_PAE_BIT), %eax
+ movl %eax, %cr4
+
+ movl %cr3, %eax
+ movl %eax, %cr3
+
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_LME, %eax
+ wrmsr
+
+ xorl %eax, %eax
+ lldt %ax
+
+ movl 72(%esp), %eax
+ pushl $__KERNEL_CS
+ pushl %eax
+
+ /* Enable paging */
+ movl %cr0, %eax
+ btsl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+ lret
+ENDPROC(efi_enter32)
+
+ .data
+ .balign 8
+ .global efi32_boot_gdt
+efi32_boot_gdt: .word 0
+ .quad 0
+
+save_gdt: .word 0
+ .quad 0
+func_rt_ptr: .quad 0
+
+ .global efi_gdt64
+efi_gdt64:
+ .word efi_gdt64_end - efi_gdt64
+ .long 0 /* Filled out by user */
+ .word 0
+ .quad 0x0000000000000000 /* NULL descriptor */
+ .quad 0x00af9a000000ffff /* __KERNEL_CS */
+ .quad 0x00cf92000000ffff /* __KERNEL_DS */
+ .quad 0x0080890000000000 /* TS descriptor */
+ .quad 0x0000000000000000 /* TS continued */
+efi_gdt64_end:
+#endif /* CONFIG_EFI_MIXED */
+
.data
ENTRY(efi_scratch)
.fill 3,8,0
.byte 0
+ .quad 0
diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S
new file mode 100644
index 00000000000..8806fa73e6e
--- /dev/null
+++ b/arch/x86/platform/efi/efi_thunk_64.S
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2014 Intel Corporation; author Matt Fleming
+ */
+
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+
+ .text
+ .code64
+ENTRY(efi64_thunk)
+ push %rbp
+ push %rbx
+
+ /*
+ * Switch to 1:1 mapped 32-bit stack pointer.
+ */
+ movq %rsp, efi_saved_sp(%rip)
+ movq efi_scratch+25(%rip), %rsp
+
+ /*
+ * Calculate the physical address of the kernel text.
+ */
+ movq $__START_KERNEL_map, %rax
+ subq phys_base(%rip), %rax
+
+ /*
+ * Push some physical addresses onto the stack. This is easier
+ * to do now in a code64 section while the assembler can address
+ * 64-bit values. Note that all the addresses on the stack are
+ * 32-bit.
+ */
+ subq $16, %rsp
+ leaq efi_exit32(%rip), %rbx
+ subq %rax, %rbx
+ movl %ebx, 8(%rsp)
+ leaq efi_gdt64(%rip), %rbx
+ subq %rax, %rbx
+ movl %ebx, 2(%ebx)
+ movl %ebx, 4(%rsp)
+ leaq efi_gdt32(%rip), %rbx
+ subq %rax, %rbx
+ movl %ebx, 2(%ebx)
+ movl %ebx, (%rsp)
+
+ leaq __efi64_thunk(%rip), %rbx
+ subq %rax, %rbx
+ call *%rbx
+
+ movq efi_saved_sp(%rip), %rsp
+ pop %rbx
+ pop %rbp
+ retq
+ENDPROC(efi64_thunk)
+
+ .data
+efi_gdt32:
+ .word efi_gdt32_end - efi_gdt32
+ .long 0 /* Filled out above */
+ .word 0
+ .quad 0x0000000000000000 /* NULL descriptor */
+ .quad 0x00cf9a000000ffff /* __KERNEL_CS */
+ .quad 0x00cf93000000ffff /* __KERNEL_DS */
+efi_gdt32_end:
+
+efi_saved_sp: .quad 0
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 256282e7888..2423ef04ffe 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -365,7 +365,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
/* Assume pteval_t is equivalent to all the other *val_t types. */
static pteval_t pte_mfn_to_pfn(pteval_t val)
{
- if (pteval_present(val)) {
+ if (val & _PAGE_PRESENT) {
unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
unsigned long pfn = mfn_to_pfn(mfn);
@@ -381,7 +381,7 @@ static pteval_t pte_mfn_to_pfn(pteval_t val)
static pteval_t pte_pfn_to_mfn(pteval_t val)
{
- if (pteval_present(val)) {
+ if (val & _PAGE_PRESENT) {
unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
pteval_t flags = val & PTE_FLAGS_MASK;
unsigned long mfn;