diff options
Diffstat (limited to 'arch/x86')
336 files changed, 16929 insertions, 9231 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild new file mode 100644 index 00000000000..ad8ec356fb3 --- /dev/null +++ b/arch/x86/Kbuild @@ -0,0 +1,16 @@ + +obj-$(CONFIG_KVM) += kvm/ + +# Xen paravirtualization support +obj-$(CONFIG_XEN) += xen/ + +# lguest paravirtualization support +obj-$(CONFIG_LGUEST_GUEST) += lguest/ + +obj-y += kernel/ +obj-y += mm/ + +obj-y += crypto/ +obj-y += vdso/ +obj-$(CONFIG_IA32_EMULATION) += ia32/ + diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a6efe0a2e9a..13ffa5df37d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,15 +24,18 @@ config X86 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE + select HAVE_PERF_COUNTERS if (!M386 && !M486) select HAVE_IOREMAP_PROT select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB select ARCH_WANT_FRAME_POINTERS + select HAVE_DMA_ATTRS select HAVE_KRETPROBES select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER + select HAVE_FUNCTION_GRAPH_FP_TEST select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE select HAVE_FTRACE_SYSCALLS @@ -46,6 +49,12 @@ config X86 select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA + select HAVE_ARCH_KMEMCHECK + +config OUTPUT_FORMAT + string + default "elf32-i386" if X86_32 + default "elf64-x86-64" if X86_64 config ARCH_DEFCONFIG string @@ -274,15 +283,9 @@ config SPARSE_IRQ If you don't know what to do here, say N. -config NUMA_MIGRATE_IRQ_DESC - bool "Move irq desc when changing irq smp_affinity" +config NUMA_IRQ_DESC + def_bool y depends on SPARSE_IRQ && NUMA - depends on BROKEN - default n - ---help--- - This enables moving irq_desc to cpu/node that irq will use handled. - - If you don't know what to do here, say N. config X86_MPPARSE bool "Enable MPS table" if ACPI @@ -355,7 +358,7 @@ config X86_UV depends on X86_64 depends on X86_EXTENDED_PLATFORM depends on NUMA - select X86_X2APIC + depends on X86_X2APIC ---help--- This option is needed in order to support SGI Ultraviolet systems. If you don't have one of these, you should say N here. @@ -789,10 +792,26 @@ config X86_MCE to disable it. MCE support simply ignores non-MCE processors like the 386 and 486, so nearly everyone can say Y here. +config X86_OLD_MCE + depends on X86_32 && X86_MCE + bool "Use legacy machine check code (will go away)" + default n + select X86_ANCIENT_MCE + ---help--- + Use the old i386 machine check code. This is merely intended for + testing in a transition period. Try this if you run into any machine + check related software problems, but report the problem to + linux-kernel. When in doubt say no. + +config X86_NEW_MCE + depends on X86_MCE + bool + default y if (!X86_OLD_MCE && X86_32) || X86_64 + config X86_MCE_INTEL def_bool y prompt "Intel MCE features" - depends on X86_64 && X86_MCE && X86_LOCAL_APIC + depends on X86_NEW_MCE && X86_LOCAL_APIC ---help--- Additional support for intel specific MCE features such as the thermal monitor. @@ -800,19 +819,36 @@ config X86_MCE_INTEL config X86_MCE_AMD def_bool y prompt "AMD MCE features" - depends on X86_64 && X86_MCE && X86_LOCAL_APIC + depends on X86_NEW_MCE && X86_LOCAL_APIC ---help--- Additional support for AMD specific MCE features such as the DRAM Error Threshold. +config X86_ANCIENT_MCE + def_bool n + depends on X86_32 + prompt "Support for old Pentium 5 / WinChip machine checks" + ---help--- + Include support for machine check handling on old Pentium 5 or WinChip + systems. These typically need to be enabled explicitely on the command + line. + config X86_MCE_THRESHOLD depends on X86_MCE_AMD || X86_MCE_INTEL bool default y +config X86_MCE_INJECT + depends on X86_NEW_MCE + tristate "Machine check injector support" + ---help--- + Provide support for injecting machine checks for testing purposes. + If you don't know what a machine check is and you don't do kernel + QA it is safe to say n. + config X86_MCE_NONFATAL tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" - depends on X86_32 && X86_MCE + depends on X86_OLD_MCE ---help--- Enabling this feature starts a timer that triggers every 5 seconds which will look at the machine check registers to see if anything happened. @@ -825,11 +861,15 @@ config X86_MCE_NONFATAL config X86_MCE_P4THERMAL bool "check for P4 thermal throttling interrupt." - depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP) + depends on X86_OLD_MCE && X86_MCE && (X86_UP_APIC || SMP) ---help--- Enabling this feature will cause a message to be printed when the P4 enters thermal throttling. +config X86_THERMAL_VECTOR + def_bool y + depends on X86_MCE_P4THERMAL || X86_MCE_INTEL + config VM86 bool "Enable VM86 support" if EMBEDDED default y @@ -1466,9 +1506,7 @@ config KEXEC_JUMP config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) - default "0x1000000" if X86_NUMAQ - default "0x200000" if X86_64 - default "0x100000" + default "0x1000000" ---help--- This gives the physical address where the kernel is loaded. @@ -1487,15 +1525,15 @@ config PHYSICAL_START to be specifically compiled to run from a specific memory area (normally a reserved region) and this option comes handy. - So if you are using bzImage for capturing the crash dump, leave - the value here unchanged to 0x100000 and set CONFIG_RELOCATABLE=y. - Otherwise if you plan to use vmlinux for capturing the crash dump - change this value to start of the reserved region (Typically 16MB - 0x1000000). In other words, it can be set based on the "X" value as - specified in the "crashkernel=YM@XM" command line boot parameter - passed to the panic-ed kernel. Typically this parameter is set as - crashkernel=64M@16M. Please take a look at - Documentation/kdump/kdump.txt for more details about crash dumps. + So if you are using bzImage for capturing the crash dump, + leave the value here unchanged to 0x1000000 and set + CONFIG_RELOCATABLE=y. Otherwise if you plan to use vmlinux + for capturing the crash dump change this value to start of + the reserved region. In other words, it can be set based on + the "X" value as specified in the "crashkernel=YM@XM" + command line boot parameter passed to the panic-ed + kernel. Please take a look at Documentation/kdump/kdump.txt + for more details about crash dumps. Usage of bzImage for capturing the crash dump is recommended as one does not have to build two kernels. Same kernel can be used @@ -1508,8 +1546,8 @@ config PHYSICAL_START Don't change this unless you know what you are doing. config RELOCATABLE - bool "Build a relocatable kernel (EXPERIMENTAL)" - depends on EXPERIMENTAL + bool "Build a relocatable kernel" + default y ---help--- This builds a kernel image that retains relocation information so it can be loaded someplace besides the default 1MB. @@ -1524,12 +1562,16 @@ config RELOCATABLE it has been loaded at and the compile time physical address (CONFIG_PHYSICAL_START) is ignored. +# Relocation on x86-32 needs some additional build support +config X86_NEED_RELOCS + def_bool y + depends on X86_32 && RELOCATABLE + config PHYSICAL_ALIGN hex prompt "Alignment value to which kernel should be aligned" if X86_32 - default "0x100000" if X86_32 - default "0x200000" if X86_64 - range 0x2000 0x400000 + default "0x1000000" + range 0x2000 0x1000000 ---help--- This value puts the alignment restrictions on physical address where kernel is loaded and run from. Kernel is compiled for an @@ -1871,25 +1913,26 @@ config DMAR_DEFAULT_ON recommended you say N here while the DMAR code remains experimental. -config DMAR_GFX_WA - def_bool y - prompt "Support for Graphics workaround" +config DMAR_BROKEN_GFX_WA + def_bool n + prompt "Workaround broken graphics drivers (going away soon)" depends on DMAR ---help--- Current Graphics drivers tend to use physical address for DMA and avoid using DMA APIs. Setting this config option permits the IOMMU driver to set a unity map for all the OS-visible memory. Hence the driver can continue - to use physical addresses for DMA. + to use physical addresses for DMA, at least until this + option is removed in the 2.6.32 kernel. config DMAR_FLOPPY_WA def_bool y depends on DMAR ---help--- - Floppy disk drivers are know to bypass DMA API calls + Floppy disk drivers are known to bypass DMA API calls thereby failing to work when IOMMU is enabled. This workaround will setup a 1:1 mapping for the first - 16M to make floppy (an ISA device) work. + 16MiB to make floppy (an ISA device) work. config INTR_REMAP bool "Support for Interrupt Remapping (EXPERIMENTAL)" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index d8359e73317..d105f29bb6b 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -159,14 +159,30 @@ config IOMMU_DEBUG options. See Documentation/x86_64/boot-options.txt for more details. +config IOMMU_STRESS + bool "Enable IOMMU stress-test mode" + ---help--- + This option disables various optimizations in IOMMU related + code to do real stress testing of the IOMMU code. This option + will cause a performance drop and should only be enabled for + testing. + config IOMMU_LEAK bool "IOMMU leak tracing" - depends on DEBUG_KERNEL - depends on IOMMU_DEBUG + depends on IOMMU_DEBUG && DMA_API_DEBUG ---help--- Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. +config X86_DS_SELFTEST + bool "DS selftest" + default y + depends on DEBUG_KERNEL + depends on X86_DS + ---help--- + Perform Debug Store selftests at boot time. + If in doubt, say "N". + config HAVE_MMIOTRACE_SUPPORT def_bool y diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 8c86b72afdc..1b68659c41b 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -7,8 +7,6 @@ else KBUILD_DEFCONFIG := $(ARCH)_defconfig endif -core-$(CONFIG_KVM) += arch/x86/kvm/ - # BITS is used as extension for files which are available in a 32 bit # and a 64 bit version to simplify shared Makefiles. # e.g.: obj-y += foo_$(BITS).o @@ -83,6 +81,11 @@ ifdef CONFIG_CC_STACKPROTECTOR endif endif +# Don't unroll struct assignments with kmemcheck enabled +ifeq ($(CONFIG_KMEMCHECK),y) + KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy) +endif + # Stackpointer is addressed different for 32 bit and 64 bit x86 sp-$(CONFIG_X86_32) := esp sp-$(CONFIG_X86_64) := rsp @@ -118,21 +121,8 @@ head-y += arch/x86/kernel/init_task.o libs-y += arch/x86/lib/ -# Sub architecture files that needs linking first -core-y += $(fcore-y) - -# Xen paravirtualization support -core-$(CONFIG_XEN) += arch/x86/xen/ - -# lguest paravirtualization support -core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/ - -core-y += arch/x86/kernel/ -core-y += arch/x86/mm/ - -core-y += arch/x86/crypto/ -core-y += arch/x86/vdso/ -core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/ +# See arch/x86/Kbuild for content of core part of the kernel +core-y += arch/x86/ # drivers-y are linked after core-y drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/ diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore index 172cf8a98bd..851fe936d24 100644 --- a/arch/x86/boot/.gitignore +++ b/arch/x86/boot/.gitignore @@ -3,6 +3,8 @@ bzImage cpustr.h mkcpustr offsets.h +voffset.h +zoffset.h setup setup.bin setup.elf diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 6633b6e7505..ec749c2bfdd 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -26,9 +26,10 @@ targets := vmlinux.bin setup.bin setup.elf bzImage targets += fdimage fdimage144 fdimage288 image.iso mtools.conf subdir- := compressed -setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o +setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o setup-y += header.o main.o mca.o memory.o pm.o pmjump.o -setup-y += printf.o string.o tty.o video.o video-mode.o version.o +setup-y += printf.o regs.o string.o tty.o video.o video-mode.o +setup-y += version.o setup-$(CONFIG_X86_APM_BOOT) += apm.o # The link order of the video-*.o modules can matter. In particular, @@ -69,6 +70,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ $(call cc-option, -mpreferred-stack-boundary=2) KBUILD_CFLAGS += $(call cc-option, -m32) KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ +GCOV_PROFILE := n $(obj)/bzImage: asflags-y := $(SVGA_MODE) @@ -86,19 +88,27 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -sed-offsets := -e 's/^00*/0/' \ - -e 's/^\([0-9a-fA-F]*\) . \(input_data\|input_data_end\)$$/\#define \2 0x\1/p' +sed-voffset := -e 's/^\([0-9a-fA-F]*\) . \(_text\|_end\)$$/\#define VO_\2 0x\1/p' -quiet_cmd_offsets = OFFSETS $@ - cmd_offsets = $(NM) $< | sed -n $(sed-offsets) > $@ +quiet_cmd_voffset = VOFFSET $@ + cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ -$(obj)/offsets.h: $(obj)/compressed/vmlinux FORCE - $(call if_changed,offsets) +targets += voffset.h +$(obj)/voffset.h: vmlinux FORCE + $(call if_changed,voffset) + +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(startup_32\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' + +quiet_cmd_zoffset = ZOFFSET $@ + cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ + +targets += zoffset.h +$(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE + $(call if_changed,zoffset) -targets += offsets.h AFLAGS_header.o += -I$(obj) -$(obj)/header.o: $(obj)/offsets.h +$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h LDFLAGS_setup.elf := -T $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c index 7c19ce8c244..64a31a6d751 100644 --- a/arch/x86/boot/a20.c +++ b/arch/x86/boot/a20.c @@ -2,7 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007-2008 rPath, Inc. - All Rights Reserved - * Copyright 2009 Intel Corporation + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -90,8 +90,11 @@ static int a20_test_long(void) static void enable_a20_bios(void) { - asm volatile("pushfl; int $0x15; popfl" - : : "a" ((u16)0x2401)); + struct biosregs ireg; + + initregs(&ireg); + ireg.ax = 0x2401; + intcall(0x15, &ireg, NULL); } static void enable_a20_kbc(void) diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c index 7aa6033001f..ee274834ea8 100644 --- a/arch/x86/boot/apm.c +++ b/arch/x86/boot/apm.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * Original APM BIOS checking by Stephen Rothwell, May 1994 * (sfr@canb.auug.org.au) @@ -19,75 +20,56 @@ int query_apm_bios(void) { - u16 ax, bx, cx, dx, di; - u32 ebx, esi; - u8 err; + struct biosregs ireg, oreg; /* APM BIOS installation check */ - ax = 0x5300; - bx = cx = 0; - asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0" - : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx) - : : "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x53; + intcall(0x15, &ireg, &oreg); - if (err) + if (oreg.flags & X86_EFLAGS_CF) return -1; /* No APM BIOS */ - if (bx != 0x504d) /* "PM" signature */ + if (oreg.bx != 0x504d) /* "PM" signature */ return -1; - if (!(cx & 0x02)) /* 32 bits supported? */ + if (!(oreg.cx & 0x02)) /* 32 bits supported? */ return -1; /* Disconnect first, just in case */ - ax = 0x5304; - bx = 0; - asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp" - : "+a" (ax), "+b" (bx) - : : "ecx", "edx", "esi", "edi"); - - /* Paranoia */ - ebx = esi = 0; - cx = dx = di = 0; + ireg.al = 0x04; + intcall(0x15, &ireg, NULL); /* 32-bit connect */ - asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %6" - : "=a" (ax), "+b" (ebx), "+c" (cx), "+d" (dx), - "+S" (esi), "+D" (di), "=m" (err) - : "a" (0x5303)); - - boot_params.apm_bios_info.cseg = ax; - boot_params.apm_bios_info.offset = ebx; - boot_params.apm_bios_info.cseg_16 = cx; - boot_params.apm_bios_info.dseg = dx; - boot_params.apm_bios_info.cseg_len = (u16)esi; - boot_params.apm_bios_info.cseg_16_len = esi >> 16; - boot_params.apm_bios_info.dseg_len = di; - - if (err) + ireg.al = 0x03; + intcall(0x15, &ireg, &oreg); + + boot_params.apm_bios_info.cseg = oreg.ax; + boot_params.apm_bios_info.offset = oreg.ebx; + boot_params.apm_bios_info.cseg_16 = oreg.cx; + boot_params.apm_bios_info.dseg = oreg.dx; + boot_params.apm_bios_info.cseg_len = oreg.si; + boot_params.apm_bios_info.cseg_16_len = oreg.hsi; + boot_params.apm_bios_info.dseg_len = oreg.di; + + if (oreg.flags & X86_EFLAGS_CF) return -1; /* Redo the installation check as the 32-bit connect; some BIOSes return different flags this way... */ - ax = 0x5300; - bx = cx = 0; - asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0" - : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx) - : : "esi", "edi"); + ireg.al = 0x00; + intcall(0x15, &ireg, &oreg); - if (err || bx != 0x504d) { + if ((oreg.eflags & X86_EFLAGS_CF) || oreg.bx != 0x504d) { /* Failure with 32-bit connect, try to disconect and ignore */ - ax = 0x5304; - bx = 0; - asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp" - : "+a" (ax), "+b" (bx) - : : "ecx", "edx", "esi", "edi"); + ireg.al = 0x04; + intcall(0x15, &ireg, NULL); return -1; } - boot_params.apm_bios_info.version = ax; - boot_params.apm_bios_info.flags = cx; + boot_params.apm_bios_info.version = oreg.ax; + boot_params.apm_bios_info.flags = oreg.cx; return 0; } diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S new file mode 100644 index 00000000000..1dfbf64e52a --- /dev/null +++ b/arch/x86/boot/bioscall.S @@ -0,0 +1,82 @@ +/* ----------------------------------------------------------------------- + * + * Copyright 2009 Intel Corporation; author H. Peter Anvin + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * "Glove box" for BIOS calls. Avoids the constant problems with BIOSes + * touching registers they shouldn't be. + */ + + .code16gcc + .text + .globl intcall + .type intcall, @function +intcall: + /* Self-modify the INT instruction. Ugly, but works. */ + cmpb %al, 3f + je 1f + movb %al, 3f + jmp 1f /* Synchronize pipeline */ +1: + /* Save state */ + pushfl + pushw %fs + pushw %gs + pushal + + /* Copy input state to stack frame */ + subw $44, %sp + movw %dx, %si + movw %sp, %di + movw $11, %cx + rep; movsd + + /* Pop full state from the stack */ + popal + popw %gs + popw %fs + popw %es + popw %ds + popfl + + /* Actual INT */ + .byte 0xcd /* INT opcode */ +3: .byte 0 + + /* Push full state to the stack */ + pushfl + pushw %ds + pushw %es + pushw %fs + pushw %gs + pushal + + /* Re-establish C environment invariants */ + cld + movzwl %sp, %esp + movw %cs, %ax + movw %ax, %ds + movw %ax, %es + + /* Copy output state from stack frame */ + movw 68(%esp), %di /* Original %cx == 3rd argument */ + andw %di, %di + jz 4f + movw %sp, %si + movw $11, %cx + rep; movsd +4: addw $44, %sp + + /* Restore state and return */ + popal + popw %gs + popw %fs + popfl + retl + .size intcall, .-intcall diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 7b2692e897e..98239d2658f 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -26,6 +27,7 @@ #include <asm/setup.h> #include "bitops.h" #include <asm/cpufeature.h> +#include <asm/processor-flags.h> /* Useful macros */ #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) @@ -241,6 +243,49 @@ int enable_a20(void); /* apm.c */ int query_apm_bios(void); +/* bioscall.c */ +struct biosregs { + union { + struct { + u32 edi; + u32 esi; + u32 ebp; + u32 _esp; + u32 ebx; + u32 edx; + u32 ecx; + u32 eax; + u32 _fsgs; + u32 _dses; + u32 eflags; + }; + struct { + u16 di, hdi; + u16 si, hsi; + u16 bp, hbp; + u16 _sp, _hsp; + u16 bx, hbx; + u16 dx, hdx; + u16 cx, hcx; + u16 ax, hax; + u16 gs, fs; + u16 es, ds; + u16 flags, hflags; + }; + struct { + u8 dil, dih, edi2, edi3; + u8 sil, sih, esi2, esi3; + u8 bpl, bph, ebp2, ebp3; + u8 _spl, _sph, _esp2, _esp3; + u8 bl, bh, ebx2, ebx3; + u8 dl, dh, edx2, edx3; + u8 cl, ch, ecx2, ecx3; + u8 al, ah, eax2, eax3; + }; + }; +}; +void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg); + /* cmdline.c */ int cmdline_find_option(const char *option, char *buffer, int bufsize); int cmdline_find_option_bool(const char *option); @@ -279,6 +324,9 @@ int sprintf(char *buf, const char *fmt, ...); int vsprintf(char *buf, const char *fmt, va_list args); int printf(const char *fmt, ...); +/* regs.c */ +void initregs(struct biosregs *regs); + /* string.c */ int strcmp(const char *str1, const char *str2); size_t strnlen(const char *s, size_t maxlen); diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore index 63eff3b04d0..4a46fab7162 100644 --- a/arch/x86/boot/compressed/.gitignore +++ b/arch/x86/boot/compressed/.gitignore @@ -1,3 +1,6 @@ relocs vmlinux.bin.all vmlinux.relocs +vmlinux.lds +mkpiggy +piggy.S diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 65551c9f857..f8ed0658404 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -4,7 +4,7 @@ # create a compressed vmlinux image from the original vmlinux # -targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o +targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 KBUILD_CFLAGS += -fno-strict-aliasing -fPIC @@ -15,11 +15,14 @@ KBUILD_CFLAGS += $(call cc-option,-ffreestanding) KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector) KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ +GCOV_PROFILE := n LDFLAGS := -m elf_$(UTS_MACHINE) LDFLAGS_vmlinux := -T -$(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE +hostprogs-y := mkpiggy + +$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE $(call if_changed,ld) @: @@ -29,7 +32,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE targets += vmlinux.bin.all vmlinux.relocs relocs -hostprogs-$(CONFIG_X86_32) += relocs +hostprogs-$(CONFIG_X86_NEED_RELOCS) += relocs quiet_cmd_relocs = RELOCS $@ cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< @@ -37,46 +40,22 @@ $(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE $(call if_changed,relocs) vmlinux.bin.all-y := $(obj)/vmlinux.bin -vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs -quiet_cmd_relocbin = BUILD $@ - cmd_relocbin = cat $(filter-out FORCE,$^) > $@ -$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE - $(call if_changed,relocbin) - -ifeq ($(CONFIG_X86_32),y) +vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs -ifdef CONFIG_RELOCATABLE -$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE - $(call if_changed,gzip) -$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin.all FORCE - $(call if_changed,bzip2) -$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin.all FORCE - $(call if_changed,lzma) -else -$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE +$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE $(call if_changed,gzip) -$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE +$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE $(call if_changed,bzip2) -$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE +$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE $(call if_changed,lzma) -endif -LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T -else +suffix-$(CONFIG_KERNEL_GZIP) := gz +suffix-$(CONFIG_KERNEL_BZIP2) := bz2 +suffix-$(CONFIG_KERNEL_LZMA) := lzma -$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE - $(call if_changed,gzip) -$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE - $(call if_changed,bzip2) -$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE - $(call if_changed,lzma) - -LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T -endif +quiet_cmd_mkpiggy = MKPIGGY $@ + cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) -suffix_$(CONFIG_KERNEL_GZIP) = gz -suffix_$(CONFIG_KERNEL_BZIP2) = bz2 -suffix_$(CONFIG_KERNEL_LZMA) = lzma - -$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE - $(call if_changed,ld) +targets += piggy.S +$(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE + $(call if_changed,mkpiggy) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 3a8a866fb2e..75e4f001e70 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -12,16 +12,16 @@ * the page directory. [According to comments etc elsewhere on a compressed * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC] * - * Page 0 is deliberately kept safe, since System Management Mode code in + * Page 0 is deliberately kept safe, since System Management Mode code in * laptops may need to access the BIOS data stored there. This is also - * useful for future device drivers that either access the BIOS via VM86 + * useful for future device drivers that either access the BIOS via VM86 * mode. */ /* * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 */ -.text + .text #include <linux/linkage.h> #include <asm/segment.h> @@ -29,161 +29,151 @@ #include <asm/boot.h> #include <asm/asm-offsets.h> -.section ".text.head","ax",@progbits + .section ".text.head","ax",@progbits ENTRY(startup_32) cld - /* test KEEP_SEGMENTS flag to see if the bootloader is asking - * us to not reload segments */ - testb $(1<<6), BP_loadflags(%esi) - jnz 1f + /* + * Test KEEP_SEGMENTS flag to see if the bootloader is asking + * us to not reload segments + */ + testb $(1<<6), BP_loadflags(%esi) + jnz 1f cli - movl $(__BOOT_DS),%eax - movl %eax,%ds - movl %eax,%es - movl %eax,%fs - movl %eax,%gs - movl %eax,%ss + movl $__BOOT_DS, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %fs + movl %eax, %gs + movl %eax, %ss 1: -/* Calculate the delta between where we were compiled to run +/* + * Calculate the delta between where we were compiled to run * at and where we were actually loaded at. This can only be done * with a short local call on x86. Nothing else will tell us what * address we are running at. The reserved chunk of the real-mode * data at 0x1e4 (defined as a scratch field) are used as the stack * for this calculation. Only 4 bytes are needed. */ - leal (0x1e4+4)(%esi), %esp - call 1f -1: popl %ebp - subl $1b, %ebp + leal (BP_scratch+4)(%esi), %esp + call 1f +1: popl %ebp + subl $1b, %ebp -/* %ebp contains the address we are loaded at by the boot loader and %ebx +/* + * %ebp contains the address we are loaded at by the boot loader and %ebx * contains the address where we should move the kernel image temporarily * for safe in-place decompression. */ #ifdef CONFIG_RELOCATABLE - movl %ebp, %ebx - addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebx - andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx + movl %ebp, %ebx + movl BP_kernel_alignment(%esi), %eax + decl %eax + addl %eax, %ebx + notl %eax + andl %eax, %ebx #else - movl $LOAD_PHYSICAL_ADDR, %ebx + movl $LOAD_PHYSICAL_ADDR, %ebx #endif - /* Replace the compressed data size with the uncompressed size */ - subl input_len(%ebp), %ebx - movl output_len(%ebp), %eax - addl %eax, %ebx - /* Add 8 bytes for every 32K input block */ - shrl $12, %eax - addl %eax, %ebx - /* Add 32K + 18 bytes of extra slack */ - addl $(32768 + 18), %ebx - /* Align on a 4K boundary */ - addl $4095, %ebx - andl $~4095, %ebx - -/* Copy the compressed kernel to the end of our buffer + /* Target address to relocate to for decompression */ + addl $z_extract_offset, %ebx + + /* Set up the stack */ + leal boot_stack_end(%ebx), %esp + + /* Zero EFLAGS */ + pushl $0 + popfl + +/* + * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. */ - pushl %esi - leal _end(%ebp), %esi - leal _end(%ebx), %edi - movl $(_end - startup_32), %ecx + pushl %esi + leal (_bss-4)(%ebp), %esi + leal (_bss-4)(%ebx), %edi + movl $(_bss - startup_32), %ecx + shrl $2, %ecx std - rep - movsb + rep movsl cld - popl %esi - -/* Compute the kernel start address. - */ -#ifdef CONFIG_RELOCATABLE - addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebp - andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebp -#else - movl $LOAD_PHYSICAL_ADDR, %ebp -#endif + popl %esi /* * Jump to the relocated address. */ - leal relocated(%ebx), %eax - jmp *%eax + leal relocated(%ebx), %eax + jmp *%eax ENDPROC(startup_32) -.section ".text" + .text relocated: /* - * Clear BSS - */ - xorl %eax,%eax - leal _edata(%ebx),%edi - leal _end(%ebx), %ecx - subl %edi,%ecx - cld - rep - stosb - -/* - * Setup the stack for the decompressor + * Clear BSS (stack is currently empty) */ - leal boot_stack_end(%ebx), %esp + xorl %eax, %eax + leal _bss(%ebx), %edi + leal _ebss(%ebx), %ecx + subl %edi, %ecx + shrl $2, %ecx + rep stosl /* * Do the decompression, and jump to the new kernel.. */ - movl output_len(%ebx), %eax - pushl %eax - # push arguments for decompress_kernel: - pushl %ebp # output address - movl input_len(%ebx), %eax - pushl %eax # input_len - leal input_data(%ebx), %eax - pushl %eax # input_data - leal boot_heap(%ebx), %eax - pushl %eax # heap area - pushl %esi # real mode pointer - call decompress_kernel - addl $20, %esp - popl %ecx + leal z_extract_offset_negative(%ebx), %ebp + /* push arguments for decompress_kernel: */ + pushl %ebp /* output address */ + pushl $z_input_len /* input_len */ + leal input_data(%ebx), %eax + pushl %eax /* input_data */ + leal boot_heap(%ebx), %eax + pushl %eax /* heap area */ + pushl %esi /* real mode pointer */ + call decompress_kernel + addl $20, %esp #if CONFIG_RELOCATABLE -/* Find the address of the relocations. +/* + * Find the address of the relocations. */ - movl %ebp, %edi - addl %ecx, %edi + leal z_output_len(%ebp), %edi -/* Calculate the delta between where vmlinux was compiled to run +/* + * Calculate the delta between where vmlinux was compiled to run * and where it was actually loaded. */ - movl %ebp, %ebx - subl $LOAD_PHYSICAL_ADDR, %ebx - jz 2f /* Nothing to be done if loaded at compiled addr. */ + movl %ebp, %ebx + subl $LOAD_PHYSICAL_ADDR, %ebx + jz 2f /* Nothing to be done if loaded at compiled addr. */ /* * Process relocations. */ -1: subl $4, %edi - movl 0(%edi), %ecx - testl %ecx, %ecx - jz 2f - addl %ebx, -__PAGE_OFFSET(%ebx, %ecx) - jmp 1b +1: subl $4, %edi + movl (%edi), %ecx + testl %ecx, %ecx + jz 2f + addl %ebx, -__PAGE_OFFSET(%ebx, %ecx) + jmp 1b 2: #endif /* * Jump to the decompressed kernel. */ - xorl %ebx,%ebx - jmp *%ebp + xorl %ebx, %ebx + jmp *%ebp -.bss -/* Stack and heap for uncompression */ -.balign 4 +/* + * Stack and heap for uncompression + */ + .bss + .balign 4 boot_heap: .fill BOOT_HEAP_SIZE, 1, 0 boot_stack: diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index ed4a8294800..f62c284db9e 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -21,8 +21,8 @@ /* * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 */ -.code32 -.text + .code32 + .text #include <linux/linkage.h> #include <asm/segment.h> @@ -33,12 +33,14 @@ #include <asm/processor-flags.h> #include <asm/asm-offsets.h> -.section ".text.head" + .section ".text.head" .code32 ENTRY(startup_32) cld - /* test KEEP_SEGMENTS flag to see if the bootloader is asking - * us to not reload segments */ + /* + * Test KEEP_SEGMENTS flag to see if the bootloader is asking + * us to not reload segments + */ testb $(1<<6), BP_loadflags(%esi) jnz 1f @@ -49,14 +51,15 @@ ENTRY(startup_32) movl %eax, %ss 1: -/* Calculate the delta between where we were compiled to run +/* + * Calculate the delta between where we were compiled to run * at and where we were actually loaded at. This can only be done * with a short local call on x86. Nothing else will tell us what * address we are running at. The reserved chunk of the real-mode * data at 0x1e4 (defined as a scratch field) are used as the stack * for this calculation. Only 4 bytes are needed. */ - leal (0x1e4+4)(%esi), %esp + leal (BP_scratch+4)(%esi), %esp call 1f 1: popl %ebp subl $1b, %ebp @@ -70,32 +73,28 @@ ENTRY(startup_32) testl %eax, %eax jnz no_longmode -/* Compute the delta between where we were compiled to run at +/* + * Compute the delta between where we were compiled to run at * and where the code will actually run at. - */ -/* %ebp contains the address we are loaded at by the boot loader and %ebx + * + * %ebp contains the address we are loaded at by the boot loader and %ebx * contains the address where we should move the kernel image temporarily * for safe in-place decompression. */ #ifdef CONFIG_RELOCATABLE movl %ebp, %ebx - addl $(PMD_PAGE_SIZE -1), %ebx - andl $PMD_PAGE_MASK, %ebx + movl BP_kernel_alignment(%esi), %eax + decl %eax + addl %eax, %ebx + notl %eax + andl %eax, %ebx #else - movl $CONFIG_PHYSICAL_START, %ebx + movl $LOAD_PHYSICAL_ADDR, %ebx #endif - /* Replace the compressed data size with the uncompressed size */ - subl input_len(%ebp), %ebx - movl output_len(%ebp), %eax - addl %eax, %ebx - /* Add 8 bytes for every 32K input block */ - shrl $12, %eax - addl %eax, %ebx - /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ - addl $(32768 + 18 + 4095), %ebx - andl $~4095, %ebx + /* Target address to relocate to for decompression */ + addl $z_extract_offset, %ebx /* * Prepare for entering 64 bit mode @@ -114,7 +113,7 @@ ENTRY(startup_32) /* * Build early 4G boot pagetable */ - /* Initialize Page tables to 0*/ + /* Initialize Page tables to 0 */ leal pgtable(%ebx), %edi xorl %eax, %eax movl $((4096*6)/4), %ecx @@ -155,7 +154,8 @@ ENTRY(startup_32) btsl $_EFER_LME, %eax wrmsr - /* Setup for the jump to 64bit mode + /* + * Setup for the jump to 64bit mode * * When the jump is performend we will be in long mode but * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1 @@ -184,7 +184,8 @@ no_longmode: #include "../../kernel/verify_cpu_64.S" - /* Be careful here startup_64 needs to be at a predictable + /* + * Be careful here startup_64 needs to be at a predictable * address so I can export it in an ELF header. Bootloaders * should look at the ELF header to find this address, as * it may change in the future. @@ -192,7 +193,8 @@ no_longmode: .code64 .org 0x200 ENTRY(startup_64) - /* We come here either from startup_32 or directly from a + /* + * We come here either from startup_32 or directly from a * 64bit bootloader. If we come here from a bootloader we depend on * an identity mapped page table being provied that maps our * entire text+data+bss and hopefully all of memory. @@ -209,50 +211,54 @@ ENTRY(startup_64) movl $0x20, %eax ltr %ax - /* Compute the decompressed kernel start address. It is where + /* + * Compute the decompressed kernel start address. It is where * we were loaded at aligned to a 2M boundary. %rbp contains the * decompressed kernel start address. * * If it is a relocatable kernel then decompress and run the kernel * from load address aligned to 2MB addr, otherwise decompress and - * run the kernel from CONFIG_PHYSICAL_START + * run the kernel from LOAD_PHYSICAL_ADDR + * + * We cannot rely on the calculation done in 32-bit mode, since we + * may have been invoked via the 64-bit entry point. */ /* Start with the delta to where the kernel will run at. */ #ifdef CONFIG_RELOCATABLE leaq startup_32(%rip) /* - $startup_32 */, %rbp - addq $(PMD_PAGE_SIZE - 1), %rbp - andq $PMD_PAGE_MASK, %rbp - movq %rbp, %rbx + movl BP_kernel_alignment(%rsi), %eax + decl %eax + addq %rax, %rbp + notq %rax + andq %rax, %rbp #else - movq $CONFIG_PHYSICAL_START, %rbp - movq %rbp, %rbx + movq $LOAD_PHYSICAL_ADDR, %rbp #endif - /* Replace the compressed data size with the uncompressed size */ - movl input_len(%rip), %eax - subq %rax, %rbx - movl output_len(%rip), %eax - addq %rax, %rbx - /* Add 8 bytes for every 32K input block */ - shrq $12, %rax - addq %rax, %rbx - /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ - addq $(32768 + 18 + 4095), %rbx - andq $~4095, %rbx - -/* Copy the compressed kernel to the end of our buffer + /* Target address to relocate to for decompression */ + leaq z_extract_offset(%rbp), %rbx + + /* Set up the stack */ + leaq boot_stack_end(%rbx), %rsp + + /* Zero EFLAGS */ + pushq $0 + popfq + +/* + * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. */ - leaq _end_before_pgt(%rip), %r8 - leaq _end_before_pgt(%rbx), %r9 - movq $_end_before_pgt /* - $startup_32 */, %rcx -1: subq $8, %r8 - subq $8, %r9 - movq 0(%r8), %rax - movq %rax, 0(%r9) - subq $8, %rcx - jnz 1b + pushq %rsi + leaq (_bss-8)(%rip), %rsi + leaq (_bss-8)(%rbx), %rdi + movq $_bss /* - $startup_32 */, %rcx + shrq $3, %rcx + std + rep movsq + cld + popq %rsi /* * Jump to the relocated address. @@ -260,37 +266,28 @@ ENTRY(startup_64) leaq relocated(%rbx), %rax jmp *%rax -.section ".text" + .text relocated: /* - * Clear BSS + * Clear BSS (stack is currently empty) */ - xorq %rax, %rax - leaq _edata(%rbx), %rdi - leaq _end_before_pgt(%rbx), %rcx + xorl %eax, %eax + leaq _bss(%rip), %rdi + leaq _ebss(%rip), %rcx subq %rdi, %rcx - cld - rep - stosb - - /* Setup the stack */ - leaq boot_stack_end(%rip), %rsp - - /* zero EFLAGS after setting rsp */ - pushq $0 - popfq + shrq $3, %rcx + rep stosq /* * Do the decompression, and jump to the new kernel.. */ - pushq %rsi # Save the real mode argument - movq %rsi, %rdi # real mode address - leaq boot_heap(%rip), %rsi # malloc area for uncompression - leaq input_data(%rip), %rdx # input_data - movl input_len(%rip), %eax - movq %rax, %rcx # input_len - movq %rbp, %r8 # output + pushq %rsi /* Save the real mode argument */ + movq %rsi, %rdi /* real mode address */ + leaq boot_heap(%rip), %rsi /* malloc area for uncompression */ + leaq input_data(%rip), %rdx /* input_data */ + movl $z_input_len, %ecx /* input_len */ + movq %rbp, %r8 /* output target address */ call decompress_kernel popq %rsi @@ -311,11 +308,21 @@ gdt: .quad 0x0000000000000000 /* TS continued */ gdt_end: -.bss -/* Stack and heap for uncompression */ -.balign 4 +/* + * Stack and heap for uncompression + */ + .bss + .balign 4 boot_heap: .fill BOOT_HEAP_SIZE, 1, 0 boot_stack: .fill BOOT_STACK_SIZE, 1, 0 boot_stack_end: + +/* + * Space for page tables (not in .bss so not zeroed) + */ + .section ".pgtable","a",@nobits + .balign 4096 +pgtable: + .fill 6*4096, 1, 0 diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index e45be73684f..842b2a36174 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -325,21 +325,19 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; + if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1)) + error("Destination address inappropriately aligned"); #ifdef CONFIG_X86_64 - if ((unsigned long)output & (__KERNEL_ALIGN - 1)) - error("Destination address not 2M aligned"); - if ((unsigned long)output >= 0xffffffffffUL) + if (heap > 0x3fffffffffffUL) error("Destination address too large"); #else - if ((u32)output & (CONFIG_PHYSICAL_ALIGN - 1)) - error("Destination address not CONFIG_PHYSICAL_ALIGN aligned"); if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff)) error("Destination address too large"); +#endif #ifndef CONFIG_RELOCATABLE - if ((u32)output != LOAD_PHYSICAL_ADDR) + if ((unsigned long)output != LOAD_PHYSICAL_ADDR) error("Wrong destination address"); #endif -#endif if (!quiet) putstr("\nDecompressing Linux... "); diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c new file mode 100644 index 00000000000..bcbd36c4143 --- /dev/null +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -0,0 +1,97 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright (C) 2009 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * H. Peter Anvin <hpa@linux.intel.com> + * + * ----------------------------------------------------------------------- */ + +/* + * Compute the desired load offset from a compressed program; outputs + * a small assembly wrapper with the appropriate symbols defined. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <inttypes.h> + +static uint32_t getle32(const void *p) +{ + const uint8_t *cp = p; + + return (uint32_t)cp[0] + ((uint32_t)cp[1] << 8) + + ((uint32_t)cp[2] << 16) + ((uint32_t)cp[3] << 24); +} + +int main(int argc, char *argv[]) +{ + uint32_t olen; + long ilen; + unsigned long offs; + FILE *f; + + if (argc < 2) { + fprintf(stderr, "Usage: %s compressed_file\n", argv[0]); + return 1; + } + + /* Get the information for the compressed kernel image first */ + + f = fopen(argv[1], "r"); + if (!f) { + perror(argv[1]); + return 1; + } + + + if (fseek(f, -4L, SEEK_END)) { + perror(argv[1]); + } + fread(&olen, sizeof olen, 1, f); + ilen = ftell(f); + olen = getle32(&olen); + fclose(f); + + /* + * Now we have the input (compressed) and output (uncompressed) + * sizes, compute the necessary decompression offset... + */ + + offs = (olen > ilen) ? olen - ilen : 0; + offs += olen >> 12; /* Add 8 bytes for each 32K block */ + offs += 32*1024 + 18; /* Add 32K + 18 bytes slack */ + offs = (offs+4095) & ~4095; /* Round to a 4K boundary */ + + printf(".section \".rodata.compressed\",\"a\",@progbits\n"); + printf(".globl z_input_len\n"); + printf("z_input_len = %lu\n", ilen); + printf(".globl z_output_len\n"); + printf("z_output_len = %lu\n", (unsigned long)olen); + printf(".globl z_extract_offset\n"); + printf("z_extract_offset = 0x%lx\n", offs); + /* z_extract_offset_negative allows simplification of head_32.S */ + printf(".globl z_extract_offset_negative\n"); + printf("z_extract_offset_negative = -0x%lx\n", offs); + + printf(".globl input_data, input_data_end\n"); + printf("input_data:\n"); + printf(".incbin \"%s\"\n", argv[1]); + printf("input_data_end:\n"); + + return 0; +} diff --git a/arch/x86/boot/compressed/vmlinux_64.lds b/arch/x86/boot/compressed/vmlinux.lds.S index bef1ac891bc..cc353e1b3ff 100644 --- a/arch/x86/boot/compressed/vmlinux_64.lds +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -1,6 +1,17 @@ -OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") +OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) + +#undef i386 + +#include <asm/page_types.h> + +#ifdef CONFIG_X86_64 OUTPUT_ARCH(i386:x86-64) ENTRY(startup_64) +#else +OUTPUT_ARCH(i386) +ENTRY(startup_32) +#endif + SECTIONS { /* Be careful parts of head_64.S assume startup_32 is at @@ -33,16 +44,22 @@ SECTIONS *(.data.*) _edata = . ; } + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); .bss : { _bss = . ; *(.bss) *(.bss.*) *(COMMON) - . = ALIGN(8); - _end_before_pgt = . ; - . = ALIGN(4096); - pgtable = . ; - . = . + 4096 * 6; + . = ALIGN(8); /* For convenience during zeroing */ _ebss = .; } +#ifdef CONFIG_X86_64 + . = ALIGN(PAGE_SIZE); + .pgtable : { + _pgtable = . ; + *(.pgtable) + _epgtable = . ; + } +#endif + _end = .; } diff --git a/arch/x86/boot/compressed/vmlinux.scr b/arch/x86/boot/compressed/vmlinux.scr deleted file mode 100644 index f02382ae5c4..00000000000 --- a/arch/x86/boot/compressed/vmlinux.scr +++ /dev/null @@ -1,10 +0,0 @@ -SECTIONS -{ - .rodata.compressed : { - input_len = .; - LONG(input_data_end - input_data) input_data = .; - *(.data) - output_len = . - 4; - input_data_end = .; - } -} diff --git a/arch/x86/boot/compressed/vmlinux_32.lds b/arch/x86/boot/compressed/vmlinux_32.lds deleted file mode 100644 index bb3c48379c4..00000000000 --- a/arch/x86/boot/compressed/vmlinux_32.lds +++ /dev/null @@ -1,43 +0,0 @@ -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") -OUTPUT_ARCH(i386) -ENTRY(startup_32) -SECTIONS -{ - /* Be careful parts of head_32.S assume startup_32 is at - * address 0. - */ - . = 0; - .text.head : { - _head = . ; - *(.text.head) - _ehead = . ; - } - .rodata.compressed : { - *(.rodata.compressed) - } - .text : { - _text = .; /* Text */ - *(.text) - *(.text.*) - _etext = . ; - } - .rodata : { - _rodata = . ; - *(.rodata) /* read-only data */ - *(.rodata.*) - _erodata = . ; - } - .data : { - _data = . ; - *(.data) - *(.data.*) - _edata = . ; - } - .bss : { - _bss = . ; - *(.bss) - *(.bss.*) - *(COMMON) - _end = . ; - } -} diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c index 1aae8f3e5ca..c501a5b466f 100644 --- a/arch/x86/boot/edd.c +++ b/arch/x86/boot/edd.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -22,17 +23,17 @@ */ static int read_mbr(u8 devno, void *buf) { - u16 ax, bx, cx, dx; + struct biosregs ireg, oreg; - ax = 0x0201; /* Legacy Read, one sector */ - cx = 0x0001; /* Sector 0-0-1 */ - dx = devno; - bx = (size_t)buf; - asm volatile("pushfl; stc; int $0x13; setc %%al; popfl" - : "+a" (ax), "+c" (cx), "+d" (dx), "+b" (bx) - : : "esi", "edi", "memory"); + initregs(&ireg); + ireg.ax = 0x0201; /* Legacy Read, one sector */ + ireg.cx = 0x0001; /* Sector 0-0-1 */ + ireg.dl = devno; + ireg.bx = (size_t)buf; - return -(u8)ax; /* 0 or -1 */ + intcall(0x13, &ireg, &oreg); + + return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */ } static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) @@ -72,56 +73,46 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) static int get_edd_info(u8 devno, struct edd_info *ei) { - u16 ax, bx, cx, dx, di; + struct biosregs ireg, oreg; memset(ei, 0, sizeof *ei); /* Check Extensions Present */ - ax = 0x4100; - bx = EDDMAGIC1; - dx = devno; - asm("pushfl; stc; int $0x13; setc %%al; popfl" - : "+a" (ax), "+b" (bx), "=c" (cx), "+d" (dx) - : : "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x41; + ireg.bx = EDDMAGIC1; + ireg.dl = devno; + intcall(0x13, &ireg, &oreg); - if ((u8)ax) + if (oreg.eflags & X86_EFLAGS_CF) return -1; /* No extended information */ - if (bx != EDDMAGIC2) + if (oreg.bx != EDDMAGIC2) return -1; ei->device = devno; - ei->version = ax >> 8; /* EDD version number */ - ei->interface_support = cx; /* EDD functionality subsets */ + ei->version = oreg.ah; /* EDD version number */ + ei->interface_support = oreg.cx; /* EDD functionality subsets */ /* Extended Get Device Parameters */ ei->params.length = sizeof(ei->params); - ax = 0x4800; - dx = devno; - asm("pushfl; int $0x13; popfl" - : "+a" (ax), "+d" (dx), "=m" (ei->params) - : "S" (&ei->params) - : "ebx", "ecx", "edi"); + ireg.ah = 0x48; + ireg.si = (size_t)&ei->params; + intcall(0x13, &ireg, &oreg); /* Get legacy CHS parameters */ /* Ralf Brown recommends setting ES:DI to 0:0 */ - ax = 0x0800; - dx = devno; - di = 0; - asm("pushw %%es; " - "movw %%di,%%es; " - "pushfl; stc; int $0x13; setc %%al; popfl; " - "popw %%es" - : "+a" (ax), "=b" (bx), "=c" (cx), "+d" (dx), "+D" (di) - : : "esi"); - - if ((u8)ax == 0) { - ei->legacy_max_cylinder = (cx >> 8) + ((cx & 0xc0) << 2); - ei->legacy_max_head = dx >> 8; - ei->legacy_sectors_per_track = cx & 0x3f; + ireg.ah = 0x08; + ireg.es = 0; + intcall(0x13, &ireg, &oreg); + + if (!(oreg.eflags & X86_EFLAGS_CF)) { + ei->legacy_max_cylinder = oreg.ch + ((oreg.cl & 0xc0) << 2); + ei->legacy_max_head = oreg.dh; + ei->legacy_sectors_per_track = oreg.cl & 0x3f; } return 0; diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 5d84d1c74e4..b31cc54b464 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -22,7 +22,8 @@ #include <asm/page_types.h> #include <asm/setup.h> #include "boot.h" -#include "offsets.h" +#include "voffset.h" +#include "zoffset.h" BOOTSEG = 0x07C0 /* original address of boot-sector */ SYSSEG = 0x1000 /* historical load address >> 4 */ @@ -115,7 +116,7 @@ _start: # Part 2 of the header, from the old setup.S .ascii "HdrS" # header signature - .word 0x0209 # header version number (>= 0x0105) + .word 0x020a # header version number (>= 0x0105) # or else old loadlin-1.5 will fail) .globl realmode_swtch realmode_swtch: .word 0, 0 # default_switch, SETUPSEG @@ -168,7 +169,11 @@ heap_end_ptr: .word _end+STACK_SIZE-512 # end of setup code can be used by setup # for local heap purposes. -pad1: .word 0 +ext_loader_ver: + .byte 0 # Extended boot loader version +ext_loader_type: + .byte 0 # Extended boot loader type + cmd_line_ptr: .long 0 # (Header version 0x0202 or later) # If nonzero, a 32-bit pointer # to the kernel command line. @@ -200,7 +205,7 @@ relocatable_kernel: .byte 1 #else relocatable_kernel: .byte 0 #endif -pad2: .byte 0 +min_alignment: .byte MIN_KERNEL_ALIGN_LG2 # minimum alignment pad3: .word 0 cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, @@ -212,16 +217,27 @@ hardware_subarch: .long 0 # subarchitecture, added with 2.07 hardware_subarch_data: .quad 0 -payload_offset: .long input_data -payload_length: .long input_data_end-input_data +payload_offset: .long ZO_input_data +payload_length: .long ZO_z_input_len setup_data: .quad 0 # 64-bit physical pointer to # single linked list of # struct setup_data +pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr + +#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset) +#define VO_INIT_SIZE (VO__end - VO__text) +#if ZO_INIT_SIZE > VO_INIT_SIZE +#define INIT_SIZE ZO_INIT_SIZE +#else +#define INIT_SIZE VO_INIT_SIZE +#endif +init_size: .long INIT_SIZE # kernel initialization size + # End of setup header ##################################################### - .section ".inittext", "ax" + .section ".entrytext", "ax" start_of_setup: #ifdef SAFE_RESET_DISK_CONTROLLER # Reset the disk controller. diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 58f0415d3ae..140172b895b 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -61,11 +62,10 @@ static void copy_boot_params(void) */ static void keyboard_set_repeat(void) { - u16 ax = 0x0305; - u16 bx = 0; - asm volatile("int $0x16" - : "+a" (ax), "+b" (bx) - : : "ecx", "edx", "esi", "edi"); + struct biosregs ireg; + initregs(&ireg); + ireg.ax = 0x0305; + intcall(0x16, &ireg, NULL); } /* @@ -73,18 +73,22 @@ static void keyboard_set_repeat(void) */ static void query_ist(void) { + struct biosregs ireg, oreg; + /* Some older BIOSes apparently crash on this call, so filter it from machines too old to have SpeedStep at all. */ if (cpu.level < 6) return; - asm("int $0x15" - : "=a" (boot_params.ist_info.signature), - "=b" (boot_params.ist_info.command), - "=c" (boot_params.ist_info.event), - "=d" (boot_params.ist_info.perf_level) - : "a" (0x0000e980), /* IST Support */ - "d" (0x47534943)); /* Request value */ + initregs(&ireg); + ireg.ax = 0xe980; /* IST Support */ + ireg.edx = 0x47534943; /* Request value */ + intcall(0x15, &ireg, &oreg); + + boot_params.ist_info.signature = oreg.eax; + boot_params.ist_info.command = oreg.ebx; + boot_params.ist_info.event = oreg.ecx; + boot_params.ist_info.perf_level = oreg.edx; } /* @@ -93,13 +97,12 @@ static void query_ist(void) static void set_bios_mode(void) { #ifdef CONFIG_X86_64 - u32 eax, ebx; + struct biosregs ireg; - eax = 0xec00; - ebx = 2; - asm volatile("int $0x15" - : "+a" (eax), "+b" (ebx) - : : "ecx", "edx", "esi", "edi"); + initregs(&ireg); + ireg.ax = 0xec00; + ireg.bx = 2; + intcall(0x15, &ireg, NULL); #endif } diff --git a/arch/x86/boot/mca.c b/arch/x86/boot/mca.c index 911eaae5d69..a95a531148e 100644 --- a/arch/x86/boot/mca.c +++ b/arch/x86/boot/mca.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -16,26 +17,22 @@ int query_mca(void) { - u8 err; - u16 es, bx, len; - - asm("pushw %%es ; " - "int $0x15 ; " - "setc %0 ; " - "movw %%es, %1 ; " - "popw %%es" - : "=acd" (err), "=acdSD" (es), "=b" (bx) - : "a" (0xc000)); - - if (err) + struct biosregs ireg, oreg; + u16 len; + + initregs(&ireg); + ireg.ah = 0xc0; + intcall(0x15, &ireg, &oreg); + + if (oreg.eflags & X86_EFLAGS_CF) return -1; /* No MCA present */ - set_fs(es); - len = rdfs16(bx); + set_fs(oreg.es); + len = rdfs16(oreg.bx); if (len > sizeof(boot_params.sys_desc_table)) len = sizeof(boot_params.sys_desc_table); - copy_from_fs(&boot_params.sys_desc_table, bx, len); + copy_from_fs(&boot_params.sys_desc_table, oreg.bx, len); return 0; } diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c index 74b3d2ba84e..cae3feb1035 100644 --- a/arch/x86/boot/memory.c +++ b/arch/x86/boot/memory.c @@ -20,12 +20,16 @@ static int detect_memory_e820(void) { int count = 0; - u32 next = 0; - u32 size, id, edi; - u8 err; + struct biosregs ireg, oreg; struct e820entry *desc = boot_params.e820_map; static struct e820entry buf; /* static so it is zeroed */ + initregs(&ireg); + ireg.ax = 0xe820; + ireg.cx = sizeof buf; + ireg.edx = SMAP; + ireg.di = (size_t)&buf; + /* * Note: at least one BIOS is known which assumes that the * buffer pointed to by one e820 call is the same one as @@ -41,22 +45,13 @@ static int detect_memory_e820(void) */ do { - size = sizeof buf; - - /* Important: %edx and %esi are clobbered by some BIOSes, - so they must be either used for the error output - or explicitly marked clobbered. Given that, assume there - is something out there clobbering %ebp and %edi, too. */ - asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0" - : "=d" (err), "+b" (next), "=a" (id), "+c" (size), - "=D" (edi), "+m" (buf) - : "D" (&buf), "d" (SMAP), "a" (0xe820) - : "esi"); + intcall(0x15, &ireg, &oreg); + ireg.ebx = oreg.ebx; /* for next iteration... */ /* BIOSes which terminate the chain with CF = 1 as opposed to %ebx = 0 don't always report the SMAP signature on the final, failing, probe. */ - if (err) + if (oreg.eflags & X86_EFLAGS_CF) break; /* Some BIOSes stop returning SMAP in the middle of @@ -64,60 +59,64 @@ static int detect_memory_e820(void) screwed up the map at that point, we might have a partial map, the full map, or complete garbage, so just return failure. */ - if (id != SMAP) { + if (oreg.eax != SMAP) { count = 0; break; } *desc++ = buf; count++; - } while (next && count < ARRAY_SIZE(boot_params.e820_map)); + } while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_map)); return boot_params.e820_entries = count; } static int detect_memory_e801(void) { - u16 ax, bx, cx, dx; - u8 err; + struct biosregs ireg, oreg; - bx = cx = dx = 0; - ax = 0xe801; - asm("stc; int $0x15; setc %0" - : "=m" (err), "+a" (ax), "+b" (bx), "+c" (cx), "+d" (dx)); + initregs(&ireg); + ireg.ax = 0xe801; + intcall(0x15, &ireg, &oreg); - if (err) + if (oreg.eflags & X86_EFLAGS_CF) return -1; /* Do we really need to do this? */ - if (cx || dx) { - ax = cx; - bx = dx; + if (oreg.cx || oreg.dx) { + oreg.ax = oreg.cx; + oreg.bx = oreg.dx; } - if (ax > 15*1024) + if (oreg.ax > 15*1024) { return -1; /* Bogus! */ - - /* This ignores memory above 16MB if we have a memory hole - there. If someone actually finds a machine with a memory - hole at 16MB and no support for 0E820h they should probably - generate a fake e820 map. */ - boot_params.alt_mem_k = (ax == 15*1024) ? (dx << 6)+ax : ax; + } else if (oreg.ax == 15*1024) { + boot_params.alt_mem_k = (oreg.dx << 6) + oreg.ax; + } else { + /* + * This ignores memory above 16MB if we have a memory + * hole there. If someone actually finds a machine + * with a memory hole at 16MB and no support for + * 0E820h they should probably generate a fake e820 + * map. + */ + boot_params.alt_mem_k = oreg.ax; + } return 0; } static int detect_memory_88(void) { - u16 ax; - u8 err; + struct biosregs ireg, oreg; - ax = 0x8800; - asm("stc; int $0x15; setc %0" : "=bcdm" (err), "+a" (ax)); + initregs(&ireg); + ireg.ah = 0x88; + intcall(0x15, &ireg, &oreg); - boot_params.screen_info.ext_mem_k = ax; + boot_params.screen_info.ext_mem_k = oreg.ax; - return -err; + return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */ } int detect_memory(void) diff --git a/arch/x86/boot/regs.c b/arch/x86/boot/regs.c new file mode 100644 index 00000000000..958019b1cfa --- /dev/null +++ b/arch/x86/boot/regs.c @@ -0,0 +1,29 @@ +/* ----------------------------------------------------------------------- + * + * Copyright 2009 Intel Corporation; author H. Peter Anvin + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * Simple helper function for initializing a register set. + * + * Note that this sets EFLAGS_CF in the input register set; this + * makes it easier to catch functions which do nothing but don't + * explicitly set CF. + */ + +#include "boot.h" + +void initregs(struct biosregs *reg) +{ + memset(reg, 0, sizeof *reg); + reg->eflags |= X86_EFLAGS_CF; + reg->ds = ds(); + reg->es = ds(); + reg->fs = fs(); + reg->gs = gs(); +} diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld index bb8dc2de796..0f6ec455a2b 100644 --- a/arch/x86/boot/setup.ld +++ b/arch/x86/boot/setup.ld @@ -15,8 +15,11 @@ SECTIONS . = 497; .header : { *(.header) } + .entrytext : { *(.entrytext) } .inittext : { *(.inittext) } .initdata : { *(.initdata) } + __end_init = .; + .text : { *(.text) } .text32 : { *(.text32) } @@ -52,4 +55,7 @@ SECTIONS . = ASSERT(_end <= 0x8000, "Setup too big!"); . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!"); + /* Necessary for the very-old-loader check to work... */ + . = ASSERT(__end_init <= 5*512, "init sections too big!"); + } diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c index 7e8e8b25f5f..01ec69c901c 100644 --- a/arch/x86/boot/tty.c +++ b/arch/x86/boot/tty.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -22,24 +23,23 @@ void __attribute__((section(".inittext"))) putchar(int ch) { - unsigned char c = ch; + struct biosregs ireg; - if (c == '\n') + if (ch == '\n') putchar('\r'); /* \n -> \r\n */ - /* int $0x10 is known to have bugs involving touching registers - it shouldn't. Be extra conservative... */ - asm volatile("pushal; pushw %%ds; int $0x10; popw %%ds; popal" - : : "b" (0x0007), "c" (0x0001), "a" (0x0e00|ch)); + initregs(&ireg); + ireg.bx = 0x0007; + ireg.cx = 0x0001; + ireg.ah = 0x0e; + ireg.al = ch; + intcall(0x10, &ireg, NULL); } void __attribute__((section(".inittext"))) puts(const char *str) { - int n = 0; - while (*str) { + while (*str) putchar(*str++); - n++; - } } /* @@ -49,14 +49,13 @@ void __attribute__((section(".inittext"))) puts(const char *str) static u8 gettime(void) { - u16 ax = 0x0200; - u16 cx, dx; + struct biosregs ireg, oreg; - asm volatile("int $0x1a" - : "+a" (ax), "=c" (cx), "=d" (dx) - : : "ebx", "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x02; + intcall(0x1a, &ireg, &oreg); - return dx >> 8; + return oreg.dh; } /* @@ -64,19 +63,24 @@ static u8 gettime(void) */ int getchar(void) { - u16 ax = 0; - asm volatile("int $0x16" : "+a" (ax)); + struct biosregs ireg, oreg; + + initregs(&ireg); + /* ireg.ah = 0x00; */ + intcall(0x16, &ireg, &oreg); - return ax & 0xff; + return oreg.al; } static int kbd_pending(void) { - u8 pending; - asm volatile("int $0x16; setnz %0" - : "=qm" (pending) - : "a" (0x0100)); - return pending; + struct biosregs ireg, oreg; + + initregs(&ireg); + ireg.ah = 0x01; + intcall(0x16, &ireg, &oreg); + + return !(oreg.eflags & X86_EFLAGS_ZF); } void kbd_flush(void) diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c index 3fa979c9c36..49e0c18833e 100644 --- a/arch/x86/boot/video-bios.c +++ b/arch/x86/boot/video-bios.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -29,21 +30,20 @@ static int bios_set_mode(struct mode_info *mi) static int set_bios_mode(u8 mode) { - u16 ax; + struct biosregs ireg, oreg; u8 new_mode; - ax = mode; /* AH=0x00 Set Video Mode */ - asm volatile(INT10 - : "+a" (ax) - : : "ebx", "ecx", "edx", "esi", "edi"); + initregs(&ireg); + ireg.al = mode; /* AH=0x00 Set Video Mode */ + intcall(0x10, &ireg, NULL); - ax = 0x0f00; /* Get Current Video Mode */ - asm volatile(INT10 - : "+a" (ax) - : : "ebx", "ecx", "edx", "esi", "edi"); + ireg.ah = 0x0f; /* Get Current Video Mode */ + intcall(0x10, &ireg, &oreg); do_restore = 1; /* Assume video contents were lost */ - new_mode = ax & 0x7f; /* Not all BIOSes are clean with the top bit */ + + /* Not all BIOSes are clean with the top bit */ + new_mode = oreg.al & 0x7f; if (new_mode == mode) return 0; /* Mode change OK */ @@ -53,10 +53,8 @@ static int set_bios_mode(u8 mode) /* Mode setting failed, but we didn't end up where we started. That's bad. Try to revert to the original video mode. */ - ax = boot_params.screen_info.orig_video_mode; - asm volatile(INT10 - : "+a" (ax) - : : "ebx", "ecx", "edx", "esi", "edi"); + ireg.ax = boot_params.screen_info.orig_video_mode; + intcall(0x10, &ireg, NULL); } #endif return -1; diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index 4a58c8ce3f6..275dd177f19 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -31,7 +32,7 @@ static inline void vesa_store_mode_params_graphics(void) {} static int vesa_probe(void) { #if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID) - u16 ax, cx, di; + struct biosregs ireg, oreg; u16 mode; addr_t mode_ptr; struct mode_info *mi; @@ -39,13 +40,12 @@ static int vesa_probe(void) video_vesa.modes = GET_HEAP(struct mode_info, 0); - ax = 0x4f00; - di = (size_t)&vginfo; - asm(INT10 - : "+a" (ax), "+D" (di), "=m" (vginfo) - : : "ebx", "ecx", "edx", "esi"); + initregs(&ireg); + ireg.ax = 0x4f00; + ireg.di = (size_t)&vginfo; + intcall(0x10, &ireg, &oreg); - if (ax != 0x004f || + if (oreg.ax != 0x004f || vginfo.signature != VESA_MAGIC || vginfo.version < 0x0102) return 0; /* Not present */ @@ -65,14 +65,12 @@ static int vesa_probe(void) memset(&vminfo, 0, sizeof vminfo); /* Just in case... */ - ax = 0x4f01; - cx = mode; - di = (size_t)&vminfo; - asm(INT10 - : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo) - : : "ebx", "edx", "esi"); + ireg.ax = 0x4f01; + ireg.cx = mode; + ireg.di = (size_t)&vminfo; + intcall(0x10, &ireg, &oreg); - if (ax != 0x004f) + if (oreg.ax != 0x004f) continue; if ((vminfo.mode_attr & 0x15) == 0x05) { @@ -111,20 +109,19 @@ static int vesa_probe(void) static int vesa_set_mode(struct mode_info *mode) { - u16 ax, bx, cx, di; + struct biosregs ireg, oreg; int is_graphic; u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA; memset(&vminfo, 0, sizeof vminfo); /* Just in case... */ - ax = 0x4f01; - cx = vesa_mode; - di = (size_t)&vminfo; - asm(INT10 - : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo) - : : "ebx", "edx", "esi"); + initregs(&ireg); + ireg.ax = 0x4f01; + ireg.cx = vesa_mode; + ireg.di = (size_t)&vminfo; + intcall(0x10, &ireg, &oreg); - if (ax != 0x004f) + if (oreg.ax != 0x004f) return -1; if ((vminfo.mode_attr & 0x15) == 0x05) { @@ -141,14 +138,12 @@ static int vesa_set_mode(struct mode_info *mode) } - ax = 0x4f02; - bx = vesa_mode; - di = 0; - asm volatile(INT10 - : "+a" (ax), "+b" (bx), "+D" (di) - : : "ecx", "edx", "esi"); + initregs(&ireg); + ireg.ax = 0x4f02; + ireg.bx = vesa_mode; + intcall(0x10, &ireg, &oreg); - if (ax != 0x004f) + if (oreg.ax != 0x004f) return -1; graphic_mode = is_graphic; @@ -171,50 +166,45 @@ static int vesa_set_mode(struct mode_info *mode) /* Switch DAC to 8-bit mode */ static void vesa_dac_set_8bits(void) { + struct biosregs ireg, oreg; u8 dac_size = 6; /* If possible, switch the DAC to 8-bit mode */ if (vginfo.capabilities & 1) { - u16 ax, bx; - - ax = 0x4f08; - bx = 0x0800; - asm volatile(INT10 - : "+a" (ax), "+b" (bx) - : : "ecx", "edx", "esi", "edi"); - - if (ax == 0x004f) - dac_size = bx >> 8; + initregs(&ireg); + ireg.ax = 0x4f08; + ireg.bh = 0x08; + intcall(0x10, &ireg, &oreg); + if (oreg.ax == 0x004f) + dac_size = oreg.bh; } /* Set the color sizes to the DAC size, and offsets to 0 */ - boot_params.screen_info.red_size = dac_size; + boot_params.screen_info.red_size = dac_size; boot_params.screen_info.green_size = dac_size; - boot_params.screen_info.blue_size = dac_size; - boot_params.screen_info.rsvd_size = dac_size; + boot_params.screen_info.blue_size = dac_size; + boot_params.screen_info.rsvd_size = dac_size; - boot_params.screen_info.red_pos = 0; - boot_params.screen_info.green_pos = 0; - boot_params.screen_info.blue_pos = 0; - boot_params.screen_info.rsvd_pos = 0; + boot_params.screen_info.red_pos = 0; + boot_params.screen_info.green_pos = 0; + boot_params.screen_info.blue_pos = 0; + boot_params.screen_info.rsvd_pos = 0; } /* Save the VESA protected mode info */ static void vesa_store_pm_info(void) { - u16 ax, bx, di, es; + struct biosregs ireg, oreg; - ax = 0x4f0a; - bx = di = 0; - asm("pushw %%es; "INT10"; movw %%es,%0; popw %%es" - : "=d" (es), "+a" (ax), "+b" (bx), "+D" (di) - : : "ecx", "esi"); + initregs(&ireg); + ireg.ax = 0x4f0a; + intcall(0x10, &ireg, &oreg); - if (ax != 0x004f) + if (oreg.ax != 0x004f) return; - boot_params.screen_info.vesapm_seg = es; - boot_params.screen_info.vesapm_off = di; + boot_params.screen_info.vesapm_seg = oreg.es; + boot_params.screen_info.vesapm_off = oreg.di; } /* @@ -252,7 +242,7 @@ static void vesa_store_mode_params_graphics(void) void vesa_store_edid(void) { #ifdef CONFIG_FIRMWARE_EDID - u16 ax, bx, cx, dx, di; + struct biosregs ireg, oreg; /* Apparently used as a nonsense token... */ memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info); @@ -260,33 +250,26 @@ void vesa_store_edid(void) if (vginfo.version < 0x0200) return; /* EDID requires VBE 2.0+ */ - ax = 0x4f15; /* VBE DDC */ - bx = 0x0000; /* Report DDC capabilities */ - cx = 0; /* Controller 0 */ - di = 0; /* ES:DI must be 0 by spec */ - - /* Note: The VBE DDC spec is different from the main VESA spec; - we genuinely have to assume all registers are destroyed here. */ - - asm("pushw %%es; movw %2,%%es; "INT10"; popw %%es" - : "+a" (ax), "+b" (bx), "+c" (cx), "+D" (di) - : : "esi", "edx"); + initregs(&ireg); + ireg.ax = 0x4f15; /* VBE DDC */ + /* ireg.bx = 0x0000; */ /* Report DDC capabilities */ + /* ireg.cx = 0; */ /* Controller 0 */ + ireg.es = 0; /* ES:DI must be 0 by spec */ + intcall(0x10, &ireg, &oreg); - if (ax != 0x004f) + if (oreg.ax != 0x004f) return; /* No EDID */ /* BH = time in seconds to transfer EDD information */ /* BL = DDC level supported */ - ax = 0x4f15; /* VBE DDC */ - bx = 0x0001; /* Read EDID */ - cx = 0; /* Controller 0 */ - dx = 0; /* EDID block number */ - di =(size_t) &boot_params.edid_info; /* (ES:)Pointer to block */ - asm(INT10 - : "+a" (ax), "+b" (bx), "+d" (dx), "=m" (boot_params.edid_info), - "+c" (cx), "+D" (di) - : : "esi"); + ireg.ax = 0x4f15; /* VBE DDC */ + ireg.bx = 0x0001; /* Read EDID */ + /* ireg.cx = 0; */ /* Controller 0 */ + /* ireg.dx = 0; */ /* EDID block number */ + ireg.es = ds(); + ireg.di =(size_t)&boot_params.edid_info; /* (ES:)Pointer to block */ + intcall(0x10, &ireg, &oreg); #endif /* CONFIG_FIRMWARE_EDID */ } diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c index 9e0587a3776..8f8d827e254 100644 --- a/arch/x86/boot/video-vga.c +++ b/arch/x86/boot/video-vga.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -39,30 +40,30 @@ static __videocard video_vga; /* Set basic 80x25 mode */ static u8 vga_set_basic_mode(void) { + struct biosregs ireg, oreg; u16 ax; u8 rows; u8 mode; + initregs(&ireg); + #ifdef CONFIG_VIDEO_400_HACK if (adapter >= ADAPTER_VGA) { - asm volatile(INT10 - : : "a" (0x1202), "b" (0x0030) - : "ecx", "edx", "esi", "edi"); + ireg.ax = 0x1202; + ireg.bx = 0x0030; + intcall(0x10, &ireg, NULL); } #endif ax = 0x0f00; - asm volatile(INT10 - : "+a" (ax) - : : "ebx", "ecx", "edx", "esi", "edi"); - - mode = (u8)ax; + intcall(0x10, &ireg, &oreg); + mode = oreg.al; set_fs(0); rows = rdfs8(0x484); /* rows minus one */ #ifndef CONFIG_VIDEO_400_HACK - if ((ax == 0x5003 || ax == 0x5007) && + if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) && (rows == 0 || rows == 24)) return mode; #endif @@ -71,10 +72,8 @@ static u8 vga_set_basic_mode(void) mode = 3; /* Set the mode */ - ax = mode; - asm volatile(INT10 - : "+a" (ax) - : : "ebx", "ecx", "edx", "esi", "edi"); + ireg.ax = mode; /* AH=0: set mode */ + intcall(0x10, &ireg, NULL); do_restore = 1; return mode; } @@ -82,43 +81,69 @@ static u8 vga_set_basic_mode(void) static void vga_set_8font(void) { /* Set 8x8 font - 80x43 on EGA, 80x50 on VGA */ + struct biosregs ireg; + + initregs(&ireg); /* Set 8x8 font */ - asm volatile(INT10 : : "a" (0x1112), "b" (0)); + ireg.ax = 0x1112; + /* ireg.bl = 0; */ + intcall(0x10, &ireg, NULL); /* Use alternate print screen */ - asm volatile(INT10 : : "a" (0x1200), "b" (0x20)); + ireg.ax = 0x1200; + ireg.bl = 0x20; + intcall(0x10, &ireg, NULL); /* Turn off cursor emulation */ - asm volatile(INT10 : : "a" (0x1201), "b" (0x34)); + ireg.ax = 0x1201; + ireg.bl = 0x34; + intcall(0x10, &ireg, NULL); /* Cursor is scan lines 6-7 */ - asm volatile(INT10 : : "a" (0x0100), "c" (0x0607)); + ireg.ax = 0x0100; + ireg.cx = 0x0607; + intcall(0x10, &ireg, NULL); } static void vga_set_14font(void) { /* Set 9x14 font - 80x28 on VGA */ + struct biosregs ireg; + + initregs(&ireg); /* Set 9x14 font */ - asm volatile(INT10 : : "a" (0x1111), "b" (0)); + ireg.ax = 0x1111; + /* ireg.bl = 0; */ + intcall(0x10, &ireg, NULL); /* Turn off cursor emulation */ - asm volatile(INT10 : : "a" (0x1201), "b" (0x34)); + ireg.ax = 0x1201; + ireg.bl = 0x34; + intcall(0x10, &ireg, NULL); /* Cursor is scan lines 11-12 */ - asm volatile(INT10 : : "a" (0x0100), "c" (0x0b0c)); + ireg.ax = 0x0100; + ireg.cx = 0x0b0c; + intcall(0x10, &ireg, NULL); } static void vga_set_80x43(void) { /* Set 80x43 mode on VGA (not EGA) */ + struct biosregs ireg; + + initregs(&ireg); /* Set 350 scans */ - asm volatile(INT10 : : "a" (0x1201), "b" (0x30)); + ireg.ax = 0x1201; + ireg.bl = 0x30; + intcall(0x10, &ireg, NULL); /* Reset video mode */ - asm volatile(INT10 : : "a" (0x0003)); + ireg.ax = 0x0003; + intcall(0x10, &ireg, NULL); vga_set_8font(); } @@ -225,8 +250,6 @@ static int vga_set_mode(struct mode_info *mode) */ static int vga_probe(void) { - u16 ega_bx; - static const char *card_name[] = { "CGA/MDA/HGC", "EGA", "VGA" }; @@ -240,26 +263,26 @@ static int vga_probe(void) sizeof(ega_modes)/sizeof(struct mode_info), sizeof(vga_modes)/sizeof(struct mode_info), }; - u8 vga_flag; - asm(INT10 - : "=b" (ega_bx) - : "a" (0x1200), "b" (0x10) /* Check EGA/VGA */ - : "ecx", "edx", "esi", "edi"); + struct biosregs ireg, oreg; + + initregs(&ireg); + + ireg.ax = 0x1200; + ireg.bl = 0x10; /* Check EGA/VGA */ + intcall(0x10, &ireg, &oreg); #ifndef _WAKEUP - boot_params.screen_info.orig_video_ega_bx = ega_bx; + boot_params.screen_info.orig_video_ega_bx = oreg.bx; #endif /* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */ - if ((u8)ega_bx != 0x10) { + if (oreg.bl != 0x10) { /* EGA/VGA */ - asm(INT10 - : "=a" (vga_flag) - : "a" (0x1a00) - : "ebx", "ecx", "edx", "esi", "edi"); + ireg.ax = 0x1a00; + intcall(0x10, &ireg, &oreg); - if (vga_flag == 0x1a) { + if (oreg.al == 0x1a) { adapter = ADAPTER_VGA; #ifndef _WAKEUP boot_params.screen_info.orig_video_isVGA = 1; diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index 3bef2c1febe..bad728b76fc 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -18,33 +19,29 @@ static void store_cursor_position(void) { - u16 curpos; - u16 ax, bx; + struct biosregs ireg, oreg; - ax = 0x0300; - bx = 0; - asm(INT10 - : "=d" (curpos), "+a" (ax), "+b" (bx) - : : "ecx", "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x03; + intcall(0x10, &ireg, &oreg); - boot_params.screen_info.orig_x = curpos; - boot_params.screen_info.orig_y = curpos >> 8; + boot_params.screen_info.orig_x = oreg.dl; + boot_params.screen_info.orig_y = oreg.dh; } static void store_video_mode(void) { - u16 ax, page; + struct biosregs ireg, oreg; /* N.B.: the saving of the video page here is a bit silly, since we pretty much assume page 0 everywhere. */ - ax = 0x0f00; - asm(INT10 - : "+a" (ax), "=b" (page) - : : "ecx", "edx", "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x0f; + intcall(0x10, &ireg, &oreg); /* Not all BIOSes are clean with respect to the top bit */ - boot_params.screen_info.orig_video_mode = ax & 0x7f; - boot_params.screen_info.orig_video_page = page >> 8; + boot_params.screen_info.orig_video_mode = oreg.al & 0x7f; + boot_params.screen_info.orig_video_page = oreg.bh; } /* @@ -257,7 +254,7 @@ static void restore_screen(void) int y; addr_t dst = 0; u16 *src = saved.data; - u16 ax, bx, dx; + struct biosregs ireg; if (graphic_mode) return; /* Can't restore onto a graphic mode */ @@ -296,12 +293,11 @@ static void restore_screen(void) } /* Restore cursor position */ - ax = 0x0200; /* Set cursor position */ - bx = 0; /* Page number (<< 8) */ - dx = (saved.cury << 8)+saved.curx; - asm volatile(INT10 - : "+a" (ax), "+b" (bx), "+d" (dx) - : : "ecx", "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x02; /* Set cursor position */ + ireg.dh = saved.cury; + ireg.dl = saved.curx; + intcall(0x10, &ireg, NULL); } #else #define save_screen() ((void)0) diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h index ee63f5d1446..5bb174a997f 100644 --- a/arch/x86/boot/video.h +++ b/arch/x86/boot/video.h @@ -112,20 +112,6 @@ extern int force_x, force_y; /* Don't query the BIOS for cols/rows */ extern int do_restore; /* Restore screen contents */ extern int graphic_mode; /* Graphics mode with linear frame buffer */ -/* - * int $0x10 is notorious for touching registers it shouldn't. - * gcc doesn't like %ebp being clobbered, so define it as a push/pop - * sequence here. - * - * A number of systems, including the original PC can clobber %bp in - * certain circumstances, like when scrolling. There exists at least - * one Trident video card which could clobber DS under a set of - * circumstances that we are unlikely to encounter (scrolling when - * using an extended graphics mode of more than 800x600 pixels), but - * it's cheap insurance to deal with that here. - */ -#define INT10 "pushl %%ebp; pushw %%ds; int $0x10; popw %%ds; popl %%ebp" - /* Accessing VGA indexed registers */ static inline u8 in_idx(u16 port, u8 index) { diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 235b81d0f6f..edb992ebef9 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1,12 +1,13 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29-rc4 -# Tue Feb 24 15:50:58 2009 +# Linux kernel version: 2.6.30-rc2 +# Mon May 11 16:21:55 2009 # # CONFIG_64BIT is not set CONFIG_X86_32=y # CONFIG_X86_64 is not set CONFIG_X86=y +CONFIG_OUTPUT_FORMAT="elf32-i386" CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig" CONFIG_GENERIC_TIME=y CONFIG_GENERIC_CMOS_UPDATE=y @@ -33,6 +34,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y CONFIG_ARCH_HAS_DEFAULT_IDLE=y CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y CONFIG_HAVE_SETUP_PER_CPU_AREA=y +CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y # CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set CONFIG_ARCH_HIBERNATION_POSSIBLE=y CONFIG_ARCH_SUSPEND_POSSIBLE=y @@ -40,15 +42,16 @@ CONFIG_ARCH_SUSPEND_POSSIBLE=y CONFIG_ARCH_POPULATES_NODE_MAP=y # CONFIG_AUDIT_ARCH is not set CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y +CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y CONFIG_GENERIC_HARDIRQS=y +CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y CONFIG_GENERIC_IRQ_PROBE=y CONFIG_GENERIC_PENDING_IRQ=y -CONFIG_X86_SMP=y CONFIG_USE_GENERIC_SMP_HELPERS=y CONFIG_X86_32_SMP=y CONFIG_X86_HT=y -CONFIG_X86_BIOS_REBOOT=y CONFIG_X86_TRAMPOLINE=y +CONFIG_X86_32_LAZY_GS=y CONFIG_KTIME_SCALAR=y CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" @@ -60,10 +63,17 @@ CONFIG_LOCK_KERNEL=y CONFIG_INIT_ENV_ARG_LIMIT=32 CONFIG_LOCALVERSION="" # CONFIG_LOCALVERSION_AUTO is not set +CONFIG_HAVE_KERNEL_GZIP=y +CONFIG_HAVE_KERNEL_BZIP2=y +CONFIG_HAVE_KERNEL_LZMA=y +CONFIG_KERNEL_GZIP=y +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y CONFIG_BSD_PROCESS_ACCT=y # CONFIG_BSD_PROCESS_ACCT_V3 is not set CONFIG_TASKSTATS=y @@ -113,23 +123,26 @@ CONFIG_PID_NS=y CONFIG_NET_NS=y CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" +CONFIG_RD_GZIP=y +CONFIG_RD_BZIP2=y +CONFIG_RD_LZMA=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_SYSCTL=y +CONFIG_ANON_INODES=y # CONFIG_EMBEDDED is not set CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y CONFIG_KALLSYMS_EXTRA_PASS=y +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_ELF_CORE=y CONFIG_PCSPKR_PLATFORM=y -# CONFIG_COMPAT_BRK is not set CONFIG_BASE_FULL=y CONFIG_FUTEX=y -CONFIG_ANON_INODES=y CONFIG_EPOLL=y CONFIG_SIGNALFD=y CONFIG_TIMERFD=y @@ -139,6 +152,7 @@ CONFIG_AIO=y CONFIG_VM_EVENT_COUNTERS=y CONFIG_PCI_QUIRKS=y CONFIG_SLUB_DEBUG=y +# CONFIG_COMPAT_BRK is not set # CONFIG_SLAB is not set CONFIG_SLUB=y # CONFIG_SLOB is not set @@ -154,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -167,7 +183,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_STOP_MACHINE=y CONFIG_BLOCK=y # CONFIG_LBD is not set -CONFIG_BLK_DEV_IO_TRACE=y CONFIG_BLK_DEV_BSG=y # CONFIG_BLK_DEV_INTEGRITY is not set @@ -194,12 +209,12 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_GENERIC_CLOCKEVENTS_BUILD=y CONFIG_SMP=y CONFIG_SPARSE_IRQ=y -CONFIG_X86_FIND_SMP_CONFIG=y CONFIG_X86_MPPARSE=y +# CONFIG_X86_BIGSMP is not set +CONFIG_X86_EXTENDED_PLATFORM=y # CONFIG_X86_ELAN is not set -# CONFIG_X86_GENERICARCH is not set -# CONFIG_X86_VSMP is not set # CONFIG_X86_RDC321X is not set +# CONFIG_X86_32_NON_STANDARD is not set CONFIG_SCHED_OMIT_FRAME_POINTER=y # CONFIG_PARAVIRT_GUEST is not set # CONFIG_MEMTEST is not set @@ -230,8 +245,10 @@ CONFIG_M686=y # CONFIG_GENERIC_CPU is not set CONFIG_X86_GENERIC=y CONFIG_X86_CPU=y +CONFIG_X86_L1_CACHE_BYTES=64 +CONFIG_X86_INTERNODE_CACHE_BYTES=64 CONFIG_X86_CMPXCHG=y -CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_X86_L1_CACHE_SHIFT=5 CONFIG_X86_XADD=y # CONFIG_X86_PPRO_FENCE is not set CONFIG_X86_WP_WORKS_OK=y @@ -247,7 +264,7 @@ CONFIG_X86_DEBUGCTLMSR=y CONFIG_CPU_SUP_INTEL=y CONFIG_CPU_SUP_CYRIX_32=y CONFIG_CPU_SUP_AMD=y -CONFIG_CPU_SUP_CENTAUR_32=y +CONFIG_CPU_SUP_CENTAUR=y CONFIG_CPU_SUP_TRANSMETA_32=y CONFIG_CPU_SUP_UMC_32=y CONFIG_X86_DS=y @@ -279,6 +296,7 @@ CONFIG_MICROCODE_AMD=y CONFIG_MICROCODE_OLD_INTERFACE=y CONFIG_X86_MSR=y CONFIG_X86_CPUID=y +# CONFIG_X86_CPU_DEBUG is not set # CONFIG_NOHIGHMEM is not set CONFIG_HIGHMEM4G=y # CONFIG_HIGHMEM64G is not set @@ -302,6 +320,8 @@ CONFIG_ZONE_DMA_FLAG=1 CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y CONFIG_UNEVICTABLE_LRU=y +CONFIG_HAVE_MLOCK=y +CONFIG_HAVE_MLOCKED_PAGE_BIT=y CONFIG_HIGHPTE=y CONFIG_X86_CHECK_BIOS_CORRUPTION=y CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y @@ -312,6 +332,7 @@ CONFIG_MTRR=y CONFIG_X86_PAT=y CONFIG_EFI=y CONFIG_SECCOMP=y +# CONFIG_CC_STACKPROTECTOR is not set # CONFIG_HZ_100 is not set # CONFIG_HZ_250 is not set # CONFIG_HZ_300 is not set @@ -322,8 +343,9 @@ CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y # CONFIG_KEXEC_JUMP is not set CONFIG_PHYSICAL_START=0x1000000 -# CONFIG_RELOCATABLE is not set -CONFIG_PHYSICAL_ALIGN=0x200000 +CONFIG_RELOCATABLE=y +CONFIG_X86_NEED_RELOCS=y +CONFIG_PHYSICAL_ALIGN=0x1000000 CONFIG_HOTPLUG_CPU=y # CONFIG_COMPAT_VDSO is not set # CONFIG_CMDLINE_BOOL is not set @@ -363,7 +385,6 @@ CONFIG_ACPI_THERMAL=y CONFIG_ACPI_BLACKLIST_YEAR=0 # CONFIG_ACPI_DEBUG is not set # CONFIG_ACPI_PCI_SLOT is not set -CONFIG_ACPI_SYSTEM=y CONFIG_X86_PM_TIMER=y CONFIG_ACPI_CONTAINER=y # CONFIG_ACPI_SBS is not set @@ -425,6 +446,7 @@ CONFIG_PCI_BIOS=y CONFIG_PCI_DIRECT=y CONFIG_PCI_MMCONFIG=y CONFIG_PCI_DOMAINS=y +# CONFIG_DMAR is not set CONFIG_PCIEPORTBUS=y # CONFIG_HOTPLUG_PCI_PCIE is not set CONFIG_PCIEAER=y @@ -435,6 +457,7 @@ CONFIG_PCI_MSI=y # CONFIG_PCI_DEBUG is not set # CONFIG_PCI_STUB is not set CONFIG_HT_IRQ=y +# CONFIG_PCI_IOV is not set CONFIG_ISA_DMA_API=y # CONFIG_ISA is not set # CONFIG_MCA is not set @@ -481,7 +504,6 @@ CONFIG_NET=y # # Networking options # -CONFIG_COMPAT_NET_DEV_OPS=y CONFIG_PACKET=y CONFIG_PACKET_MMAP=y CONFIG_UNIX=y @@ -639,6 +661,7 @@ CONFIG_LLC=y # CONFIG_LAPB is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set +# CONFIG_PHONET is not set CONFIG_NET_SCHED=y # @@ -696,6 +719,7 @@ CONFIG_NET_SCH_FIFO=y # # CONFIG_NET_PKTGEN is not set # CONFIG_NET_TCPPROBE is not set +# CONFIG_NET_DROP_MONITOR is not set CONFIG_HAMRADIO=y # @@ -706,12 +730,10 @@ CONFIG_HAMRADIO=y # CONFIG_IRDA is not set # CONFIG_BT is not set # CONFIG_AF_RXRPC is not set -# CONFIG_PHONET is not set CONFIG_FIB_RULES=y CONFIG_WIRELESS=y CONFIG_CFG80211=y # CONFIG_CFG80211_REG_DEBUG is not set -CONFIG_NL80211=y CONFIG_WIRELESS_OLD_REGULATORY=y CONFIG_WIRELESS_EXT=y CONFIG_WIRELESS_EXT_SYSFS=y @@ -789,6 +811,7 @@ CONFIG_MISC_DEVICES=y # CONFIG_ICS932S401 is not set # CONFIG_ENCLOSURE_SERVICES is not set # CONFIG_HP_ILO is not set +# CONFIG_ISL29003 is not set # CONFIG_C2PORT is not set # @@ -842,6 +865,7 @@ CONFIG_SCSI_SPI_ATTRS=y # CONFIG_SCSI_LOWLEVEL is not set # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set # CONFIG_SCSI_DH is not set +# CONFIG_SCSI_OSD_INITIATOR is not set CONFIG_ATA=y # CONFIG_ATA_NONSTANDARD is not set CONFIG_ATA_ACPI=y @@ -940,6 +964,7 @@ CONFIG_DM_ZERO=y CONFIG_MACINTOSH_DRIVERS=y CONFIG_MAC_EMUMOUSEBTN=y CONFIG_NETDEVICES=y +CONFIG_COMPAT_NET_DEV_OPS=y # CONFIG_IFB is not set # CONFIG_DUMMY is not set # CONFIG_BONDING is not set @@ -977,6 +1002,8 @@ CONFIG_MII=y CONFIG_NET_VENDOR_3COM=y # CONFIG_VORTEX is not set # CONFIG_TYPHOON is not set +# CONFIG_ETHOC is not set +# CONFIG_DNET is not set CONFIG_NET_TULIP=y # CONFIG_DE2104X is not set # CONFIG_TULIP is not set @@ -1026,6 +1053,7 @@ CONFIG_E1000=y CONFIG_E1000E=y # CONFIG_IP1000 is not set # CONFIG_IGB is not set +# CONFIG_IGBVF is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set @@ -1040,6 +1068,7 @@ CONFIG_BNX2=y # CONFIG_QLA3XXX is not set # CONFIG_ATL1 is not set # CONFIG_ATL1E is not set +# CONFIG_ATL1C is not set # CONFIG_JME is not set CONFIG_NETDEV_10000=y # CONFIG_CHELSIO_T1 is not set @@ -1049,6 +1078,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_IXGBE is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set +# CONFIG_VXGE is not set # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_NIU is not set @@ -1058,6 +1088,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_BNX2X is not set # CONFIG_QLGE is not set # CONFIG_SFC is not set +# CONFIG_BE2NET is not set CONFIG_TR=y # CONFIG_IBMOL is not set # CONFIG_IBMLS is not set @@ -1073,8 +1104,8 @@ CONFIG_WLAN_80211=y # CONFIG_LIBERTAS is not set # CONFIG_LIBERTAS_THINFIRM is not set # CONFIG_AIRO is not set -# CONFIG_HERMES is not set # CONFIG_ATMEL is not set +# CONFIG_AT76C50X_USB is not set # CONFIG_AIRO_CS is not set # CONFIG_PCMCIA_WL3501 is not set # CONFIG_PRISM54 is not set @@ -1084,21 +1115,21 @@ CONFIG_WLAN_80211=y # CONFIG_RTL8187 is not set # CONFIG_ADM8211 is not set # CONFIG_MAC80211_HWSIM is not set +# CONFIG_MWL8K is not set # CONFIG_P54_COMMON is not set CONFIG_ATH5K=y # CONFIG_ATH5K_DEBUG is not set # CONFIG_ATH9K is not set +# CONFIG_AR9170_USB is not set # CONFIG_IPW2100 is not set # CONFIG_IPW2200 is not set -# CONFIG_IWLCORE is not set -# CONFIG_IWLWIFI_LEDS is not set -# CONFIG_IWLAGN is not set -# CONFIG_IWL3945 is not set +# CONFIG_IWLWIFI is not set # CONFIG_HOSTAP is not set # CONFIG_B43 is not set # CONFIG_B43LEGACY is not set # CONFIG_ZD1211RW is not set # CONFIG_RT2X00 is not set +# CONFIG_HERMES is not set # # Enable WiMAX (Networking options) to see the WiMAX drivers @@ -1209,6 +1240,8 @@ CONFIG_INPUT_TABLET=y # CONFIG_TABLET_USB_KBTAB is not set # CONFIG_TABLET_USB_WACOM is not set CONFIG_INPUT_TOUCHSCREEN=y +# CONFIG_TOUCHSCREEN_AD7879_I2C is not set +# CONFIG_TOUCHSCREEN_AD7879 is not set # CONFIG_TOUCHSCREEN_FUJITSU is not set # CONFIG_TOUCHSCREEN_GUNZE is not set # CONFIG_TOUCHSCREEN_ELO is not set @@ -1303,6 +1336,7 @@ CONFIG_UNIX98_PTYS=y # CONFIG_LEGACY_PTYS is not set # CONFIG_IPMI_HANDLER is not set CONFIG_HW_RANDOM=y +# CONFIG_HW_RANDOM_TIMERIOMEM is not set CONFIG_HW_RANDOM_INTEL=y CONFIG_HW_RANDOM_AMD=y CONFIG_HW_RANDOM_GEODE=y @@ -1390,7 +1424,6 @@ CONFIG_I2C_I801=y # CONFIG_SENSORS_PCF8574 is not set # CONFIG_PCF8575 is not set # CONFIG_SENSORS_PCA9539 is not set -# CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_MAX6875 is not set # CONFIG_SENSORS_TSL2550 is not set # CONFIG_I2C_DEBUG_CORE is not set @@ -1424,6 +1457,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_ADT7475 is not set # CONFIG_SENSORS_K8TEMP is not set # CONFIG_SENSORS_ASB100 is not set +# CONFIG_SENSORS_ATK0110 is not set # CONFIG_SENSORS_ATXP1 is not set # CONFIG_SENSORS_DS1621 is not set # CONFIG_SENSORS_I5K_AMB is not set @@ -1433,6 +1467,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_FSCHER is not set # CONFIG_SENSORS_FSCPOS is not set # CONFIG_SENSORS_FSCHMD is not set +# CONFIG_SENSORS_G760A is not set # CONFIG_SENSORS_GL518SM is not set # CONFIG_SENSORS_GL520SM is not set # CONFIG_SENSORS_CORETEMP is not set @@ -1448,11 +1483,14 @@ CONFIG_HWMON=y # CONFIG_SENSORS_LM90 is not set # CONFIG_SENSORS_LM92 is not set # CONFIG_SENSORS_LM93 is not set +# CONFIG_SENSORS_LTC4215 is not set # CONFIG_SENSORS_LTC4245 is not set +# CONFIG_SENSORS_LM95241 is not set # CONFIG_SENSORS_MAX1619 is not set # CONFIG_SENSORS_MAX6650 is not set # CONFIG_SENSORS_PC87360 is not set # CONFIG_SENSORS_PC87427 is not set +# CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_SIS5595 is not set # CONFIG_SENSORS_DME1737 is not set # CONFIG_SENSORS_SMSC47M1 is not set @@ -1643,7 +1681,6 @@ CONFIG_FB_EFI=y # CONFIG_FB_3DFX is not set # CONFIG_FB_VOODOO1 is not set # CONFIG_FB_VT8623 is not set -# CONFIG_FB_CYBLA is not set # CONFIG_FB_TRIDENT is not set # CONFIG_FB_ARK is not set # CONFIG_FB_PM3 is not set @@ -1652,6 +1689,7 @@ CONFIG_FB_EFI=y # CONFIG_FB_VIRTUAL is not set # CONFIG_FB_METRONOME is not set # CONFIG_FB_MB862XX is not set +# CONFIG_FB_BROADSHEET is not set CONFIG_BACKLIGHT_LCD_SUPPORT=y # CONFIG_LCD_CLASS_DEVICE is not set CONFIG_BACKLIGHT_CLASS_DEVICE=y @@ -1738,6 +1776,8 @@ CONFIG_SND_PCI=y # CONFIG_SND_INDIGO is not set # CONFIG_SND_INDIGOIO is not set # CONFIG_SND_INDIGODJ is not set +# CONFIG_SND_INDIGOIOX is not set +# CONFIG_SND_INDIGODJX is not set # CONFIG_SND_EMU10K1 is not set # CONFIG_SND_EMU10K1X is not set # CONFIG_SND_ENS1370 is not set @@ -1811,15 +1851,17 @@ CONFIG_USB_HIDDEV=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_HID_A4TECH=y CONFIG_HID_APPLE=y CONFIG_HID_BELKIN=y CONFIG_HID_CHERRY=y CONFIG_HID_CHICONY=y CONFIG_HID_CYPRESS=y +# CONFIG_DRAGONRISE_FF is not set CONFIG_HID_EZKEY=y +CONFIG_HID_KYE=y CONFIG_HID_GYRATION=y +CONFIG_HID_KENSINGTON=y CONFIG_HID_LOGITECH=y CONFIG_LOGITECH_FF=y # CONFIG_LOGIRUMBLEPAD2_FF is not set @@ -1885,11 +1927,11 @@ CONFIG_USB_PRINTER=y # CONFIG_USB_TMC is not set # -# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed; +# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may # # -# see USB_STORAGE Help for more information +# also be needed; see USB_STORAGE Help for more info # CONFIG_USB_STORAGE=y # CONFIG_USB_STORAGE_DEBUG is not set @@ -1931,7 +1973,6 @@ CONFIG_USB_LIBUSUAL=y # CONFIG_USB_LED is not set # CONFIG_USB_CYPRESS_CY7C63 is not set # CONFIG_USB_CYTHERM is not set -# CONFIG_USB_PHIDGET is not set # CONFIG_USB_IDMOUSE is not set # CONFIG_USB_FTDI_ELAN is not set # CONFIG_USB_APPLEDISPLAY is not set @@ -1947,6 +1988,7 @@ CONFIG_USB_LIBUSUAL=y # # OTG and related infrastructure # +# CONFIG_NOP_USB_XCEIV is not set # CONFIG_UWB is not set # CONFIG_MMC is not set # CONFIG_MEMSTICK is not set @@ -1958,8 +2000,10 @@ CONFIG_LEDS_CLASS=y # # CONFIG_LEDS_ALIX2 is not set # CONFIG_LEDS_PCA9532 is not set +# CONFIG_LEDS_LP5521 is not set # CONFIG_LEDS_CLEVO_MAIL is not set # CONFIG_LEDS_PCA955X is not set +# CONFIG_LEDS_BD2802 is not set # # LED Triggers @@ -1969,6 +2013,10 @@ CONFIG_LEDS_TRIGGERS=y # CONFIG_LEDS_TRIGGER_HEARTBEAT is not set # CONFIG_LEDS_TRIGGER_BACKLIGHT is not set # CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set + +# +# iptables trigger is under Netfilter config (LED target) +# # CONFIG_ACCESSIBILITY is not set # CONFIG_INFINIBAND is not set CONFIG_EDAC=y @@ -2037,6 +2085,7 @@ CONFIG_DMADEVICES=y # DMA Devices # # CONFIG_INTEL_IOATDMA is not set +# CONFIG_AUXDISPLAY is not set # CONFIG_UIO is not set # CONFIG_STAGING is not set CONFIG_X86_PLATFORM_DEVICES=y @@ -2071,6 +2120,7 @@ CONFIG_DMIID=y # # CONFIG_EXT2_FS is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y CONFIG_EXT3_FS_POSIX_ACL=y CONFIG_EXT3_FS_SECURITY=y @@ -2101,6 +2151,11 @@ CONFIG_AUTOFS4_FS=y CONFIG_GENERIC_ACL=y # +# Caches +# +# CONFIG_FSCACHE is not set + +# # CD-ROM/DVD Filesystems # CONFIG_ISO9660_FS=y @@ -2151,6 +2206,7 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -2164,7 +2220,6 @@ CONFIG_NFS_ACL_SUPPORT=y CONFIG_NFS_COMMON=y CONFIG_SUNRPC=y CONFIG_SUNRPC_GSS=y -# CONFIG_SUNRPC_REGISTER_V4 is not set CONFIG_RPCSEC_GSS_KRB5=y # CONFIG_RPCSEC_GSS_SPKM3 is not set # CONFIG_SMB_FS is not set @@ -2251,6 +2306,7 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set # CONFIG_DETECT_SOFTLOCKUP is not set +# CONFIG_DETECT_HUNG_TASK is not set # CONFIG_SCHED_DEBUG is not set CONFIG_SCHEDSTATS=y CONFIG_TIMER_STATS=y @@ -2266,6 +2322,7 @@ CONFIG_TIMER_STATS=y # CONFIG_LOCK_STAT is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +CONFIG_STACKTRACE=y # CONFIG_DEBUG_KOBJECT is not set # CONFIG_DEBUG_HIGHMEM is not set CONFIG_DEBUG_BUGVERBOSE=y @@ -2289,13 +2346,19 @@ CONFIG_FRAME_POINTER=y # CONFIG_FAULT_INJECTION is not set # CONFIG_LATENCYTOP is not set CONFIG_SYSCTL_SYSCALL_CHECK=y +# CONFIG_DEBUG_PAGEALLOC is not set CONFIG_USER_STACKTRACE_SUPPORT=y +CONFIG_NOP_TRACER=y CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_HAVE_HW_BRANCH_TRACER=y +CONFIG_HAVE_FTRACE_SYSCALLS=y +CONFIG_RING_BUFFER=y +CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y # # Tracers @@ -2305,13 +2368,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y # CONFIG_SYSPROF_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_FTRACE_SYSCALLS is not set # CONFIG_BOOT_TRACER is not set # CONFIG_TRACE_BRANCH_PROFILING is not set # CONFIG_POWER_TRACER is not set # CONFIG_STACK_TRACER is not set # CONFIG_HW_BRANCH_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +CONFIG_BLK_DEV_IO_TRACE=y +# CONFIG_FTRACE_STARTUP_TEST is not set +# CONFIG_MMIOTRACE is not set CONFIG_PROVIDE_OHCI1394_DMA_INIT=y -# CONFIG_DYNAMIC_PRINTK_DEBUG is not set +# CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set @@ -2321,7 +2392,6 @@ CONFIG_EARLY_PRINTK=y CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_DEBUG_STACK_USAGE=y -# CONFIG_DEBUG_PAGEALLOC is not set # CONFIG_DEBUG_PER_CPU_MAPS is not set # CONFIG_X86_PTDUMP is not set CONFIG_DEBUG_RODATA=y @@ -2329,7 +2399,7 @@ CONFIG_DEBUG_RODATA=y CONFIG_DEBUG_NX_TEST=m # CONFIG_4KSTACKS is not set CONFIG_DOUBLEFAULT=y -# CONFIG_MMIOTRACE is not set +CONFIG_HAVE_MMIOTRACE_SUPPORT=y CONFIG_IO_DELAY_TYPE_0X80=0 CONFIG_IO_DELAY_TYPE_0XED=1 CONFIG_IO_DELAY_TYPE_UDELAY=2 @@ -2365,6 +2435,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 # CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set # CONFIG_SECURITY_SMACK is not set +# CONFIG_SECURITY_TOMOYO is not set +# CONFIG_IMA is not set CONFIG_CRYPTO=y # @@ -2380,10 +2452,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y CONFIG_CRYPTO_HASH=y CONFIG_CRYPTO_HASH2=y CONFIG_CRYPTO_RNG2=y +CONFIG_CRYPTO_PCOMP=y CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_MANAGER2=y # CONFIG_CRYPTO_GF128MUL is not set # CONFIG_CRYPTO_NULL is not set +CONFIG_CRYPTO_WORKQUEUE=y # CONFIG_CRYPTO_CRYPTD is not set CONFIG_CRYPTO_AUTHENC=y # CONFIG_CRYPTO_TEST is not set @@ -2456,6 +2530,7 @@ CONFIG_CRYPTO_DES=y # Compression # # CONFIG_CRYPTO_DEFLATE is not set +# CONFIG_CRYPTO_ZLIB is not set # CONFIG_CRYPTO_LZO is not set # @@ -2467,11 +2542,13 @@ CONFIG_CRYPTO_HW=y # CONFIG_CRYPTO_DEV_GEODE is not set # CONFIG_CRYPTO_DEV_HIFN_795X is not set CONFIG_HAVE_KVM=y +CONFIG_HAVE_KVM_IRQCHIP=y CONFIG_VIRTUALIZATION=y # CONFIG_KVM is not set # CONFIG_LGUEST is not set # CONFIG_VIRTIO_PCI is not set # CONFIG_VIRTIO_BALLOON is not set +CONFIG_BINARY_PRINTF=y # # Library routines @@ -2489,7 +2566,10 @@ CONFIG_CRC32=y # CONFIG_LIBCRC32C is not set CONFIG_AUDIT_GENERIC=y CONFIG_ZLIB_INFLATE=y -CONFIG_PLIST=y +CONFIG_DECOMPRESS_GZIP=y +CONFIG_DECOMPRESS_BZIP2=y +CONFIG_DECOMPRESS_LZMA=y CONFIG_HAS_IOMEM=y CONFIG_HAS_IOPORT=y CONFIG_HAS_DMA=y +CONFIG_NLATTR=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 9fe5d212ab4..cee1dd2e69b 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1,12 +1,13 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29-rc4 -# Tue Feb 24 15:44:16 2009 +# Linux kernel version: 2.6.30-rc2 +# Mon May 11 16:22:00 2009 # CONFIG_64BIT=y # CONFIG_X86_32 is not set CONFIG_X86_64=y CONFIG_X86=y +CONFIG_OUTPUT_FORMAT="elf64-x86-64" CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig" CONFIG_GENERIC_TIME=y CONFIG_GENERIC_CMOS_UPDATE=y @@ -34,6 +35,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y CONFIG_ARCH_HAS_DEFAULT_IDLE=y CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y CONFIG_HAVE_SETUP_PER_CPU_AREA=y +CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y CONFIG_ARCH_HIBERNATION_POSSIBLE=y CONFIG_ARCH_SUSPEND_POSSIBLE=y @@ -41,14 +43,14 @@ CONFIG_ZONE_DMA32=y CONFIG_ARCH_POPULATES_NODE_MAP=y CONFIG_AUDIT_ARCH=y CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y +CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y CONFIG_GENERIC_HARDIRQS=y +CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y CONFIG_GENERIC_IRQ_PROBE=y CONFIG_GENERIC_PENDING_IRQ=y -CONFIG_X86_SMP=y CONFIG_USE_GENERIC_SMP_HELPERS=y CONFIG_X86_64_SMP=y CONFIG_X86_HT=y -CONFIG_X86_BIOS_REBOOT=y CONFIG_X86_TRAMPOLINE=y # CONFIG_KTIME_SCALAR is not set CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" @@ -61,10 +63,17 @@ CONFIG_LOCK_KERNEL=y CONFIG_INIT_ENV_ARG_LIMIT=32 CONFIG_LOCALVERSION="" # CONFIG_LOCALVERSION_AUTO is not set +CONFIG_HAVE_KERNEL_GZIP=y +CONFIG_HAVE_KERNEL_BZIP2=y +CONFIG_HAVE_KERNEL_LZMA=y +CONFIG_KERNEL_GZIP=y +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y CONFIG_BSD_PROCESS_ACCT=y # CONFIG_BSD_PROCESS_ACCT_V3 is not set CONFIG_TASKSTATS=y @@ -114,23 +123,26 @@ CONFIG_PID_NS=y CONFIG_NET_NS=y CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" +CONFIG_RD_GZIP=y +CONFIG_RD_BZIP2=y +CONFIG_RD_LZMA=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_SYSCTL=y +CONFIG_ANON_INODES=y # CONFIG_EMBEDDED is not set CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y CONFIG_KALLSYMS_EXTRA_PASS=y +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_ELF_CORE=y CONFIG_PCSPKR_PLATFORM=y -# CONFIG_COMPAT_BRK is not set CONFIG_BASE_FULL=y CONFIG_FUTEX=y -CONFIG_ANON_INODES=y CONFIG_EPOLL=y CONFIG_SIGNALFD=y CONFIG_TIMERFD=y @@ -140,6 +152,7 @@ CONFIG_AIO=y CONFIG_VM_EVENT_COUNTERS=y CONFIG_PCI_QUIRKS=y CONFIG_SLUB_DEBUG=y +# CONFIG_COMPAT_BRK is not set # CONFIG_SLAB is not set CONFIG_SLUB=y # CONFIG_SLOB is not set @@ -155,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set # CONFIG_HAVE_GENERIC_DMA_COHERENT is not set CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -167,7 +182,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_STOP_MACHINE=y CONFIG_BLOCK=y -CONFIG_BLK_DEV_IO_TRACE=y CONFIG_BLK_DEV_BSG=y # CONFIG_BLK_DEV_INTEGRITY is not set CONFIG_BLOCK_COMPAT=y @@ -195,12 +209,10 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_GENERIC_CLOCKEVENTS_BUILD=y CONFIG_SMP=y CONFIG_SPARSE_IRQ=y -# CONFIG_NUMA_MIGRATE_IRQ_DESC is not set -CONFIG_X86_FIND_SMP_CONFIG=y CONFIG_X86_MPPARSE=y -# CONFIG_X86_ELAN is not set -# CONFIG_X86_GENERICARCH is not set +CONFIG_X86_EXTENDED_PLATFORM=y # CONFIG_X86_VSMP is not set +# CONFIG_X86_UV is not set CONFIG_SCHED_OMIT_FRAME_POINTER=y # CONFIG_PARAVIRT_GUEST is not set # CONFIG_MEMTEST is not set @@ -230,10 +242,10 @@ CONFIG_SCHED_OMIT_FRAME_POINTER=y # CONFIG_MCORE2 is not set CONFIG_GENERIC_CPU=y CONFIG_X86_CPU=y -CONFIG_X86_L1_CACHE_BYTES=128 -CONFIG_X86_INTERNODE_CACHE_BYTES=128 +CONFIG_X86_L1_CACHE_BYTES=64 +CONFIG_X86_INTERNODE_CACHE_BYTES=64 CONFIG_X86_CMPXCHG=y -CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_X86_L1_CACHE_SHIFT=6 CONFIG_X86_WP_WORKS_OK=y CONFIG_X86_TSC=y CONFIG_X86_CMPXCHG64=y @@ -242,7 +254,7 @@ CONFIG_X86_MINIMUM_CPU_FAMILY=64 CONFIG_X86_DEBUGCTLMSR=y CONFIG_CPU_SUP_INTEL=y CONFIG_CPU_SUP_AMD=y -CONFIG_CPU_SUP_CENTAUR_64=y +CONFIG_CPU_SUP_CENTAUR=y CONFIG_X86_DS=y CONFIG_X86_PTRACE_BTS=y CONFIG_HPET_TIMER=y @@ -269,6 +281,7 @@ CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y CONFIG_X86_MCE=y CONFIG_X86_MCE_INTEL=y CONFIG_X86_MCE_AMD=y +CONFIG_X86_MCE_THRESHOLD=y # CONFIG_I8K is not set CONFIG_MICROCODE=y CONFIG_MICROCODE_INTEL=y @@ -276,6 +289,7 @@ CONFIG_MICROCODE_AMD=y CONFIG_MICROCODE_OLD_INTERFACE=y CONFIG_X86_MSR=y CONFIG_X86_CPUID=y +# CONFIG_X86_CPU_DEBUG is not set CONFIG_ARCH_PHYS_ADDR_T_64BIT=y CONFIG_DIRECT_GBPAGES=y CONFIG_NUMA=y @@ -309,6 +323,8 @@ CONFIG_ZONE_DMA_FLAG=1 CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y CONFIG_UNEVICTABLE_LRU=y +CONFIG_HAVE_MLOCK=y +CONFIG_HAVE_MLOCKED_PAGE_BIT=y CONFIG_X86_CHECK_BIOS_CORRUPTION=y CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y CONFIG_X86_RESERVE_LOW_64K=y @@ -317,6 +333,7 @@ CONFIG_MTRR=y CONFIG_X86_PAT=y CONFIG_EFI=y CONFIG_SECCOMP=y +# CONFIG_CC_STACKPROTECTOR is not set # CONFIG_HZ_100 is not set # CONFIG_HZ_250 is not set # CONFIG_HZ_300 is not set @@ -325,9 +342,10 @@ CONFIG_HZ=1000 CONFIG_SCHED_HRTICK=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y +# CONFIG_KEXEC_JUMP is not set CONFIG_PHYSICAL_START=0x1000000 -# CONFIG_RELOCATABLE is not set -CONFIG_PHYSICAL_ALIGN=0x200000 +CONFIG_RELOCATABLE=y +CONFIG_PHYSICAL_ALIGN=0x1000000 CONFIG_HOTPLUG_CPU=y # CONFIG_COMPAT_VDSO is not set # CONFIG_CMDLINE_BOOL is not set @@ -370,7 +388,6 @@ CONFIG_ACPI_NUMA=y CONFIG_ACPI_BLACKLIST_YEAR=0 # CONFIG_ACPI_DEBUG is not set # CONFIG_ACPI_PCI_SLOT is not set -CONFIG_ACPI_SYSTEM=y CONFIG_X86_PM_TIMER=y CONFIG_ACPI_CONTAINER=y # CONFIG_ACPI_SBS is not set @@ -436,6 +453,7 @@ CONFIG_PCI_MSI=y # CONFIG_PCI_DEBUG is not set # CONFIG_PCI_STUB is not set CONFIG_HT_IRQ=y +# CONFIG_PCI_IOV is not set CONFIG_ISA_DMA_API=y CONFIG_K8_NB=y CONFIG_PCCARD=y @@ -481,7 +499,6 @@ CONFIG_NET=y # # Networking options # -CONFIG_COMPAT_NET_DEV_OPS=y CONFIG_PACKET=y CONFIG_PACKET_MMAP=y CONFIG_UNIX=y @@ -639,6 +656,7 @@ CONFIG_LLC=y # CONFIG_LAPB is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set +# CONFIG_PHONET is not set CONFIG_NET_SCHED=y # @@ -696,6 +714,7 @@ CONFIG_NET_SCH_FIFO=y # # CONFIG_NET_PKTGEN is not set # CONFIG_NET_TCPPROBE is not set +# CONFIG_NET_DROP_MONITOR is not set CONFIG_HAMRADIO=y # @@ -706,12 +725,10 @@ CONFIG_HAMRADIO=y # CONFIG_IRDA is not set # CONFIG_BT is not set # CONFIG_AF_RXRPC is not set -# CONFIG_PHONET is not set CONFIG_FIB_RULES=y CONFIG_WIRELESS=y CONFIG_CFG80211=y # CONFIG_CFG80211_REG_DEBUG is not set -CONFIG_NL80211=y CONFIG_WIRELESS_OLD_REGULATORY=y CONFIG_WIRELESS_EXT=y CONFIG_WIRELESS_EXT_SYSFS=y @@ -788,9 +805,8 @@ CONFIG_MISC_DEVICES=y # CONFIG_TIFM_CORE is not set # CONFIG_ICS932S401 is not set # CONFIG_ENCLOSURE_SERVICES is not set -# CONFIG_SGI_XP is not set # CONFIG_HP_ILO is not set -# CONFIG_SGI_GRU is not set +# CONFIG_ISL29003 is not set # CONFIG_C2PORT is not set # @@ -844,6 +860,7 @@ CONFIG_SCSI_SPI_ATTRS=y # CONFIG_SCSI_LOWLEVEL is not set # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set # CONFIG_SCSI_DH is not set +# CONFIG_SCSI_OSD_INITIATOR is not set CONFIG_ATA=y # CONFIG_ATA_NONSTANDARD is not set CONFIG_ATA_ACPI=y @@ -940,6 +957,7 @@ CONFIG_DM_ZERO=y CONFIG_MACINTOSH_DRIVERS=y CONFIG_MAC_EMUMOUSEBTN=y CONFIG_NETDEVICES=y +CONFIG_COMPAT_NET_DEV_OPS=y # CONFIG_IFB is not set # CONFIG_DUMMY is not set # CONFIG_BONDING is not set @@ -977,6 +995,8 @@ CONFIG_MII=y CONFIG_NET_VENDOR_3COM=y # CONFIG_VORTEX is not set # CONFIG_TYPHOON is not set +# CONFIG_ETHOC is not set +# CONFIG_DNET is not set CONFIG_NET_TULIP=y # CONFIG_DE2104X is not set # CONFIG_TULIP is not set @@ -1026,6 +1046,7 @@ CONFIG_E1000=y # CONFIG_E1000E is not set # CONFIG_IP1000 is not set # CONFIG_IGB is not set +# CONFIG_IGBVF is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set @@ -1040,6 +1061,7 @@ CONFIG_TIGON3=y # CONFIG_QLA3XXX is not set # CONFIG_ATL1 is not set # CONFIG_ATL1E is not set +# CONFIG_ATL1C is not set # CONFIG_JME is not set CONFIG_NETDEV_10000=y # CONFIG_CHELSIO_T1 is not set @@ -1049,6 +1071,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_IXGBE is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set +# CONFIG_VXGE is not set # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_NIU is not set @@ -1058,6 +1081,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_BNX2X is not set # CONFIG_QLGE is not set # CONFIG_SFC is not set +# CONFIG_BE2NET is not set CONFIG_TR=y # CONFIG_IBMOL is not set # CONFIG_3C359 is not set @@ -1072,8 +1096,8 @@ CONFIG_WLAN_80211=y # CONFIG_LIBERTAS is not set # CONFIG_LIBERTAS_THINFIRM is not set # CONFIG_AIRO is not set -# CONFIG_HERMES is not set # CONFIG_ATMEL is not set +# CONFIG_AT76C50X_USB is not set # CONFIG_AIRO_CS is not set # CONFIG_PCMCIA_WL3501 is not set # CONFIG_PRISM54 is not set @@ -1083,21 +1107,21 @@ CONFIG_WLAN_80211=y # CONFIG_RTL8187 is not set # CONFIG_ADM8211 is not set # CONFIG_MAC80211_HWSIM is not set +# CONFIG_MWL8K is not set # CONFIG_P54_COMMON is not set CONFIG_ATH5K=y # CONFIG_ATH5K_DEBUG is not set # CONFIG_ATH9K is not set +# CONFIG_AR9170_USB is not set # CONFIG_IPW2100 is not set # CONFIG_IPW2200 is not set -# CONFIG_IWLCORE is not set -# CONFIG_IWLWIFI_LEDS is not set -# CONFIG_IWLAGN is not set -# CONFIG_IWL3945 is not set +# CONFIG_IWLWIFI is not set # CONFIG_HOSTAP is not set # CONFIG_B43 is not set # CONFIG_B43LEGACY is not set # CONFIG_ZD1211RW is not set # CONFIG_RT2X00 is not set +# CONFIG_HERMES is not set # # Enable WiMAX (Networking options) to see the WiMAX drivers @@ -1208,6 +1232,8 @@ CONFIG_INPUT_TABLET=y # CONFIG_TABLET_USB_KBTAB is not set # CONFIG_TABLET_USB_WACOM is not set CONFIG_INPUT_TOUCHSCREEN=y +# CONFIG_TOUCHSCREEN_AD7879_I2C is not set +# CONFIG_TOUCHSCREEN_AD7879 is not set # CONFIG_TOUCHSCREEN_FUJITSU is not set # CONFIG_TOUCHSCREEN_GUNZE is not set # CONFIG_TOUCHSCREEN_ELO is not set @@ -1301,6 +1327,7 @@ CONFIG_UNIX98_PTYS=y # CONFIG_LEGACY_PTYS is not set # CONFIG_IPMI_HANDLER is not set CONFIG_HW_RANDOM=y +# CONFIG_HW_RANDOM_TIMERIOMEM is not set # CONFIG_HW_RANDOM_INTEL is not set # CONFIG_HW_RANDOM_AMD is not set CONFIG_NVRAM=y @@ -1382,7 +1409,6 @@ CONFIG_I2C_I801=y # CONFIG_SENSORS_PCF8574 is not set # CONFIG_PCF8575 is not set # CONFIG_SENSORS_PCA9539 is not set -# CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_MAX6875 is not set # CONFIG_SENSORS_TSL2550 is not set # CONFIG_I2C_DEBUG_CORE is not set @@ -1416,6 +1442,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_ADT7475 is not set # CONFIG_SENSORS_K8TEMP is not set # CONFIG_SENSORS_ASB100 is not set +# CONFIG_SENSORS_ATK0110 is not set # CONFIG_SENSORS_ATXP1 is not set # CONFIG_SENSORS_DS1621 is not set # CONFIG_SENSORS_I5K_AMB is not set @@ -1425,6 +1452,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_FSCHER is not set # CONFIG_SENSORS_FSCPOS is not set # CONFIG_SENSORS_FSCHMD is not set +# CONFIG_SENSORS_G760A is not set # CONFIG_SENSORS_GL518SM is not set # CONFIG_SENSORS_GL520SM is not set # CONFIG_SENSORS_CORETEMP is not set @@ -1440,11 +1468,14 @@ CONFIG_HWMON=y # CONFIG_SENSORS_LM90 is not set # CONFIG_SENSORS_LM92 is not set # CONFIG_SENSORS_LM93 is not set +# CONFIG_SENSORS_LTC4215 is not set # CONFIG_SENSORS_LTC4245 is not set +# CONFIG_SENSORS_LM95241 is not set # CONFIG_SENSORS_MAX1619 is not set # CONFIG_SENSORS_MAX6650 is not set # CONFIG_SENSORS_PC87360 is not set # CONFIG_SENSORS_PC87427 is not set +# CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_SIS5595 is not set # CONFIG_SENSORS_DME1737 is not set # CONFIG_SENSORS_SMSC47M1 is not set @@ -1635,6 +1666,7 @@ CONFIG_FB_EFI=y # CONFIG_FB_VIRTUAL is not set # CONFIG_FB_METRONOME is not set # CONFIG_FB_MB862XX is not set +# CONFIG_FB_BROADSHEET is not set CONFIG_BACKLIGHT_LCD_SUPPORT=y # CONFIG_LCD_CLASS_DEVICE is not set CONFIG_BACKLIGHT_CLASS_DEVICE=y @@ -1720,6 +1752,8 @@ CONFIG_SND_PCI=y # CONFIG_SND_INDIGO is not set # CONFIG_SND_INDIGOIO is not set # CONFIG_SND_INDIGODJ is not set +# CONFIG_SND_INDIGOIOX is not set +# CONFIG_SND_INDIGODJX is not set # CONFIG_SND_EMU10K1 is not set # CONFIG_SND_EMU10K1X is not set # CONFIG_SND_ENS1370 is not set @@ -1792,15 +1826,17 @@ CONFIG_USB_HIDDEV=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_HID_A4TECH=y CONFIG_HID_APPLE=y CONFIG_HID_BELKIN=y CONFIG_HID_CHERRY=y CONFIG_HID_CHICONY=y CONFIG_HID_CYPRESS=y +# CONFIG_DRAGONRISE_FF is not set CONFIG_HID_EZKEY=y +CONFIG_HID_KYE=y CONFIG_HID_GYRATION=y +CONFIG_HID_KENSINGTON=y CONFIG_HID_LOGITECH=y CONFIG_LOGITECH_FF=y # CONFIG_LOGIRUMBLEPAD2_FF is not set @@ -1866,11 +1902,11 @@ CONFIG_USB_PRINTER=y # CONFIG_USB_TMC is not set # -# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed; +# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may # # -# see USB_STORAGE Help for more information +# also be needed; see USB_STORAGE Help for more info # CONFIG_USB_STORAGE=y # CONFIG_USB_STORAGE_DEBUG is not set @@ -1912,7 +1948,6 @@ CONFIG_USB_LIBUSUAL=y # CONFIG_USB_LED is not set # CONFIG_USB_CYPRESS_CY7C63 is not set # CONFIG_USB_CYTHERM is not set -# CONFIG_USB_PHIDGET is not set # CONFIG_USB_IDMOUSE is not set # CONFIG_USB_FTDI_ELAN is not set # CONFIG_USB_APPLEDISPLAY is not set @@ -1928,6 +1963,7 @@ CONFIG_USB_LIBUSUAL=y # # OTG and related infrastructure # +# CONFIG_NOP_USB_XCEIV is not set # CONFIG_UWB is not set # CONFIG_MMC is not set # CONFIG_MEMSTICK is not set @@ -1939,8 +1975,10 @@ CONFIG_LEDS_CLASS=y # # CONFIG_LEDS_ALIX2 is not set # CONFIG_LEDS_PCA9532 is not set +# CONFIG_LEDS_LP5521 is not set # CONFIG_LEDS_CLEVO_MAIL is not set # CONFIG_LEDS_PCA955X is not set +# CONFIG_LEDS_BD2802 is not set # # LED Triggers @@ -1950,6 +1988,10 @@ CONFIG_LEDS_TRIGGERS=y # CONFIG_LEDS_TRIGGER_HEARTBEAT is not set # CONFIG_LEDS_TRIGGER_BACKLIGHT is not set # CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set + +# +# iptables trigger is under Netfilter config (LED target) +# # CONFIG_ACCESSIBILITY is not set # CONFIG_INFINIBAND is not set CONFIG_EDAC=y @@ -2018,6 +2060,7 @@ CONFIG_DMADEVICES=y # DMA Devices # # CONFIG_INTEL_IOATDMA is not set +# CONFIG_AUXDISPLAY is not set # CONFIG_UIO is not set # CONFIG_STAGING is not set CONFIG_X86_PLATFORM_DEVICES=y @@ -2051,6 +2094,7 @@ CONFIG_DMIID=y # # CONFIG_EXT2_FS is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y CONFIG_EXT3_FS_POSIX_ACL=y CONFIG_EXT3_FS_SECURITY=y @@ -2082,6 +2126,11 @@ CONFIG_AUTOFS4_FS=y CONFIG_GENERIC_ACL=y # +# Caches +# +# CONFIG_FSCACHE is not set + +# # CD-ROM/DVD Filesystems # CONFIG_ISO9660_FS=y @@ -2132,6 +2181,7 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -2145,7 +2195,6 @@ CONFIG_NFS_ACL_SUPPORT=y CONFIG_NFS_COMMON=y CONFIG_SUNRPC=y CONFIG_SUNRPC_GSS=y -# CONFIG_SUNRPC_REGISTER_V4 is not set CONFIG_RPCSEC_GSS_KRB5=y # CONFIG_RPCSEC_GSS_SPKM3 is not set # CONFIG_SMB_FS is not set @@ -2232,6 +2281,7 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set # CONFIG_DETECT_SOFTLOCKUP is not set +# CONFIG_DETECT_HUNG_TASK is not set # CONFIG_SCHED_DEBUG is not set CONFIG_SCHEDSTATS=y CONFIG_TIMER_STATS=y @@ -2247,6 +2297,7 @@ CONFIG_TIMER_STATS=y # CONFIG_LOCK_STAT is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +CONFIG_STACKTRACE=y # CONFIG_DEBUG_KOBJECT is not set CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_INFO is not set @@ -2269,13 +2320,19 @@ CONFIG_FRAME_POINTER=y # CONFIG_FAULT_INJECTION is not set # CONFIG_LATENCYTOP is not set CONFIG_SYSCTL_SYSCALL_CHECK=y +# CONFIG_DEBUG_PAGEALLOC is not set CONFIG_USER_STACKTRACE_SUPPORT=y +CONFIG_NOP_TRACER=y CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_HAVE_HW_BRANCH_TRACER=y +CONFIG_HAVE_FTRACE_SYSCALLS=y +CONFIG_RING_BUFFER=y +CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y # # Tracers @@ -2285,13 +2342,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y # CONFIG_SYSPROF_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_FTRACE_SYSCALLS is not set # CONFIG_BOOT_TRACER is not set # CONFIG_TRACE_BRANCH_PROFILING is not set # CONFIG_POWER_TRACER is not set # CONFIG_STACK_TRACER is not set # CONFIG_HW_BRANCH_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +CONFIG_BLK_DEV_IO_TRACE=y +# CONFIG_FTRACE_STARTUP_TEST is not set +# CONFIG_MMIOTRACE is not set CONFIG_PROVIDE_OHCI1394_DMA_INIT=y -# CONFIG_DYNAMIC_PRINTK_DEBUG is not set +# CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set @@ -2301,14 +2366,13 @@ CONFIG_EARLY_PRINTK=y CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_DEBUG_STACK_USAGE=y -# CONFIG_DEBUG_PAGEALLOC is not set # CONFIG_DEBUG_PER_CPU_MAPS is not set # CONFIG_X86_PTDUMP is not set CONFIG_DEBUG_RODATA=y # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_NX_TEST=m # CONFIG_IOMMU_DEBUG is not set -# CONFIG_MMIOTRACE is not set +CONFIG_HAVE_MMIOTRACE_SUPPORT=y CONFIG_IO_DELAY_TYPE_0X80=0 CONFIG_IO_DELAY_TYPE_0XED=1 CONFIG_IO_DELAY_TYPE_UDELAY=2 @@ -2344,6 +2408,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 # CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set # CONFIG_SECURITY_SMACK is not set +# CONFIG_SECURITY_TOMOYO is not set +# CONFIG_IMA is not set CONFIG_CRYPTO=y # @@ -2359,10 +2425,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y CONFIG_CRYPTO_HASH=y CONFIG_CRYPTO_HASH2=y CONFIG_CRYPTO_RNG2=y +CONFIG_CRYPTO_PCOMP=y CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_MANAGER2=y # CONFIG_CRYPTO_GF128MUL is not set # CONFIG_CRYPTO_NULL is not set +CONFIG_CRYPTO_WORKQUEUE=y # CONFIG_CRYPTO_CRYPTD is not set CONFIG_CRYPTO_AUTHENC=y # CONFIG_CRYPTO_TEST is not set @@ -2414,6 +2482,7 @@ CONFIG_CRYPTO_SHA1=y # CONFIG_CRYPTO_AES=y # CONFIG_CRYPTO_AES_X86_64 is not set +# CONFIG_CRYPTO_AES_NI_INTEL is not set # CONFIG_CRYPTO_ANUBIS is not set CONFIG_CRYPTO_ARC4=y # CONFIG_CRYPTO_BLOWFISH is not set @@ -2435,6 +2504,7 @@ CONFIG_CRYPTO_DES=y # Compression # # CONFIG_CRYPTO_DEFLATE is not set +# CONFIG_CRYPTO_ZLIB is not set # CONFIG_CRYPTO_LZO is not set # @@ -2444,10 +2514,12 @@ CONFIG_CRYPTO_DES=y CONFIG_CRYPTO_HW=y # CONFIG_CRYPTO_DEV_HIFN_795X is not set CONFIG_HAVE_KVM=y +CONFIG_HAVE_KVM_IRQCHIP=y CONFIG_VIRTUALIZATION=y # CONFIG_KVM is not set # CONFIG_VIRTIO_PCI is not set # CONFIG_VIRTIO_BALLOON is not set +CONFIG_BINARY_PRINTF=y # # Library routines @@ -2464,7 +2536,10 @@ CONFIG_CRC32=y # CONFIG_CRC7 is not set # CONFIG_LIBCRC32C is not set CONFIG_ZLIB_INFLATE=y -CONFIG_PLIST=y +CONFIG_DECOMPRESS_GZIP=y +CONFIG_DECOMPRESS_BZIP2=y +CONFIG_DECOMPRESS_LZMA=y CONFIG_HAS_IOMEM=y CONFIG_HAS_IOPORT=y CONFIG_HAS_DMA=y +CONFIG_NLATTR=y diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index ebe7deedd5b..cfb0010fa94 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -2,6 +2,8 @@ # Arch-specific CryptoAPI modules. # +obj-$(CONFIG_CRYPTO_FPU) += fpu.o + obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index caba9960170..eb0566e8331 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -845,7 +845,7 @@ ENTRY(aesni_cbc_enc) */ ENTRY(aesni_cbc_dec) cmp $16, LEN - jb .Lcbc_dec_ret + jb .Lcbc_dec_just_ret mov 480(KEYP), KLEN add $240, KEYP movups (IVP), IV @@ -891,6 +891,7 @@ ENTRY(aesni_cbc_dec) add $16, OUTP cmp $16, LEN jge .Lcbc_dec_loop1 - movups IV, (IVP) .Lcbc_dec_ret: + movups IV, (IVP) +.Lcbc_dec_just_ret: ret diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 02af0af6549..c580c5ec1ca 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -21,6 +21,22 @@ #include <asm/i387.h> #include <asm/aes.h> +#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE) +#define HAS_CTR +#endif + +#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE) +#define HAS_LRW +#endif + +#if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE) +#define HAS_PCBC +#endif + +#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE) +#define HAS_XTS +#endif + struct async_aes_ctx { struct cryptd_ablkcipher *cryptd_tfm; }; @@ -137,6 +153,41 @@ static struct crypto_alg aesni_alg = { } }; +static void __aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); + + aesni_enc(ctx, dst, src); +} + +static void __aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); + + aesni_dec(ctx, dst, src); +} + +static struct crypto_alg __aesni_alg = { + .cra_name = "__aes-aesni", + .cra_driver_name = "__driver-aes-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1, + .cra_alignmask = 0, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(__aesni_alg.cra_list), + .cra_u = { + .cipher = { + .cia_min_keysize = AES_MIN_KEY_SIZE, + .cia_max_keysize = AES_MAX_KEY_SIZE, + .cia_setkey = aes_set_key, + .cia_encrypt = __aes_encrypt, + .cia_decrypt = __aes_decrypt + } + } +}; + static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) @@ -147,6 +198,7 @@ static int ecb_encrypt(struct blkcipher_desc *desc, blkcipher_walk_init(&walk, dst, src, nbytes); err = blkcipher_walk_virt(desc, &walk); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { @@ -170,6 +222,7 @@ static int ecb_decrypt(struct blkcipher_desc *desc, blkcipher_walk_init(&walk, dst, src, nbytes); err = blkcipher_walk_virt(desc, &walk); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { @@ -215,6 +268,7 @@ static int cbc_encrypt(struct blkcipher_desc *desc, blkcipher_walk_init(&walk, dst, src, nbytes); err = blkcipher_walk_virt(desc, &walk); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { @@ -238,6 +292,7 @@ static int cbc_decrypt(struct blkcipher_desc *desc, blkcipher_walk_init(&walk, dst, src, nbytes); err = blkcipher_walk_virt(desc, &walk); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { @@ -277,8 +332,16 @@ static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, unsigned int key_len) { struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base; + int err; - return crypto_ablkcipher_setkey(&ctx->cryptd_tfm->base, key, key_len); + crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm) + & CRYPTO_TFM_REQ_MASK); + err = crypto_ablkcipher_setkey(child, key, key_len); + crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child) + & CRYPTO_TFM_RES_MASK); + return err; } static int ablk_encrypt(struct ablkcipher_request *req) @@ -411,6 +474,163 @@ static struct crypto_alg ablk_cbc_alg = { }, }; +#ifdef HAS_CTR +static int ablk_ctr_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("fpu(ctr(__driver-aes-aesni))", + 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_ctr_alg = { + .cra_name = "ctr(aes)", + .cra_driver_name = "ctr-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_ctr_alg.cra_list), + .cra_init = ablk_ctr_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + .geniv = "chainiv", + }, + }, +}; +#endif + +#ifdef HAS_LRW +static int ablk_lrw_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("fpu(lrw(__driver-aes-aesni))", + 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_lrw_alg = { + .cra_name = "lrw(aes)", + .cra_driver_name = "lrw-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_lrw_alg.cra_list), + .cra_init = ablk_lrw_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE, + .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}; +#endif + +#ifdef HAS_PCBC +static int ablk_pcbc_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("fpu(pcbc(__driver-aes-aesni))", + 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_pcbc_alg = { + .cra_name = "pcbc(aes)", + .cra_driver_name = "pcbc-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_pcbc_alg.cra_list), + .cra_init = ablk_pcbc_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}; +#endif + +#ifdef HAS_XTS +static int ablk_xts_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("fpu(xts(__driver-aes-aesni))", + 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_xts_alg = { + .cra_name = "xts(aes)", + .cra_driver_name = "xts-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_xts_alg.cra_list), + .cra_init = ablk_xts_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}; +#endif + static int __init aesni_init(void) { int err; @@ -421,6 +641,8 @@ static int __init aesni_init(void) } if ((err = crypto_register_alg(&aesni_alg))) goto aes_err; + if ((err = crypto_register_alg(&__aesni_alg))) + goto __aes_err; if ((err = crypto_register_alg(&blk_ecb_alg))) goto blk_ecb_err; if ((err = crypto_register_alg(&blk_cbc_alg))) @@ -429,9 +651,41 @@ static int __init aesni_init(void) goto ablk_ecb_err; if ((err = crypto_register_alg(&ablk_cbc_alg))) goto ablk_cbc_err; +#ifdef HAS_CTR + if ((err = crypto_register_alg(&ablk_ctr_alg))) + goto ablk_ctr_err; +#endif +#ifdef HAS_LRW + if ((err = crypto_register_alg(&ablk_lrw_alg))) + goto ablk_lrw_err; +#endif +#ifdef HAS_PCBC + if ((err = crypto_register_alg(&ablk_pcbc_alg))) + goto ablk_pcbc_err; +#endif +#ifdef HAS_XTS + if ((err = crypto_register_alg(&ablk_xts_alg))) + goto ablk_xts_err; +#endif return err; +#ifdef HAS_XTS +ablk_xts_err: +#endif +#ifdef HAS_PCBC + crypto_unregister_alg(&ablk_pcbc_alg); +ablk_pcbc_err: +#endif +#ifdef HAS_LRW + crypto_unregister_alg(&ablk_lrw_alg); +ablk_lrw_err: +#endif +#ifdef HAS_CTR + crypto_unregister_alg(&ablk_ctr_alg); +ablk_ctr_err: +#endif + crypto_unregister_alg(&ablk_cbc_alg); ablk_cbc_err: crypto_unregister_alg(&ablk_ecb_alg); ablk_ecb_err: @@ -439,6 +693,8 @@ ablk_ecb_err: blk_cbc_err: crypto_unregister_alg(&blk_ecb_alg); blk_ecb_err: + crypto_unregister_alg(&__aesni_alg); +__aes_err: crypto_unregister_alg(&aesni_alg); aes_err: return err; @@ -446,10 +702,23 @@ aes_err: static void __exit aesni_exit(void) { +#ifdef HAS_XTS + crypto_unregister_alg(&ablk_xts_alg); +#endif +#ifdef HAS_PCBC + crypto_unregister_alg(&ablk_pcbc_alg); +#endif +#ifdef HAS_LRW + crypto_unregister_alg(&ablk_lrw_alg); +#endif +#ifdef HAS_CTR + crypto_unregister_alg(&ablk_ctr_alg); +#endif crypto_unregister_alg(&ablk_cbc_alg); crypto_unregister_alg(&ablk_ecb_alg); crypto_unregister_alg(&blk_cbc_alg); crypto_unregister_alg(&blk_ecb_alg); + crypto_unregister_alg(&__aesni_alg); crypto_unregister_alg(&aesni_alg); } diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c new file mode 100644 index 00000000000..daef6cd2b45 --- /dev/null +++ b/arch/x86/crypto/fpu.c @@ -0,0 +1,166 @@ +/* + * FPU: Wrapper for blkcipher touching fpu + * + * Copyright (c) Intel Corp. + * Author: Huang Ying <ying.huang@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include <crypto/algapi.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <asm/i387.h> + +struct crypto_fpu_ctx { + struct crypto_blkcipher *child; +}; + +static int crypto_fpu_setkey(struct crypto_tfm *parent, const u8 *key, + unsigned int keylen) +{ + struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(parent); + struct crypto_blkcipher *child = ctx->child; + int err; + + crypto_blkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_blkcipher_set_flags(child, crypto_tfm_get_flags(parent) & + CRYPTO_TFM_REQ_MASK); + err = crypto_blkcipher_setkey(child, key, keylen); + crypto_tfm_set_flags(parent, crypto_blkcipher_get_flags(child) & + CRYPTO_TFM_RES_MASK); + return err; +} + +static int crypto_fpu_encrypt(struct blkcipher_desc *desc_in, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + int err; + struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm); + struct crypto_blkcipher *child = ctx->child; + struct blkcipher_desc desc = { + .tfm = child, + .info = desc_in->info, + .flags = desc_in->flags & ~CRYPTO_TFM_REQ_MAY_SLEEP, + }; + + kernel_fpu_begin(); + err = crypto_blkcipher_crt(desc.tfm)->encrypt(&desc, dst, src, nbytes); + kernel_fpu_end(); + return err; +} + +static int crypto_fpu_decrypt(struct blkcipher_desc *desc_in, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + int err; + struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm); + struct crypto_blkcipher *child = ctx->child; + struct blkcipher_desc desc = { + .tfm = child, + .info = desc_in->info, + .flags = desc_in->flags & ~CRYPTO_TFM_REQ_MAY_SLEEP, + }; + + kernel_fpu_begin(); + err = crypto_blkcipher_crt(desc.tfm)->decrypt(&desc, dst, src, nbytes); + kernel_fpu_end(); + return err; +} + +static int crypto_fpu_init_tfm(struct crypto_tfm *tfm) +{ + struct crypto_instance *inst = crypto_tfm_alg_instance(tfm); + struct crypto_spawn *spawn = crypto_instance_ctx(inst); + struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm); + struct crypto_blkcipher *cipher; + + cipher = crypto_spawn_blkcipher(spawn); + if (IS_ERR(cipher)) + return PTR_ERR(cipher); + + ctx->child = cipher; + return 0; +} + +static void crypto_fpu_exit_tfm(struct crypto_tfm *tfm) +{ + struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm); + crypto_free_blkcipher(ctx->child); +} + +static struct crypto_instance *crypto_fpu_alloc(struct rtattr **tb) +{ + struct crypto_instance *inst; + struct crypto_alg *alg; + int err; + + err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_BLKCIPHER); + if (err) + return ERR_PTR(err); + + alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_BLKCIPHER, + CRYPTO_ALG_TYPE_MASK); + if (IS_ERR(alg)) + return ERR_CAST(alg); + + inst = crypto_alloc_instance("fpu", alg); + if (IS_ERR(inst)) + goto out_put_alg; + + inst->alg.cra_flags = alg->cra_flags; + inst->alg.cra_priority = alg->cra_priority; + inst->alg.cra_blocksize = alg->cra_blocksize; + inst->alg.cra_alignmask = alg->cra_alignmask; + inst->alg.cra_type = alg->cra_type; + inst->alg.cra_blkcipher.ivsize = alg->cra_blkcipher.ivsize; + inst->alg.cra_blkcipher.min_keysize = alg->cra_blkcipher.min_keysize; + inst->alg.cra_blkcipher.max_keysize = alg->cra_blkcipher.max_keysize; + inst->alg.cra_ctxsize = sizeof(struct crypto_fpu_ctx); + inst->alg.cra_init = crypto_fpu_init_tfm; + inst->alg.cra_exit = crypto_fpu_exit_tfm; + inst->alg.cra_blkcipher.setkey = crypto_fpu_setkey; + inst->alg.cra_blkcipher.encrypt = crypto_fpu_encrypt; + inst->alg.cra_blkcipher.decrypt = crypto_fpu_decrypt; + +out_put_alg: + crypto_mod_put(alg); + return inst; +} + +static void crypto_fpu_free(struct crypto_instance *inst) +{ + crypto_drop_spawn(crypto_instance_ctx(inst)); + kfree(inst); +} + +static struct crypto_template crypto_fpu_tmpl = { + .name = "fpu", + .alloc = crypto_fpu_alloc, + .free = crypto_fpu_free, + .module = THIS_MODULE, +}; + +static int __init crypto_fpu_module_init(void) +{ + return crypto_register_template(&crypto_fpu_tmpl); +} + +static void __exit crypto_fpu_module_exit(void) +{ + crypto_unregister_template(&crypto_fpu_tmpl); +} + +module_init(crypto_fpu_module_init); +module_exit(crypto_fpu_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("FPU block cipher wrapper"); diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index a505202086e..e590261ba05 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -825,9 +825,11 @@ ia32_sys_call_table: .quad compat_sys_signalfd4 .quad sys_eventfd2 .quad sys_epoll_create1 - .quad sys_dup3 /* 330 */ + .quad sys_dup3 /* 330 */ .quad sys_pipe2 .quad sys_inotify_init1 .quad compat_sys_preadv .quad compat_sys_pwritev + .quad compat_sys_rt_tgsigqueueinfo /* 335 */ + .quad sys_perf_counter_open ia32_syscall_end: diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 4518dc50090..20d1465a2ab 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -144,6 +144,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) #else /* !CONFIG_ACPI */ +#define acpi_disabled 1 #define acpi_lapic 0 #define acpi_ioapic 0 static inline void acpi_noirq_set(void) { } diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index f6aa18eadf7..1a37bcdc860 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -3,6 +3,7 @@ #include <linux/types.h> #include <linux/stddef.h> +#include <linux/stringify.h> #include <asm/asm.h> /* @@ -74,6 +75,22 @@ static inline void alternatives_smp_switch(int smp) {} const unsigned char *const *find_nop_table(void); +/* alternative assembly primitive: */ +#define ALTERNATIVE(oldinstr, newinstr, feature) \ + \ + "661:\n\t" oldinstr "\n662:\n" \ + ".section .altinstructions,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR "661b\n" /* label */ \ + _ASM_PTR "663f\n" /* new instruction */ \ + " .byte " __stringify(feature) "\n" /* feature bit */ \ + " .byte 662b-661b\n" /* sourcelen */ \ + " .byte 664f-663f\n" /* replacementlen */ \ + ".previous\n" \ + ".section .altinstr_replacement, \"ax\"\n" \ + "663:\n\t" newinstr "\n664:\n" /* replacement */ \ + ".previous" + /* * Alternative instructions for different CPU types or capabilities. * @@ -87,18 +104,7 @@ const unsigned char *const *find_nop_table(void); * without volatile and memory clobber. */ #define alternative(oldinstr, newinstr, feature) \ - asm volatile ("661:\n\t" oldinstr "\n662:\n" \ - ".section .altinstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR "661b\n" /* label */ \ - _ASM_PTR "663f\n" /* new instruction */ \ - " .byte %c0\n" /* feature bit */ \ - " .byte 662b-661b\n" /* sourcelen */ \ - " .byte 664f-663f\n" /* replacementlen */ \ - ".previous\n" \ - ".section .altinstr_replacement,\"ax\"\n" \ - "663:\n\t" newinstr "\n664:\n" /* replacement */ \ - ".previous" :: "i" (feature) : "memory") + asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") /* * Alternative inline assembly with input. @@ -109,35 +115,16 @@ const unsigned char *const *find_nop_table(void); * Best is to use constraints that are fixed size (like (%1) ... "r") * If you use variable sized constraints like "m" or "g" in the * replacement make sure to pad to the worst case length. + * Leaving an unused argument 0 to keep API compatibility. */ #define alternative_input(oldinstr, newinstr, feature, input...) \ - asm volatile ("661:\n\t" oldinstr "\n662:\n" \ - ".section .altinstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR "661b\n" /* label */ \ - _ASM_PTR "663f\n" /* new instruction */ \ - " .byte %c0\n" /* feature bit */ \ - " .byte 662b-661b\n" /* sourcelen */ \ - " .byte 664f-663f\n" /* replacementlen */ \ - ".previous\n" \ - ".section .altinstr_replacement,\"ax\"\n" \ - "663:\n\t" newinstr "\n664:\n" /* replacement */ \ - ".previous" :: "i" (feature), ##input) + asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ + : : "i" (0), ## input) /* Like alternative_input, but with a single output argument */ #define alternative_io(oldinstr, newinstr, feature, output, input...) \ - asm volatile ("661:\n\t" oldinstr "\n662:\n" \ - ".section .altinstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR "661b\n" /* label */ \ - _ASM_PTR "663f\n" /* new instruction */ \ - " .byte %c[feat]\n" /* feature bit */ \ - " .byte 662b-661b\n" /* sourcelen */ \ - " .byte 664f-663f\n" /* replacementlen */ \ - ".previous\n" \ - ".section .altinstr_replacement,\"ax\"\n" \ - "663:\n\t" newinstr "\n664:\n" /* replacement */ \ - ".previous" : output : [feat] "i" (feature), ##input) + asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ + : output : "i" (0), ## input) /* * use this macro(s) if you need more than one output parameter diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index f712344329b..bdf96f119f0 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h @@ -27,9 +27,13 @@ extern int amd_iommu_init(void); extern int amd_iommu_init_dma_ops(void); extern void amd_iommu_detect(void); extern irqreturn_t amd_iommu_int_handler(int irq, void *data); +extern void amd_iommu_flush_all_domains(void); +extern void amd_iommu_flush_all_devices(void); +extern void amd_iommu_shutdown(void); #else static inline int amd_iommu_init(void) { return -ENODEV; } static inline void amd_iommu_detect(void) { } +static inline void amd_iommu_shutdown(void) { } #endif #endif /* _ASM_X86_AMD_IOMMU_H */ diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 95c8cd9d22b..0c878caaa0a 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -194,6 +194,27 @@ #define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */ #define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops domain for an IOMMU */ +extern bool amd_iommu_dump; +#define DUMP_printk(format, arg...) \ + do { \ + if (amd_iommu_dump) \ + printk(KERN_INFO "AMD IOMMU: " format, ## arg); \ + } while(0); + +/* + * Make iterating over all IOMMUs easier + */ +#define for_each_iommu(iommu) \ + list_for_each_entry((iommu), &amd_iommu_list, list) +#define for_each_iommu_safe(iommu, next) \ + list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list) + +#define APERTURE_RANGE_SHIFT 27 /* 128 MB */ +#define APERTURE_RANGE_SIZE (1ULL << APERTURE_RANGE_SHIFT) +#define APERTURE_RANGE_PAGES (APERTURE_RANGE_SIZE >> PAGE_SHIFT) +#define APERTURE_MAX_RANGES 32 /* allows 4GB of DMA address space */ +#define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT) +#define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL) /* * This structure contains generic data for IOMMU protection domains @@ -210,6 +231,26 @@ struct protection_domain { }; /* + * For dynamic growth the aperture size is split into ranges of 128MB of + * DMA address space each. This struct represents one such range. + */ +struct aperture_range { + + /* address allocation bitmap */ + unsigned long *bitmap; + + /* + * Array of PTE pages for the aperture. In this array we save all the + * leaf pages of the domain page table used for the aperture. This way + * we don't need to walk the page table to find a specific PTE. We can + * just calculate its address in constant time. + */ + u64 *pte_pages[64]; + + unsigned long offset; +}; + +/* * Data container for a dma_ops specific protection domain */ struct dma_ops_domain { @@ -222,18 +263,10 @@ struct dma_ops_domain { unsigned long aperture_size; /* address we start to search for free addresses */ - unsigned long next_bit; - - /* address allocation bitmap */ - unsigned long *bitmap; + unsigned long next_address; - /* - * Array of PTE pages for the aperture. In this array we save all the - * leaf pages of the domain page table used for the aperture. This way - * we don't need to walk the page table to find a specific PTE. We can - * just calculate its address in constant time. - */ - u64 **pte_pages; + /* address space relevant data */ + struct aperture_range *aperture[APERTURE_MAX_RANGES]; /* This will be set to true when TLB needs to be flushed */ bool need_flush; diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 42f2f837742..bb7d4792584 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -107,8 +107,7 @@ extern u32 native_safe_apic_wait_icr_idle(void); extern void native_apic_icr_write(u32 low, u32 id); extern u64 native_apic_icr_read(void); -#define EIM_8BIT_APIC_ID 0 -#define EIM_32BIT_APIC_ID 1 +extern int x2apic_mode; #ifdef CONFIG_X86_X2APIC /* @@ -166,10 +165,9 @@ static inline u64 native_x2apic_icr_read(void) return val; } -extern int x2apic, x2apic_phys; +extern int x2apic_phys; extern void check_x2apic(void); extern void enable_x2apic(void); -extern void enable_IR_x2apic(void); extern void x2apic_icr_write(u32 low, u32 id); static inline int x2apic_enabled(void) { @@ -183,6 +181,8 @@ static inline int x2apic_enabled(void) return 1; return 0; } + +#define x2apic_supported() (cpu_has_x2apic) #else static inline void check_x2apic(void) { @@ -190,28 +190,20 @@ static inline void check_x2apic(void) static inline void enable_x2apic(void) { } -static inline void enable_IR_x2apic(void) -{ -} static inline int x2apic_enabled(void) { return 0; } -#define x2apic 0 - +#define x2apic_preenabled 0 +#define x2apic_supported() 0 #endif -extern int get_physical_broadcast(void); +extern void enable_IR_x2apic(void); -#ifdef CONFIG_X86_X2APIC -static inline void ack_x2APIC_irq(void) -{ - /* Docs say use 0 for future compatibility */ - native_apic_msr_write(APIC_EOI, 0); -} -#endif +extern int get_physical_broadcast(void); +extern void apic_disable(void); extern int lapic_get_maxlvt(void); extern void clear_local_APIC(void); extern void connect_bsp_APIC(void); @@ -252,7 +244,7 @@ static inline void lapic_shutdown(void) { } #define local_apic_timer_c2_ok 1 static inline void init_apic_mappings(void) { } static inline void disable_local_APIC(void) { } - +static inline void apic_disable(void) { } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_64 @@ -410,7 +402,7 @@ static inline unsigned default_get_apic_id(unsigned long x) { unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - if (APIC_XAPIC(ver)) + if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID)) return (x >> 24) & 0xFF; else return (x >> 24) & 0x0F; @@ -478,6 +470,9 @@ static inline unsigned int read_apic_id(void) extern void default_setup_apic_routing(void); #ifdef CONFIG_X86_32 + +extern struct apic apic_default; + /* * Set up the logical destination ID. * diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index bc9514fb3b1..7ddb36ab933 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -22,6 +22,7 @@ # define APIC_INTEGRATED(x) (1) #endif #define APIC_XAPIC(x) ((x) >= 0x14) +#define APIC_EXT_SPACE(x) ((x) & 0x80000000) #define APIC_TASKPRI 0x80 #define APIC_TPRI_MASK 0xFFu #define APIC_ARBPRI 0x90 @@ -116,7 +117,9 @@ #define APIC_TDR_DIV_32 0x8 #define APIC_TDR_DIV_64 0x9 #define APIC_TDR_DIV_128 0xA -#define APIC_EILVT0 0x500 +#define APIC_EFEAT 0x400 +#define APIC_ECTRL 0x410 +#define APIC_EILVTn(n) (0x500 + 0x10 * n) #define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */ #define APIC_EILVT_NR_AMD_10H 4 #define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF) @@ -125,9 +128,6 @@ #define APIC_EILVT_MSG_NMI 0x4 #define APIC_EILVT_MSG_EXT 0x7 #define APIC_EILVT_MASKED (1 << 16) -#define APIC_EILVT1 0x510 -#define APIC_EILVT2 0x520 -#define APIC_EILVT3 0x530 #define APIC_BASE (fix_to_virt(FIX_APIC_BASE)) #define APIC_BASE_MSR 0x800 diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index 85b46fba422..dc5a667ff79 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -19,7 +19,10 @@ * * Atomically reads the value of @v. */ -#define atomic_read(v) ((v)->counter) +static inline int atomic_read(const atomic_t *v) +{ + return v->counter; +} /** * atomic_set - set atomic variable @@ -28,7 +31,10 @@ * * Atomically sets the value of @v to @i. */ -#define atomic_set(v, i) (((v)->counter) = (i)) +static inline void atomic_set(atomic_t *v, int i) +{ + v->counter = i; +} /** * atomic_add - add integer to atomic variable @@ -200,8 +206,15 @@ static inline int atomic_sub_return(int i, atomic_t *v) return atomic_add_return(-i, v); } -#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) -#define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) +static inline int atomic_cmpxchg(atomic_t *v, int old, int new) +{ + return cmpxchg(&v->counter, old, new); +} + +static inline int atomic_xchg(atomic_t *v, int new) +{ + return xchg(&v->counter, new); +} /** * atomic_add_unless - add unless the number is already a given value @@ -247,5 +260,156 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) #define smp_mb__before_atomic_inc() barrier() #define smp_mb__after_atomic_inc() barrier() -#include <asm-generic/atomic.h> +/* An 64bit atomic type */ + +typedef struct { + u64 __aligned(8) counter; +} atomic64_t; + +#define ATOMIC64_INIT(val) { (val) } + +extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val); + +/** + * atomic64_xchg - xchg atomic64 variable + * @ptr: pointer to type atomic64_t + * @new_val: value to assign + * + * Atomically xchgs the value of @ptr to @new_val and returns + * the old value. + */ +extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val); + +/** + * atomic64_set - set atomic64 variable + * @ptr: pointer to type atomic64_t + * @new_val: value to assign + * + * Atomically sets the value of @ptr to @new_val. + */ +extern void atomic64_set(atomic64_t *ptr, u64 new_val); + +/** + * atomic64_read - read atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically reads the value of @ptr and returns it. + */ +static inline u64 atomic64_read(atomic64_t *ptr) +{ + u64 res; + + /* + * Note, we inline this atomic64_t primitive because + * it only clobbers EAX/EDX and leaves the others + * untouched. We also (somewhat subtly) rely on the + * fact that cmpxchg8b returns the current 64-bit value + * of the memory location we are touching: + */ + asm volatile( + "mov %%ebx, %%eax\n\t" + "mov %%ecx, %%edx\n\t" + LOCK_PREFIX "cmpxchg8b %1\n" + : "=&A" (res) + : "m" (*ptr) + ); + + return res; +} + +extern u64 atomic64_read(atomic64_t *ptr); + +/** + * atomic64_add_return - add and return + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr and returns @delta + *@ptr + */ +extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr); + +/* + * Other variants with different arithmetic operators: + */ +extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr); +extern u64 atomic64_inc_return(atomic64_t *ptr); +extern u64 atomic64_dec_return(atomic64_t *ptr); + +/** + * atomic64_add - add integer to atomic64 variable + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr. + */ +extern void atomic64_add(u64 delta, atomic64_t *ptr); + +/** + * atomic64_sub - subtract the atomic64 variable + * @delta: integer value to subtract + * @ptr: pointer to type atomic64_t + * + * Atomically subtracts @delta from @ptr. + */ +extern void atomic64_sub(u64 delta, atomic64_t *ptr); + +/** + * atomic64_sub_and_test - subtract value from variable and test result + * @delta: integer value to subtract + * @ptr: pointer to type atomic64_t + * + * Atomically subtracts @delta from @ptr and returns + * true if the result is zero, or false for all + * other cases. + */ +extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr); + +/** + * atomic64_inc - increment atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically increments @ptr by 1. + */ +extern void atomic64_inc(atomic64_t *ptr); + +/** + * atomic64_dec - decrement atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically decrements @ptr by 1. + */ +extern void atomic64_dec(atomic64_t *ptr); + +/** + * atomic64_dec_and_test - decrement and test + * @ptr: pointer to type atomic64_t + * + * Atomically decrements @ptr by 1 and + * returns true if the result is 0, or false for all other + * cases. + */ +extern int atomic64_dec_and_test(atomic64_t *ptr); + +/** + * atomic64_inc_and_test - increment and test + * @ptr: pointer to type atomic64_t + * + * Atomically increments @ptr by 1 + * and returns true if the result is zero, or false for all + * other cases. + */ +extern int atomic64_inc_and_test(atomic64_t *ptr); + +/** + * atomic64_add_negative - add and test if negative + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. + */ +extern int atomic64_add_negative(u64 delta, atomic64_t *ptr); + +#include <asm-generic/atomic-long.h> #endif /* _ASM_X86_ATOMIC_32_H */ diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h index 8c21731984d..d605dc268e7 100644 --- a/arch/x86/include/asm/atomic_64.h +++ b/arch/x86/include/asm/atomic_64.h @@ -18,7 +18,10 @@ * * Atomically reads the value of @v. */ -#define atomic_read(v) ((v)->counter) +static inline int atomic_read(const atomic_t *v) +{ + return v->counter; +} /** * atomic_set - set atomic variable @@ -27,7 +30,10 @@ * * Atomically sets the value of @v to @i. */ -#define atomic_set(v, i) (((v)->counter) = (i)) +static inline void atomic_set(atomic_t *v, int i) +{ + v->counter = i; +} /** * atomic_add - add integer to atomic variable @@ -192,7 +198,10 @@ static inline int atomic_sub_return(int i, atomic_t *v) * Atomically reads the value of @v. * Doesn't imply a read memory barrier. */ -#define atomic64_read(v) ((v)->counter) +static inline long atomic64_read(const atomic64_t *v) +{ + return v->counter; +} /** * atomic64_set - set atomic64 variable @@ -201,7 +210,10 @@ static inline int atomic_sub_return(int i, atomic_t *v) * * Atomically sets the value of @v to @i. */ -#define atomic64_set(v, i) (((v)->counter) = (i)) +static inline void atomic64_set(atomic64_t *v, long i) +{ + v->counter = i; +} /** * atomic64_add - add integer to atomic64 variable @@ -355,11 +367,25 @@ static inline long atomic64_sub_return(long i, atomic64_t *v) #define atomic64_inc_return(v) (atomic64_add_return(1, (v))) #define atomic64_dec_return(v) (atomic64_sub_return(1, (v))) -#define atomic64_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) -#define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) +static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) +{ + return cmpxchg(&v->counter, old, new); +} + +static inline long atomic64_xchg(atomic64_t *v, long new) +{ + return xchg(&v->counter, new); +} -#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) -#define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) +static inline long atomic_cmpxchg(atomic_t *v, int old, int new) +{ + return cmpxchg(&v->counter, old, new); +} + +static inline long atomic_xchg(atomic_t *v, int new) +{ + return xchg(&v->counter, new); +} /** * atomic_add_unless - add unless the number is a given value @@ -455,5 +481,5 @@ static inline void atomic_or_long(unsigned long *v1, unsigned long v2) #define smp_mb__before_atomic_inc() barrier() #define smp_mb__after_atomic_inc() barrier() -#include <asm-generic/atomic.h> +#include <asm-generic/atomic-long.h> #endif /* _ASM_X86_ATOMIC_64_H */ diff --git a/arch/x86/include/asm/bitsperlong.h b/arch/x86/include/asm/bitsperlong.h new file mode 100644 index 00000000000..b0ae1c4dc79 --- /dev/null +++ b/arch/x86/include/asm/bitsperlong.h @@ -0,0 +1,13 @@ +#ifndef __ASM_X86_BITSPERLONG_H +#define __ASM_X86_BITSPERLONG_H + +#ifdef __x86_64__ +# define __BITS_PER_LONG 64 +#else +# define __BITS_PER_LONG 32 +#endif + +#include <asm-generic/bitsperlong.h> + +#endif /* __ASM_X86_BITSPERLONG_H */ + diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 6ba23dd9fc9..7a1065958ba 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -8,11 +8,26 @@ #ifdef __KERNEL__ +#include <asm/pgtable_types.h> + /* Physical address where kernel should be loaded. */ #define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ + (CONFIG_PHYSICAL_ALIGN - 1)) \ & ~(CONFIG_PHYSICAL_ALIGN - 1)) +/* Minimum kernel alignment, as a power of two */ +#ifdef CONFIG_X86_64 +#define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT +#else +#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_ORDER) +#endif +#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2) + +#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \ + (CONFIG_PHYSICAL_ALIGN < (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)) +#error "Invalid value for CONFIG_PHYSICAL_ALIGN" +#endif + #ifdef CONFIG_KERNEL_BZIP2 #define BOOT_HEAP_SIZE 0x400000 #else /* !CONFIG_KERNEL_BZIP2 */ diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index 433adaebf9b..1724e8de317 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -50,7 +50,8 @@ struct setup_header { __u32 ramdisk_size; __u32 bootsect_kludge; __u16 heap_end_ptr; - __u16 _pad1; + __u8 ext_loader_ver; + __u8 ext_loader_type; __u32 cmd_line_ptr; __u32 initrd_addr_max; __u32 kernel_alignment; diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h index 222802029fa..d96c1ee3a95 100644 --- a/arch/x86/include/asm/cpu_debug.h +++ b/arch/x86/include/asm/cpu_debug.h @@ -86,105 +86,7 @@ enum cpu_file_bit { CPU_VALUE_BIT, /* value */ }; -#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT) - -/* - * DisplayFamily_DisplayModel Processor Families/Processor Number Series - * -------------------------- ------------------------------------------ - * 05_01, 05_02, 05_04 Pentium, Pentium with MMX - * - * 06_01 Pentium Pro - * 06_03, 06_05 Pentium II Xeon, Pentium II - * 06_07, 06_08, 06_0A, 06_0B Pentium III Xeon, Pentum III - * - * 06_09, 060D Pentium M - * - * 06_0E Core Duo, Core Solo - * - * 06_0F Xeon 3000, 3200, 5100, 5300, 7300 series, - * Core 2 Quad, Core 2 Extreme, Core 2 Duo, - * Pentium dual-core - * 06_17 Xeon 5200, 5400 series, Core 2 Quad Q9650 - * - * 06_1C Atom - * - * 0F_00, 0F_01, 0F_02 Xeon, Xeon MP, Pentium 4 - * 0F_03, 0F_04 Xeon, Xeon MP, Pentium 4, Pentium D - * - * 0F_06 Xeon 7100, 5000 Series, Xeon MP, - * Pentium 4, Pentium D - */ - -/* Register processors bits */ -enum cpu_processor_bit { - CPU_NONE, -/* Intel */ - CPU_INTEL_PENTIUM_BIT, - CPU_INTEL_P6_BIT, - CPU_INTEL_PENTIUM_M_BIT, - CPU_INTEL_CORE_BIT, - CPU_INTEL_CORE2_BIT, - CPU_INTEL_ATOM_BIT, - CPU_INTEL_XEON_P4_BIT, - CPU_INTEL_XEON_MP_BIT, -/* AMD */ - CPU_AMD_K6_BIT, - CPU_AMD_K7_BIT, - CPU_AMD_K8_BIT, - CPU_AMD_0F_BIT, - CPU_AMD_10_BIT, - CPU_AMD_11_BIT, -}; - -#define CPU_INTEL_PENTIUM (1 << CPU_INTEL_PENTIUM_BIT) -#define CPU_INTEL_P6 (1 << CPU_INTEL_P6_BIT) -#define CPU_INTEL_PENTIUM_M (1 << CPU_INTEL_PENTIUM_M_BIT) -#define CPU_INTEL_CORE (1 << CPU_INTEL_CORE_BIT) -#define CPU_INTEL_CORE2 (1 << CPU_INTEL_CORE2_BIT) -#define CPU_INTEL_ATOM (1 << CPU_INTEL_ATOM_BIT) -#define CPU_INTEL_XEON_P4 (1 << CPU_INTEL_XEON_P4_BIT) -#define CPU_INTEL_XEON_MP (1 << CPU_INTEL_XEON_MP_BIT) - -#define CPU_INTEL_PX (CPU_INTEL_P6 | CPU_INTEL_PENTIUM_M) -#define CPU_INTEL_COREX (CPU_INTEL_CORE | CPU_INTEL_CORE2) -#define CPU_INTEL_XEON (CPU_INTEL_XEON_P4 | CPU_INTEL_XEON_MP) -#define CPU_CO_AT (CPU_INTEL_CORE | CPU_INTEL_ATOM) -#define CPU_C2_AT (CPU_INTEL_CORE2 | CPU_INTEL_ATOM) -#define CPU_CX_AT (CPU_INTEL_COREX | CPU_INTEL_ATOM) -#define CPU_CX_XE (CPU_INTEL_COREX | CPU_INTEL_XEON) -#define CPU_P6_XE (CPU_INTEL_P6 | CPU_INTEL_XEON) -#define CPU_PM_CO_AT (CPU_INTEL_PENTIUM_M | CPU_CO_AT) -#define CPU_C2_AT_XE (CPU_C2_AT | CPU_INTEL_XEON) -#define CPU_CX_AT_XE (CPU_CX_AT | CPU_INTEL_XEON) -#define CPU_P6_CX_AT (CPU_INTEL_P6 | CPU_CX_AT) -#define CPU_P6_CX_XE (CPU_P6_XE | CPU_INTEL_COREX) -#define CPU_P6_CX_AT_XE (CPU_INTEL_P6 | CPU_CX_AT_XE) -#define CPU_PM_CX_AT_XE (CPU_INTEL_PENTIUM_M | CPU_CX_AT_XE) -#define CPU_PM_CX_AT (CPU_INTEL_PENTIUM_M | CPU_CX_AT) -#define CPU_PM_CX_XE (CPU_INTEL_PENTIUM_M | CPU_CX_XE) -#define CPU_PX_CX_AT (CPU_INTEL_PX | CPU_CX_AT) -#define CPU_PX_CX_AT_XE (CPU_INTEL_PX | CPU_CX_AT_XE) - -/* Select all supported Intel CPUs */ -#define CPU_INTEL_ALL (CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE) - -#define CPU_AMD_K6 (1 << CPU_AMD_K6_BIT) -#define CPU_AMD_K7 (1 << CPU_AMD_K7_BIT) -#define CPU_AMD_K8 (1 << CPU_AMD_K8_BIT) -#define CPU_AMD_0F (1 << CPU_AMD_0F_BIT) -#define CPU_AMD_10 (1 << CPU_AMD_10_BIT) -#define CPU_AMD_11 (1 << CPU_AMD_11_BIT) - -#define CPU_K10_PLUS (CPU_AMD_10 | CPU_AMD_11) -#define CPU_K0F_PLUS (CPU_AMD_0F | CPU_K10_PLUS) -#define CPU_K8_PLUS (CPU_AMD_K8 | CPU_K0F_PLUS) -#define CPU_K7_PLUS (CPU_AMD_K7 | CPU_K8_PLUS) - -/* Select all supported AMD CPUs */ -#define CPU_AMD_ALL (CPU_AMD_K6 | CPU_K7_PLUS) - -/* Select all supported CPUs */ -#define CPU_ALL (CPU_INTEL_ALL | CPU_AMD_ALL) +#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT) #define MAX_CPU_FILES 512 @@ -220,7 +122,6 @@ struct cpu_debug_range { unsigned min; /* Register range min */ unsigned max; /* Register range max */ unsigned flag; /* Supported flags */ - unsigned model; /* Supported models */ }; #endif /* _ASM_X86_CPU_DEBUG_H */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index bb83b1c397a..4a28d22d479 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -22,7 +22,7 @@ #define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */ #define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers */ #define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */ -#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Architecture */ +#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Exception */ #define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */ #define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */ #define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */ @@ -94,6 +94,7 @@ #define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ #define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ #define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */ +#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ @@ -115,6 +116,8 @@ #define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */ #define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */ #define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */ +#define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */ +#define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */ #define X86_FEATURE_AES (4*32+25) /* AES instructions */ #define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ #define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ @@ -192,11 +195,11 @@ extern const char * const x86_power_flags[32]; #define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability)) #define setup_clear_cpu_cap(bit) do { \ clear_cpu_cap(&boot_cpu_data, bit); \ - set_bit(bit, (unsigned long *)cleared_cpu_caps); \ + set_bit(bit, (unsigned long *)cpu_caps_cleared); \ } while (0) #define setup_force_cpu_cap(bit) do { \ set_cpu_cap(&boot_cpu_data, bit); \ - clear_bit(bit, (unsigned long *)cleared_cpu_caps); \ + set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) #define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index c45f415ce31..c993e9e0fed 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -1,7 +1,6 @@ #ifndef _ASM_X86_DESC_H #define _ASM_X86_DESC_H -#ifndef __ASSEMBLY__ #include <asm/desc_defs.h> #include <asm/ldt.h> #include <asm/mmu.h> @@ -380,29 +379,4 @@ static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); } -#else -/* - * GET_DESC_BASE reads the descriptor base of the specified segment. - * - * Args: - * idx - descriptor index - * gdt - GDT pointer - * base - 32bit register to which the base will be written - * lo_w - lo word of the "base" register - * lo_b - lo byte of the "base" register - * hi_b - hi byte of the low word of the "base" register - * - * Example: - * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) - * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax. - */ -#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \ - movb idx * 8 + 4(gdt), lo_b; \ - movb idx * 8 + 7(gdt), hi_b; \ - shll $16, base; \ - movw idx * 8 + 2(gdt), lo_w; - - -#endif /* __ASSEMBLY__ */ - #endif /* _ASM_X86_DESC_H */ diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index f82fdc412c6..1c3f9435f1c 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -6,6 +6,7 @@ * Documentation/DMA-API.txt for documentation. */ +#include <linux/kmemcheck.h> #include <linux/scatterlist.h> #include <linux/dma-debug.h> #include <linux/dma-attrs.h> @@ -32,6 +33,8 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) #endif } +#include <asm-generic/dma-mapping-common.h> + /* Make sure we keep the same behaviour */ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { @@ -52,171 +55,6 @@ extern int dma_set_mask(struct device *dev, u64 mask); extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag); -static inline dma_addr_t -dma_map_single(struct device *hwdev, void *ptr, size_t size, - enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(hwdev); - dma_addr_t addr; - - BUG_ON(!valid_dma_direction(dir)); - addr = ops->map_page(hwdev, virt_to_page(ptr), - (unsigned long)ptr & ~PAGE_MASK, size, - dir, NULL); - debug_dma_map_page(hwdev, virt_to_page(ptr), - (unsigned long)ptr & ~PAGE_MASK, size, - dir, addr, true); - return addr; -} - -static inline void -dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size, - enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->unmap_page) - ops->unmap_page(dev, addr, size, dir, NULL); - debug_dma_unmap_page(dev, addr, size, dir, true); -} - -static inline int -dma_map_sg(struct device *hwdev, struct scatterlist *sg, - int nents, enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(hwdev); - int ents; - - BUG_ON(!valid_dma_direction(dir)); - ents = ops->map_sg(hwdev, sg, nents, dir, NULL); - debug_dma_map_sg(hwdev, sg, nents, ents, dir); - - return ents; -} - -static inline void -dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents, - enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(hwdev); - - BUG_ON(!valid_dma_direction(dir)); - debug_dma_unmap_sg(hwdev, sg, nents, dir); - if (ops->unmap_sg) - ops->unmap_sg(hwdev, sg, nents, dir, NULL); -} - -static inline void -dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(hwdev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_single_for_cpu) - ops->sync_single_for_cpu(hwdev, dma_handle, size, dir); - debug_dma_sync_single_for_cpu(hwdev, dma_handle, size, dir); - flush_write_buffers(); -} - -static inline void -dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(hwdev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_single_for_device) - ops->sync_single_for_device(hwdev, dma_handle, size, dir); - debug_dma_sync_single_for_device(hwdev, dma_handle, size, dir); - flush_write_buffers(); -} - -static inline void -dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(hwdev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_single_range_for_cpu) - ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, - size, dir); - debug_dma_sync_single_range_for_cpu(hwdev, dma_handle, - offset, size, dir); - flush_write_buffers(); -} - -static inline void -dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(hwdev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_single_range_for_device) - ops->sync_single_range_for_device(hwdev, dma_handle, - offset, size, dir); - debug_dma_sync_single_range_for_device(hwdev, dma_handle, - offset, size, dir); - flush_write_buffers(); -} - -static inline void -dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(hwdev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_sg_for_cpu) - ops->sync_sg_for_cpu(hwdev, sg, nelems, dir); - debug_dma_sync_sg_for_cpu(hwdev, sg, nelems, dir); - flush_write_buffers(); -} - -static inline void -dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(hwdev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_sg_for_device) - ops->sync_sg_for_device(hwdev, sg, nelems, dir); - debug_dma_sync_sg_for_device(hwdev, sg, nelems, dir); - - flush_write_buffers(); -} - -static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, - size_t offset, size_t size, - enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - dma_addr_t addr; - - BUG_ON(!valid_dma_direction(dir)); - addr = ops->map_page(dev, page, offset, size, dir, NULL); - debug_dma_map_page(dev, page, offset, size, dir, addr, false); - - return addr; -} - -static inline void dma_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->unmap_page) - ops->unmap_page(dev, addr, size, dir, NULL); - debug_dma_unmap_page(dev, addr, size, dir, false); -} - static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction dir) diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index a8f672ba100..70dac199b09 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -15,8 +15,8 @@ * - buffer allocation (memory accounting) * * - * Copyright (C) 2007-2008 Intel Corporation. - * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008 + * Copyright (C) 2007-2009 Intel Corporation. + * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009 */ #ifndef _ASM_X86_DS_H @@ -83,8 +83,10 @@ enum ds_feature { * The interrupt threshold is independent from the overflow callback * to allow users to use their own overflow interrupt handling mechanism. * - * task: the task to request recording for; - * NULL for per-cpu recording on the current cpu + * The function might sleep. + * + * task: the task to request recording for + * cpu: the cpu to request recording for * base: the base pointer for the (non-pageable) buffer; * size: the size of the provided buffer in bytes * ovfl: pointer to a function to be called on buffer overflow; @@ -93,19 +95,28 @@ enum ds_feature { * -1 if no interrupt threshold is requested. * flags: a bit-mask of the above flags */ -extern struct bts_tracer *ds_request_bts(struct task_struct *task, - void *base, size_t size, - bts_ovfl_callback_t ovfl, - size_t th, unsigned int flags); -extern struct pebs_tracer *ds_request_pebs(struct task_struct *task, - void *base, size_t size, - pebs_ovfl_callback_t ovfl, - size_t th, unsigned int flags); +extern struct bts_tracer *ds_request_bts_task(struct task_struct *task, + void *base, size_t size, + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags); +extern struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size, + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags); +extern struct pebs_tracer *ds_request_pebs_task(struct task_struct *task, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th, unsigned int flags); +extern struct pebs_tracer *ds_request_pebs_cpu(int cpu, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th, unsigned int flags); /* * Release BTS or PEBS resources * Suspend and resume BTS or PEBS tracing * + * Must be called with irq's enabled. + * * tracer: the tracer handle returned from ds_request_~() */ extern void ds_release_bts(struct bts_tracer *tracer); @@ -115,6 +126,28 @@ extern void ds_release_pebs(struct pebs_tracer *tracer); extern void ds_suspend_pebs(struct pebs_tracer *tracer); extern void ds_resume_pebs(struct pebs_tracer *tracer); +/* + * Release BTS or PEBS resources + * Suspend and resume BTS or PEBS tracing + * + * Cpu tracers must call this on the traced cpu. + * Task tracers must call ds_release_~_noirq() for themselves. + * + * May be called with irq's disabled. + * + * Returns 0 if successful; + * -EPERM if the cpu tracer does not trace the current cpu. + * -EPERM if the task tracer does not trace itself. + * + * tracer: the tracer handle returned from ds_request_~() + */ +extern int ds_release_bts_noirq(struct bts_tracer *tracer); +extern int ds_suspend_bts_noirq(struct bts_tracer *tracer); +extern int ds_resume_bts_noirq(struct bts_tracer *tracer); +extern int ds_release_pebs_noirq(struct pebs_tracer *tracer); +extern int ds_suspend_pebs_noirq(struct pebs_tracer *tracer); +extern int ds_resume_pebs_noirq(struct pebs_tracer *tracer); + /* * The raw DS buffer state as it is used for BTS and PEBS recording. @@ -170,9 +203,9 @@ struct bts_struct { } lbr; /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */ struct { - __u64 jiffies; + __u64 clock; pid_t pid; - } timestamp; + } event; } variant; }; @@ -201,8 +234,12 @@ struct bts_trace { struct pebs_trace { struct ds_trace ds; - /* the PEBS reset value */ - unsigned long long reset_value; + /* the number of valid counters in the below array */ + unsigned int counters; + +#define MAX_PEBS_COUNTERS 4 + /* the counter reset value */ + unsigned long long counter_reset[MAX_PEBS_COUNTERS]; }; @@ -237,9 +274,11 @@ extern int ds_reset_pebs(struct pebs_tracer *tracer); * Returns 0 on success; -Eerrno on error * * tracer: the tracer handle returned from ds_request_pebs() + * counter: the index of the counter * value: the new counter reset value */ -extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value); +extern int ds_set_pebs_reset(struct pebs_tracer *tracer, + unsigned int counter, u64 value); /* * Initialization @@ -252,21 +291,12 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); */ extern void ds_switch_to(struct task_struct *prev, struct task_struct *next); -/* - * Task clone/init and cleanup work - */ -extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father); -extern void ds_exit_thread(struct task_struct *tsk); - #else /* CONFIG_X86_DS */ struct cpuinfo_x86; static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {} static inline void ds_switch_to(struct task_struct *prev, struct task_struct *next) {} -static inline void ds_copy_thread(struct task_struct *tsk, - struct task_struct *father) {} -static inline void ds_exit_thread(struct task_struct *tsk) {} #endif /* CONFIG_X86_DS */ #endif /* _ASM_X86_DS_H */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index edc90f23e70..8406ed7f992 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -33,7 +33,7 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); #define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ efi_call_virt(f, a1, a2, a3, a4, a5, a6) -#define efi_ioremap(addr, size) ioremap_cache(addr, size) +#define efi_ioremap(addr, size, type) ioremap_cache(addr, size) #else /* !CONFIG_X86_32 */ @@ -84,7 +84,8 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3, efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) -extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size); +extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, + u32 type); #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index c2e6bedaf25..ff8cbfa0785 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -14,6 +14,7 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) +BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0, smp_invalidate_interrupt) @@ -49,11 +50,19 @@ BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) #ifdef CONFIG_PERF_COUNTERS -BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) +BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) #endif -#ifdef CONFIG_X86_MCE_P4THERMAL +#ifdef CONFIG_X86_THERMAL_VECTOR BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) #endif +#ifdef CONFIG_X86_MCE_THRESHOLD +BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) +#endif + +#ifdef CONFIG_X86_NEW_MCE +BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR) +#endif + #endif diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 2d81af3974a..7b2d71df39a 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -111,12 +111,9 @@ enum fixed_addresses { #ifdef CONFIG_PARAVIRT FIX_PARAVIRT_BOOTMAP, #endif - FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */ - FIX_TEXT_POKE1, + FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ + FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ __end_of_permanent_fixed_addresses, -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT - FIX_OHCI1394_BASE, -#endif /* * 256 temporary boot-time mappings, used by early_ioremap(), * before ioremap() is functional. @@ -129,6 +126,9 @@ enum fixed_addresses { FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - (__end_of_permanent_fixed_addresses & 255), FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + FIX_OHCI1394_BASE, +#endif #ifdef CONFIG_X86_32 FIX_WP_TEST, #endif diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 37555e52f98..82e3e8f0104 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -13,6 +13,8 @@ typedef struct { unsigned int irq_spurious_count; #endif unsigned int generic_irqs; /* arch dependent */ + unsigned int apic_perf_irqs; + unsigned int apic_pending_irqs; #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; @@ -20,7 +22,7 @@ typedef struct { #endif #ifdef CONFIG_X86_MCE unsigned int irq_thermal_count; -# ifdef CONFIG_X86_64 +# ifdef CONFIG_X86_MCE_THRESHOLD unsigned int irq_threshold_count; # endif #endif diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index b762ea49bd7..ba180d93b08 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -29,9 +29,12 @@ extern void apic_timer_interrupt(void); extern void generic_interrupt(void); extern void error_interrupt(void); +extern void perf_pending_interrupt(void); + extern void spurious_interrupt(void); extern void thermal_interrupt(void); extern void reschedule_interrupt(void); +extern void mce_self_interrupt(void); extern void invalidate_interrupt(void); extern void invalidate_interrupt0(void); @@ -44,6 +47,7 @@ extern void invalidate_interrupt6(void); extern void invalidate_interrupt7(void); extern void irq_move_cleanup_interrupt(void); +extern void reboot_interrupt(void); extern void threshold_interrupt(void); extern void call_function_interrupt(void); @@ -63,7 +67,26 @@ extern unsigned long io_apic_irqs; extern void init_VISWS_APIC_irqs(void); extern void setup_IO_APIC(void); extern void disable_IO_APIC(void); -extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn); + +struct io_apic_irq_attr { + int ioapic; + int ioapic_pin; + int trigger; + int polarity; +}; + +static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, + int ioapic, int ioapic_pin, + int trigger, int polarity) +{ + irq_attr->ioapic = ioapic; + irq_attr->ioapic_pin = ioapic_pin; + irq_attr->trigger = trigger; + irq_attr->polarity = polarity; +} + +extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, + struct io_apic_irq_attr *irq_attr); extern void setup_ioapic_dest(void); extern void enable_IO_APIC(void); @@ -78,7 +101,11 @@ extern void eisa_set_level_irq(unsigned int irq); /* SMP */ extern void smp_apic_timer_interrupt(struct pt_regs *); extern void smp_spurious_interrupt(struct pt_regs *); +extern void smp_generic_interrupt(struct pt_regs *); extern void smp_error_interrupt(struct pt_regs *); +#ifdef CONFIG_X86_IO_APIC +extern asmlinkage void smp_irq_move_cleanup_interrupt(void); +#endif #ifdef CONFIG_SMP extern void smp_reschedule_interrupt(struct pt_regs *); extern void smp_call_function_interrupt(struct pt_regs *); diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 71c9e518398..175adf58dd4 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -67,7 +67,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) ".previous\n" _ASM_EXTABLE(1b, 3b) : [err] "=r" (err) -#if 0 /* See comment in __save_init_fpu() below. */ +#if 0 /* See comment in fxsave() below. */ : [fx] "r" (fx), "m" (*fx), "0" (0)); #else : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0)); @@ -75,14 +75,6 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) return err; } -static inline int restore_fpu_checking(struct task_struct *tsk) -{ - if (task_thread_info(tsk)->status & TS_XSAVE) - return xrstor_checking(&tsk->thread.xstate->xsave); - else - return fxrstor_checking(&tsk->thread.xstate->fxsave); -} - /* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception is pending. Clear the x87 state here by setting it to fixed values. The kernel data segment can be sometimes 0 and sometimes @@ -120,7 +112,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) ".previous\n" _ASM_EXTABLE(1b, 3b) : [err] "=r" (err), "=m" (*fx) -#if 0 /* See comment in __fxsave_clear() below. */ +#if 0 /* See comment in fxsave() below. */ : [fx] "r" (fx), "0" (0)); #else : [fx] "cdaSDb" (fx), "0" (0)); @@ -185,12 +177,9 @@ static inline void tolerant_fwait(void) asm volatile("fnclex ; fwait"); } -static inline void restore_fpu(struct task_struct *tsk) +/* perform fxrstor iff the processor has extended states, otherwise frstor */ +static inline int fxrstor_checking(struct i387_fxsave_struct *fx) { - if (task_thread_info(tsk)->status & TS_XSAVE) { - xrstor_checking(&tsk->thread.xstate->xsave); - return; - } /* * The "nop" is needed to make the instructions the same * length. @@ -199,7 +188,9 @@ static inline void restore_fpu(struct task_struct *tsk) "nop ; frstor %1", "fxrstor %1", X86_FEATURE_FXSR, - "m" (tsk->thread.xstate->fxsave)); + "m" (*fx)); + + return 0; } /* We need a safe address that is cheap to find and that is already @@ -262,6 +253,14 @@ end: #endif /* CONFIG_X86_64 */ +static inline int restore_fpu_checking(struct task_struct *tsk) +{ + if (task_thread_info(tsk)->status & TS_XSAVE) + return xrstor_checking(&tsk->thread.xstate->xsave); + else + return fxrstor_checking(&tsk->thread.xstate->fxsave); +} + /* * Signal frame handlers... */ @@ -305,18 +304,18 @@ static inline void kernel_fpu_end(void) /* * Some instructions like VIA's padlock instructions generate a spurious * DNA fault but don't modify SSE registers. And these instructions - * get used from interrupt context aswell. To prevent these kernel instructions - * in interrupt context interact wrongly with other user/kernel fpu usage, we + * get used from interrupt context as well. To prevent these kernel instructions + * in interrupt context interacting wrongly with other user/kernel fpu usage, we * should use them only in the context of irq_ts_save/restore() */ static inline int irq_ts_save(void) { /* - * If we are in process context, we are ok to take a spurious DNA fault. - * Otherwise, doing clts() in process context require pre-emption to - * be disabled or some heavy lifting like kernel_fpu_begin() + * If in process context and not atomic, we can take a spurious DNA fault. + * Otherwise, doing clts() in process context requires disabling preemption + * or some heavy lifting like kernel_fpu_begin() */ - if (!in_interrupt()) + if (!in_atomic()) return 0; if (read_cr0() & X86_CR0_TS) { diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 1a99e6c092a..58d7091eeb1 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -60,8 +60,4 @@ extern struct irq_chip i8259A_chip; extern void mask_8259A(void); extern void unmask_8259A(void); -#ifdef CONFIG_X86_32 -extern void init_ISA_irqs(void); -#endif - #endif /* _ASM_X86_I8259_H */ diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h deleted file mode 100644 index fa0fd068bc2..00000000000 --- a/arch/x86/include/asm/intel_arch_perfmon.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H -#define _ASM_X86_INTEL_ARCH_PERFMON_H - -#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 -#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 - -#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 -#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 - -#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) -#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) -#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) -#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) - -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ - (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) - -union cpuid10_eax { - struct { - unsigned int version_id:8; - unsigned int num_counters:8; - unsigned int bit_width:8; - unsigned int mask_length:8; - } split; - unsigned int full; -}; - -#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */ diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 9d826e43601..330ee807f89 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -154,22 +154,20 @@ extern int timer_through_8259; extern int io_apic_get_unique_id(int ioapic, int apic_id); extern int io_apic_get_version(int ioapic); extern int io_apic_get_redir_entries(int ioapic); -extern int io_apic_set_pci_routing(int ioapic, int pin, int irq, - int edge_level, int active_high_low); #endif /* CONFIG_ACPI */ +struct io_apic_irq_attr; +extern int io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr); extern int (*ioapic_renumber_irq)(int ioapic, int irq); extern void ioapic_init_mappings(void); +extern void ioapic_insert_resources(void); -#ifdef CONFIG_X86_64 extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); -extern void reinit_intr_remapped_IO_APIC(int intr_remapping, - struct IO_APIC_route_entry **ioapic_entries); -#endif extern void probe_nr_irqs_gsi(void); @@ -183,6 +181,7 @@ extern void ioapic_write_entry(int apic, int pin, #define io_apic_assign_pci_irqs 0 static const int timer_through_8259 = 0; static inline void ioapic_init_mappings(void) { } +static inline void ioapic_insert_resources(void) { } static inline void probe_nr_irqs_gsi(void) { } #endif diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index 86af26091d6..0e9fe1d9d97 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -1,3 +1,6 @@ +#ifndef _ASM_X86_IOMAP_H +#define _ASM_X86_IOMAP_H + /* * Copyright © 2008 Ingo Molnar * @@ -31,3 +34,5 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); void iounmap_atomic(void *kvaddr, enum km_type type); + +#endif /* _ASM_X86_IOMAP_H */ diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index af326a2975b..fd6d21bbee6 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -6,6 +6,7 @@ extern void no_iommu_init(void); extern struct dma_map_ops nommu_dma_ops; extern int force_iommu, no_iommu; extern int iommu_detected; +extern int iommu_pass_through; /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 0396760fccb..f275e224450 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -1,6 +1,6 @@ #ifndef _ASM_X86_IRQ_REMAPPING_H #define _ASM_X86_IRQ_REMAPPING_H -#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8) +#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8) #endif /* _ASM_X86_IRQ_REMAPPING_H */ diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 3cbd79bbb47..5b21f0ec3df 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -25,6 +25,7 @@ */ #define NMI_VECTOR 0x02 +#define MCE_VECTOR 0x12 /* * IDT vectors usable for external interrupt sources start @@ -34,6 +35,7 @@ #ifdef CONFIG_X86_32 # define SYSCALL_VECTOR 0x80 +# define IA32_SYSCALL_VECTOR 0x80 #else # define IA32_SYSCALL_VECTOR 0x80 #endif @@ -86,13 +88,8 @@ #define CALL_FUNCTION_VECTOR 0xfc #define CALL_FUNCTION_SINGLE_VECTOR 0xfb #define THERMAL_APIC_VECTOR 0xfa - -#ifdef CONFIG_X86_32 -/* 0xf8 - 0xf9 : free */ -#else -# define THRESHOLD_APIC_VECTOR 0xf9 -# define UV_BAU_MESSAGE 0xf8 -#endif +#define THRESHOLD_APIC_VECTOR 0xf9 +#define REBOOT_VECTOR 0xf8 /* f0-f7 used for spreading out TLB flushes: */ #define INVALIDATE_TLB_VECTOR_END 0xf7 @@ -107,14 +104,21 @@ #define LOCAL_TIMER_VECTOR 0xef /* - * Performance monitoring interrupt vector: + * Generic system vector for platform specific use */ -#define LOCAL_PERF_VECTOR 0xee +#define GENERIC_INTERRUPT_VECTOR 0xed /* - * Generic system vector for platform specific use + * Performance monitoring pending work vector: */ -#define GENERIC_INTERRUPT_VECTOR 0xed +#define LOCAL_PENDING_VECTOR 0xec + +#define UV_BAU_MESSAGE 0xec + +/* + * Self IPI vector for machine checks + */ +#define MCE_SELF_VECTOR 0xeb /* * First APIC vector available to drivers: (vectors 0x30-0xee) we diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 2bdab21f089..c6ccbe7e81a 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -12,9 +12,15 @@ static inline unsigned long native_save_fl(void) { unsigned long flags; + /* + * Note: this needs to be "=r" not "=rm", because we have the + * stack offset from what gcc expects at the time the "pop" is + * executed, and so a memory reference with respect to the stack + * would end up using the wrong address. + */ asm volatile("# __raw_save_flags\n\t" "pushf ; pop %0" - : "=g" (flags) + : "=r" (flags) : /* no input */ : "memory"); diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h index 54c8cc53b24..c2d1f3b58e5 100644 --- a/arch/x86/include/asm/k8.h +++ b/arch/x86/include/asm/k8.h @@ -12,4 +12,17 @@ extern int cache_k8_northbridges(void); extern void k8_flush_garts(void); extern int k8_scan_nodes(unsigned long start, unsigned long end); +#ifdef CONFIG_K8_NB +static inline struct pci_dev *node_to_k8_nb_misc(int node) +{ + return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL; +} +#else +static inline struct pci_dev *node_to_k8_nb_misc(int node) +{ + return NULL; +} +#endif + + #endif /* _ASM_X86_K8_H */ diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h index 5759c165a5c..9e00a731a7f 100644 --- a/arch/x86/include/asm/kmap_types.h +++ b/arch/x86/include/asm/kmap_types.h @@ -2,28 +2,11 @@ #define _ASM_X86_KMAP_TYPES_H #if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM) -# define D(n) __KM_FENCE_##n , -#else -# define D(n) +#define __WITH_KM_FENCE #endif -enum km_type { -D(0) KM_BOUNCE_READ, -D(1) KM_SKB_SUNRPC_DATA, -D(2) KM_SKB_DATA_SOFTIRQ, -D(3) KM_USER0, -D(4) KM_USER1, -D(5) KM_BIO_SRC_IRQ, -D(6) KM_BIO_DST_IRQ, -D(7) KM_PTE0, -D(8) KM_PTE1, -D(9) KM_IRQ0, -D(10) KM_IRQ1, -D(11) KM_SOFTIRQ0, -D(12) KM_SOFTIRQ1, -D(13) KM_TYPE_NR -}; +#include <asm-generic/kmap_types.h> -#undef D +#undef __WITH_KM_FENCE #endif /* _ASM_X86_KMAP_TYPES_H */ diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h new file mode 100644 index 00000000000..ed01518f297 --- /dev/null +++ b/arch/x86/include/asm/kmemcheck.h @@ -0,0 +1,42 @@ +#ifndef ASM_X86_KMEMCHECK_H +#define ASM_X86_KMEMCHECK_H + +#include <linux/types.h> +#include <asm/ptrace.h> + +#ifdef CONFIG_KMEMCHECK +bool kmemcheck_active(struct pt_regs *regs); + +void kmemcheck_show(struct pt_regs *regs); +void kmemcheck_hide(struct pt_regs *regs); + +bool kmemcheck_fault(struct pt_regs *regs, + unsigned long address, unsigned long error_code); +bool kmemcheck_trap(struct pt_regs *regs); +#else +static inline bool kmemcheck_active(struct pt_regs *regs) +{ + return false; +} + +static inline void kmemcheck_show(struct pt_regs *regs) +{ +} + +static inline void kmemcheck_hide(struct pt_regs *regs) +{ +} + +static inline bool kmemcheck_fault(struct pt_regs *regs, + unsigned long address, unsigned long error_code) +{ + return false; +} + +static inline bool kmemcheck_trap(struct pt_regs *regs) +{ + return false; +} +#endif /* CONFIG_KMEMCHECK */ + +#endif diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index dc3f6cf1170..125be8b1956 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -16,6 +16,7 @@ #define __KVM_HAVE_MSI #define __KVM_HAVE_USER_NMI #define __KVM_HAVE_GUEST_DEBUG +#define __KVM_HAVE_MSIX /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f0faf58044f..eabdc1cfab5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -185,6 +185,7 @@ union kvm_mmu_page_role { unsigned access:3; unsigned invalid:1; unsigned cr4_pge:1; + unsigned nxe:1; }; }; @@ -212,7 +213,6 @@ struct kvm_mmu_page { int multimapped; /* More than one parent_pte? */ int root_count; /* Currently serving as active root */ bool unsync; - bool global; unsigned int unsync_children; union { u64 *parent_pte; /* !multimapped */ @@ -261,13 +261,11 @@ struct kvm_mmu { union kvm_mmu_page_role base_role; u64 *pae_root; + u64 rsvd_bits_mask[2][4]; }; struct kvm_vcpu_arch { u64 host_tsc; - int interrupt_window_open; - unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ - DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); /* * rip and regs accesses must go through * kvm_{register,rip}_{read,write} functions. @@ -286,6 +284,7 @@ struct kvm_vcpu_arch { u64 shadow_efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ + int32_t apic_arb_prio; int mp_state; int sipi_vector; u64 ia32_misc_enable_msr; @@ -320,6 +319,8 @@ struct kvm_vcpu_arch { struct kvm_pio_request pio; void *pio_data; + u8 event_exit_inst_len; + struct kvm_queued_exception { bool pending; bool has_error_code; @@ -329,11 +330,12 @@ struct kvm_vcpu_arch { struct kvm_queued_interrupt { bool pending; + bool soft; u8 nr; } interrupt; struct { - int active; + int vm86_active; u8 save_iopl; struct kvm_save_segment { u16 selector; @@ -356,9 +358,9 @@ struct kvm_vcpu_arch { unsigned int time_offset; struct page *time_page; + bool singlestep; /* guest is single stepped by KVM */ bool nmi_pending; bool nmi_injected; - bool nmi_window_open; struct mtrr_state_type mtrr_state; u32 pat; @@ -392,15 +394,14 @@ struct kvm_arch{ */ struct list_head active_mmu_pages; struct list_head assigned_dev_head; - struct list_head oos_global_pages; struct iommu_domain *iommu_domain; + int iommu_flags; struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; struct hlist_head irq_ack_notifier_list; int vapics_in_nmi_mode; - int round_robin_prev_vcpu; unsigned int tss_addr; struct page *apic_access_page; @@ -423,7 +424,6 @@ struct kvm_vm_stat { u32 mmu_recycled; u32 mmu_cache_miss; u32 mmu_unsync; - u32 mmu_unsync_global; u32 remote_tlb_flush; u32 lpages; }; @@ -443,7 +443,6 @@ struct kvm_vcpu_stat { u32 halt_exits; u32 halt_wakeup; u32 request_irq_exits; - u32 request_nmi_exits; u32 irq_exits; u32 host_state_reload; u32 efer_reload; @@ -511,20 +510,22 @@ struct kvm_x86_ops { void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); + void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); + u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); void (*patch_hypercall)(struct kvm_vcpu *vcpu, unsigned char *hypercall_addr); - int (*get_irq)(struct kvm_vcpu *vcpu); - void (*set_irq)(struct kvm_vcpu *vcpu, int vec); + void (*set_irq)(struct kvm_vcpu *vcpu); + void (*set_nmi)(struct kvm_vcpu *vcpu); void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, bool has_error_code, u32 error_code); - bool (*exception_injected)(struct kvm_vcpu *vcpu); - void (*inject_pending_irq)(struct kvm_vcpu *vcpu); - void (*inject_pending_vectors)(struct kvm_vcpu *vcpu, - struct kvm_run *run); - + int (*interrupt_allowed)(struct kvm_vcpu *vcpu); + int (*nmi_allowed)(struct kvm_vcpu *vcpu); + void (*enable_nmi_window)(struct kvm_vcpu *vcpu); + void (*enable_irq_window)(struct kvm_vcpu *vcpu); + void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); - int (*get_mt_mask_shift)(void); + u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); }; extern struct kvm_x86_ops *kvm_x86_ops; @@ -538,7 +539,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); void kvm_mmu_set_base_ptes(u64 base_pte); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask); + u64 dirty_mask, u64 nx_mask, u64 x_mask); int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); @@ -552,6 +553,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, gpa_t addr, unsigned long *ret); +u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); extern bool tdp_enabled; @@ -563,6 +565,7 @@ enum emulation_result { #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) +#define EMULTYPE_SKIP (1 << 2) int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, unsigned long cr2, u16 error_code, int emulation_type); void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); @@ -638,7 +641,6 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); -void kvm_mmu_sync_global(struct kvm_vcpu *vcpu); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); @@ -769,6 +771,8 @@ enum { #define HF_GIF_MASK (1 << 0) #define HF_HIF_MASK (1 << 1) #define HF_VINTR_MASK (1 << 2) +#define HF_NMI_MASK (1 << 3) +#define HF_IRET_MASK (1 << 4) /* * Hardware virtualization extension instructions may fault if a @@ -791,5 +795,6 @@ asmlinkage void kvm_handle_fault_on_reboot(void); #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_age_hva(struct kvm *kvm, unsigned long hva); +int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h index 6a159732881..b7ed2c42311 100644 --- a/arch/x86/include/asm/kvm_x86_emulate.h +++ b/arch/x86/include/asm/kvm_x86_emulate.h @@ -143,6 +143,9 @@ struct decode_cache { struct fetch_cache fetch; }; +#define X86_SHADOW_INT_MOV_SS 1 +#define X86_SHADOW_INT_STI 2 + struct x86_emulate_ctxt { /* Register state before/after emulation. */ struct kvm_vcpu *vcpu; @@ -152,6 +155,9 @@ struct x86_emulate_ctxt { int mode; u32 cs_base; + /* interruptibility state, as a result of execution of STI or MOV SS */ + int interruptibility; + /* decode cache */ struct decode_cache decode; }; diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 1caf57628b9..5136dad57cb 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -17,8 +17,12 @@ /* Pages for switcher itself, then two pages per cpu */ #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) -/* We map at -4M for ease of mapping into the guest (one PTE page). */ +/* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */ +#ifdef CONFIG_X86_PAE +#define SWITCHER_ADDR 0xFFE00000 +#else #define SWITCHER_ADDR 0xFFC00000 +#endif /* Found in switcher.S */ extern unsigned long default_idt_entries[]; diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index faae1996487..ba0eed8aa1a 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -12,11 +12,13 @@ #define LHCALL_TS 8 #define LHCALL_SET_CLOCKEVENT 9 #define LHCALL_HALT 10 +#define LHCALL_SET_PMD 13 #define LHCALL_SET_PTE 14 -#define LHCALL_SET_PMD 15 +#define LHCALL_SET_PGD 15 #define LHCALL_LOAD_TLS 16 #define LHCALL_NOTIFY 17 #define LHCALL_LOAD_GDT_ENTRY 18 +#define LHCALL_SEND_INTERRUPTS 19 #define LGUEST_TRAP_ENTRY 0x1F @@ -28,27 +30,28 @@ #include <asm/hw_irq.h> #include <asm/kvm_para.h> -/*G:031 But first, how does our Guest contact the Host to ask for privileged +/*G:030 + * But first, how does our Guest contact the Host to ask for privileged * operations? There are two ways: the direct way is to make a "hypercall", * to make requests of the Host Itself. * - * We use the KVM hypercall mechanism. Eighteen hypercalls are - * available: the hypercall number is put in the %eax register, and the - * arguments (when required) are placed in %ebx, %ecx and %edx. If a return - * value makes sense, it's returned in %eax. + * We use the KVM hypercall mechanism, though completely different hypercall + * numbers. Seventeen hypercalls are available: the hypercall number is put in + * the %eax register, and the arguments (when required) are placed in %ebx, + * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax. * * Grossly invalid calls result in Sudden Death at the hands of the vengeful * Host, rather than returning failure. This reflects Winston Churchill's - * definition of a gentleman: "someone who is only rude intentionally". */ -/*:*/ + * definition of a gentleman: "someone who is only rude intentionally". +:*/ /* Can't use our min() macro here: needs to be a constant */ #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) #define LHCALL_RING_SIZE 64 struct hcall_args { - /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */ - unsigned long arg0, arg1, arg2, arg3; + /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */ + unsigned long arg0, arg1, arg2, arg3, arg4; }; #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 4f8c199584e..5cdd8d100ec 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -1,8 +1,6 @@ #ifndef _ASM_X86_MCE_H #define _ASM_X86_MCE_H -#ifdef __x86_64__ - #include <linux/types.h> #include <asm/ioctls.h> @@ -10,21 +8,35 @@ * Machine Check support for x86 */ -#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ -#define MCG_EXT_P (1ULL<<9) /* Extended registers available */ -#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ - -#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ -#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ -#define MCG_STATUS_MCIP (1UL<<2) /* machine check in progress */ - -#define MCI_STATUS_VAL (1UL<<63) /* valid error */ -#define MCI_STATUS_OVER (1UL<<62) /* previous errors lost */ -#define MCI_STATUS_UC (1UL<<61) /* uncorrected error */ -#define MCI_STATUS_EN (1UL<<60) /* error enabled */ -#define MCI_STATUS_MISCV (1UL<<59) /* misc error reg. valid */ -#define MCI_STATUS_ADDRV (1UL<<58) /* addr reg. valid */ -#define MCI_STATUS_PCC (1UL<<57) /* processor context corrupt */ +#define MCG_BANKCNT_MASK 0xff /* Number of Banks */ +#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */ +#define MCG_EXT_P (1ULL<<9) /* Extended registers available */ +#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ +#define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ +#define MCG_EXT_CNT_SHIFT 16 +#define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT) +#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */ + +#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ +#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */ +#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */ + +#define MCI_STATUS_VAL (1ULL<<63) /* valid error */ +#define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */ +#define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */ +#define MCI_STATUS_EN (1ULL<<60) /* error enabled */ +#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */ +#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */ +#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */ +#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ +#define MCI_STATUS_AR (1ULL<<55) /* Action required */ + +/* MISC register defines */ +#define MCM_ADDR_SEGOFF 0 /* segment offset */ +#define MCM_ADDR_LINEAR 1 /* linear address */ +#define MCM_ADDR_PHYS 2 /* physical address */ +#define MCM_ADDR_MEM 3 /* memory address */ +#define MCM_ADDR_GENERIC 7 /* generic */ /* Fields are zero when not available */ struct mce { @@ -34,13 +46,19 @@ struct mce { __u64 mcgstatus; __u64 ip; __u64 tsc; /* cpu time stamp counter */ - __u64 res1; /* for future extension */ - __u64 res2; /* dito. */ + __u64 time; /* wall time_t when error was detected */ + __u8 cpuvendor; /* cpu vendor as encoded in system.h */ + __u8 pad1; + __u16 pad2; + __u32 cpuid; /* CPUID 1 EAX */ __u8 cs; /* code segment */ __u8 bank; /* machine check bank */ - __u8 cpu; /* cpu that raised the error */ + __u8 cpu; /* cpu number; obsolete; use extcpu now */ __u8 finished; /* entry is valid */ - __u32 pad; + __u32 extcpu; /* linux cpu number that detected the error */ + __u32 socketid; /* CPU socket ID */ + __u32 apicid; /* CPU initial apic ID */ + __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ }; /* @@ -57,7 +75,7 @@ struct mce_log { unsigned len; /* = MCE_LOG_LEN */ unsigned next; unsigned flags; - unsigned pad0; + unsigned recordlen; /* length of struct mce */ struct mce entry[MCE_LOG_LEN]; }; @@ -82,20 +100,41 @@ struct mce_log { #define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9) #define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0) -#endif /* __x86_64__ */ - #ifdef __KERNEL__ -#ifdef CONFIG_X86_32 +#include <linux/percpu.h> +#include <linux/init.h> +#include <asm/atomic.h> + extern int mce_disabled; -#else /* CONFIG_X86_32 */ +extern int mce_p5_enabled; -#include <asm/atomic.h> +#ifdef CONFIG_X86_MCE +void mcheck_init(struct cpuinfo_x86 *c); +#else +static inline void mcheck_init(struct cpuinfo_x86 *c) {} +#endif + +#ifdef CONFIG_X86_OLD_MCE +extern int nr_mce_banks; +void amd_mcheck_init(struct cpuinfo_x86 *c); +void intel_p4_mcheck_init(struct cpuinfo_x86 *c); +void intel_p6_mcheck_init(struct cpuinfo_x86 *c); +#endif + +#ifdef CONFIG_X86_ANCIENT_MCE +void intel_p5_mcheck_init(struct cpuinfo_x86 *c); +void winchip_mcheck_init(struct cpuinfo_x86 *c); +static inline void enable_p5_mce(void) { mce_p5_enabled = 1; } +#else +static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} +static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} +static inline void enable_p5_mce(void) {} +#endif void mce_setup(struct mce *m); void mce_log(struct mce *m); -DECLARE_PER_CPU(struct sys_device, device_mce); -extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); +DECLARE_PER_CPU(struct sys_device, mce_dev); /* * To support more than 128 would need to escape the predefined @@ -104,6 +143,8 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); #define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) #ifdef CONFIG_X86_MCE_INTEL +extern int mce_cmci_disabled; +extern int mce_ignore_ce; void mce_intel_feature_init(struct cpuinfo_x86 *c); void cmci_clear(void); void cmci_reenable(void); @@ -123,14 +164,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c); static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } #endif -extern int mce_available(struct cpuinfo_x86 *c); +int mce_available(struct cpuinfo_x86 *c); -void mce_log_therm_throt_event(__u64 status); +DECLARE_PER_CPU(unsigned, mce_exception_count); +DECLARE_PER_CPU(unsigned, mce_poll_count); extern atomic_t mce_entry; -extern void do_machine_check(struct pt_regs *, long); - typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); @@ -139,19 +179,40 @@ enum mcp_flags { MCP_UC = (1 << 1), /* log uncorrected errors */ MCP_DONTLOG = (1 << 2), /* only clear, don't log */ }; -extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); +void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); -extern int mce_notify_user(void); +int mce_notify_irq(void); +void mce_notify_process(void); -#endif /* !CONFIG_X86_32 */ +DECLARE_PER_CPU(struct mce, injectm); +extern struct file_operations mce_chrdev_ops; -#ifdef CONFIG_X86_MCE -extern void mcheck_init(struct cpuinfo_x86 *c); -#else -#define mcheck_init(c) do { } while (0) -#endif +/* + * Exception handler + */ + +/* Call the installed machine check handler for this CPU setup. */ +extern void (*machine_check_vector)(struct pt_regs *, long error_code); +void do_machine_check(struct pt_regs *, long); + +/* + * Threshold handler + */ extern void (*mce_threshold_vector)(void); +extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); + +/* + * Thermal handler + */ + +void intel_init_thermal(struct cpuinfo_x86 *c); + +#ifdef CONFIG_X86_NEW_MCE +void mce_log_therm_throt_event(__u64 status); +#else +static inline void mce_log_therm_throt_event(__u64 status) {} +#endif #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index c882664716c..ef51b501e22 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -9,20 +9,31 @@ struct cpu_signature { struct device; +enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; + struct microcode_ops { - int (*request_microcode_user) (int cpu, const void __user *buf, size_t size); - int (*request_microcode_fw) (int cpu, struct device *device); + enum ucode_state (*request_microcode_user) (int cpu, + const void __user *buf, size_t size); - void (*apply_microcode) (int cpu); + enum ucode_state (*request_microcode_fw) (int cpu, + struct device *device); - int (*collect_cpu_info) (int cpu, struct cpu_signature *csig); void (*microcode_fini_cpu) (int cpu); + + /* + * The generic 'microcode_core' part guarantees that + * the callbacks below run on a target cpu when they + * are being called. + * See also the "Synchronization" section in microcode_core.c. + */ + int (*apply_microcode) (int cpu); + int (*collect_cpu_info) (int cpu, struct cpu_signature *csig); }; struct ucode_cpu_info { - struct cpu_signature cpu_sig; - int valid; - void *mc; + struct cpu_signature cpu_sig; + int valid; + void *mc; }; extern struct ucode_cpu_info ucode_cpu_info[]; diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h index 90bc4108a4f..751af2550ed 100644 --- a/arch/x86/include/asm/mman.h +++ b/arch/x86/include/asm/mman.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_MMAN_H #define _ASM_X86_MMAN_H -#include <asm-generic/mman.h> +#include <asm-generic/mman-common.h> #define MAP_32BIT 0x40 /* only give out 32bit addresses */ diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 642fc7fc8cd..e2a1bb6d71e 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -61,9 +61,11 @@ extern void get_smp_config(void); #ifdef CONFIG_X86_MPPARSE extern void find_smp_config(void); extern void early_reserve_e820_mpc_new(void); +extern int enable_update_mptable; #else static inline void find_smp_config(void) { } static inline void early_reserve_e820_mpc_new(void) { } +#define enable_update_mptable 0 #endif void __cpuinit generic_processor_info(int apicid, int version); @@ -72,20 +74,13 @@ extern void mp_register_ioapic(int id, u32 address, u32 gsi_base); extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi); extern void mp_config_acpi_legacy_irqs(void); -extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low); +struct device; +extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level, + int active_high_low); extern int acpi_probe_gsi(void); #ifdef CONFIG_X86_IO_APIC -extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, - u32 gsi, int triggering, int polarity); extern int mp_find_ioapic(int gsi); extern int mp_find_ioapic_pin(int ioapic, int gsi); -#else -static inline int -mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, - u32 gsi, int triggering, int polarity) -{ - return 0; -} #endif #else /* !CONFIG_ACPI: */ static inline int acpi_probe_gsi(void) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ec41fc16c16..6be7fc254b5 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -121,7 +121,6 @@ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d #define MSR_K8_SYSCFG 0xc0010010 -#define MSR_K8_HWCR 0xc0010015 #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 @@ -208,7 +207,14 @@ #define MSR_IA32_THERM_CONTROL 0x0000019a #define MSR_IA32_THERM_INTERRUPT 0x0000019b + +#define THERM_INT_LOW_ENABLE (1 << 0) +#define THERM_INT_HIGH_ENABLE (1 << 1) + #define MSR_IA32_THERM_STATUS 0x0000019c + +#define THERM_STATUS_PROCHOT (1 << 0) + #define MSR_IA32_MISC_ENABLE 0x000001a0 /* MISC_ENABLE bits: architectural */ @@ -240,10 +246,6 @@ #define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << 38) #define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << 39) -/* Intel Model 6 */ -#define MSR_P6_EVNTSEL0 0x00000186 -#define MSR_P6_EVNTSEL1 0x00000187 - /* P4/Xeon+ specific */ #define MSR_IA32_MCG_EAX 0x00000180 #define MSR_IA32_MCG_EBX 0x00000181 diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 638bf624180..48ad9d29484 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -3,15 +3,23 @@ #include <asm/msr-index.h> -#ifndef __ASSEMBLY__ -# include <linux/types.h> -#endif - #ifdef __KERNEL__ #ifndef __ASSEMBLY__ +#include <linux/types.h> #include <asm/asm.h> #include <asm/errno.h> +#include <asm/cpumask.h> + +struct msr { + union { + struct { + u32 l; + u32 h; + }; + u64 q; + }; +}; static inline unsigned long long native_read_tscp(unsigned int *aux) { @@ -216,6 +224,8 @@ do { \ #ifdef CONFIG_SMP int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); +void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); +void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); #else /* CONFIG_SMP */ @@ -229,6 +239,16 @@ static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) wrmsr(msr_no, l, h); return 0; } +static inline void rdmsr_on_cpus(const cpumask_t *m, u32 msr_no, + struct msr *msrs) +{ + rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h)); +} +static inline void wrmsr_on_cpus(const cpumask_t *m, u32 msr_no, + struct msr *msrs) +{ + wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h); +} static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { @@ -241,6 +261,4 @@ static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) #endif /* CONFIG_SMP */ #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ - - #endif /* _ASM_X86_MSR_H */ diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index c45a0a568df..c86e5ed4af5 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -64,7 +64,7 @@ static inline int nmi_watchdog_active(void) * but since they are power of two we could use a * cheaper way --cvg */ - return nmi_watchdog & 0x3; + return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC); } #endif @@ -72,7 +72,6 @@ void lapic_watchdog_stop(void); int lapic_watchdog_init(unsigned nmi_hz); int lapic_wd_event(unsigned nmi_hz); unsigned lapic_adjust_nmi_hz(unsigned hz); -int lapic_watchdog_ok(void); void disable_lapic_nmi_watchdog(void); void enable_lapic_nmi_watchdog(void); void stop_nmi(void); diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 064ed6df4cb..c4ae822e415 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -17,9 +17,6 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks, extern void numa_init_array(void); extern int numa_off; -extern void srat_reserve_add_area(int nodeid); -extern int hotadd_percent; - extern s16 apicid_to_node[MAX_LOCAL_APIC]; extern unsigned long numa_free_all_bootmem(void); @@ -27,6 +24,13 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long end); #ifdef CONFIG_NUMA +/* + * Too small node sizes may confuse the VM badly. Usually they + * result from BIOS bugs. So dont recognize nodes as standalone + * NUMA entities that have less than this amount of RAM listed: + */ +#define NODE_MIN_SIZE (4*1024*1024) + extern void __init init_cpu_to_node(void); extern void __cpuinit numa_set_node(int cpu, int node); extern void __cpuinit numa_clear_node(int cpu); diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 89ed9d70b0a..625c3f0e741 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -56,7 +56,7 @@ extern bool __virt_addr_valid(unsigned long kaddr); #endif /* __ASSEMBLY__ */ #include <asm-generic/memory_model.h> -#include <asm-generic/page.h> +#include <asm-generic/getorder.h> #define __HAVE_ARCH_GATE_AREA 1 diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index 0f915ae649a..6f1b7331313 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -54,10 +54,6 @@ extern unsigned int __VMALLOC_RESERVE; extern int sysctl_legacy_va_layout; extern void find_low_pfn_range(void); -extern unsigned long init_memory_mapping(unsigned long start, - unsigned long end); -extern void initmem_init(unsigned long, unsigned long); -extern void free_initmem(void); extern void setup_bootmem_allocator(void); #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index d38c91b7024..7639dbf5d22 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -32,24 +32,16 @@ */ #define __PAGE_OFFSET _AC(0xffff880000000000, UL) -#define __PHYSICAL_START CONFIG_PHYSICAL_START -#define __KERNEL_ALIGN 0x200000 - -/* - * Make sure kernel is aligned to 2MB address. Catching it at compile - * time is better. Change your config file and compile the kernel - * for a 2MB aligned address (CONFIG_PHYSICAL_START) - */ -#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0 -#error "CONFIG_PHYSICAL_START must be a multiple of 2MB" -#endif +#define __PHYSICAL_START ((CONFIG_PHYSICAL_START + \ + (CONFIG_PHYSICAL_ALIGN - 1)) & \ + ~(CONFIG_PHYSICAL_ALIGN - 1)) #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) #define __START_KERNEL_map _AC(0xffffffff80000000, UL) -/* See Documentation/x86_64/mm.txt for a description of the memory map. */ +/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ #define __PHYSICAL_MASK_SHIFT 46 -#define __VIRTUAL_MASK_SHIFT 48 +#define __VIRTUAL_MASK_SHIFT 47 /* * Kernel image size is limited to 512 MB (see level2_kernel_pgt in @@ -71,12 +63,6 @@ extern unsigned long __phys_addr(unsigned long); #define vmemmap ((struct page *)VMEMMAP_START) -extern unsigned long init_memory_mapping(unsigned long start, - unsigned long end); - -extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn); -extern void free_initmem(void); - extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 826ad37006a..6473f5ccff8 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -46,6 +46,12 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; +extern unsigned long init_memory_mapping(unsigned long start, + unsigned long end); + +extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn); +extern void free_initmem(void); + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PAGE_DEFS_H */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index a53da004e08..4fb37c8a083 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -56,6 +56,7 @@ struct desc_ptr; struct tss_struct; struct mm_struct; struct desc_struct; +struct task_struct; /* * Wrapper type for pointers to code which uses the non-standard @@ -203,7 +204,8 @@ struct pv_cpu_ops { void (*swapgs)(void); - struct pv_lazy_ops lazy_mode; + void (*start_context_switch)(struct task_struct *prev); + void (*end_context_switch)(struct task_struct *next); }; struct pv_irq_ops { @@ -1399,25 +1401,23 @@ enum paravirt_lazy_mode { }; enum paravirt_lazy_mode paravirt_get_lazy_mode(void); -void paravirt_enter_lazy_cpu(void); -void paravirt_leave_lazy_cpu(void); +void paravirt_start_context_switch(struct task_struct *prev); +void paravirt_end_context_switch(struct task_struct *next); + void paravirt_enter_lazy_mmu(void); void paravirt_leave_lazy_mmu(void); -void paravirt_leave_lazy(enum paravirt_lazy_mode mode); -#define __HAVE_ARCH_ENTER_LAZY_CPU_MODE -static inline void arch_enter_lazy_cpu_mode(void) +#define __HAVE_ARCH_START_CONTEXT_SWITCH +static inline void arch_start_context_switch(struct task_struct *prev) { - PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter); + PVOP_VCALL1(pv_cpu_ops.start_context_switch, prev); } -static inline void arch_leave_lazy_cpu_mode(void) +static inline void arch_end_context_switch(struct task_struct *next) { - PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave); + PVOP_VCALL1(pv_cpu_ops.end_context_switch, next); } -void arch_flush_lazy_cpu_mode(void); - #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) { diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index b51a1e8b0ba..1ff685ca221 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -91,7 +91,7 @@ extern void pci_iommu_alloc(void); #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) -#if defined(CONFIG_X86_64) || defined(CONFIG_DMA_API_DEBUG) +#if defined(CONFIG_X86_64) || defined(CONFIG_DMAR) || defined(CONFIG_DMA_API_DEBUG) #define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ dma_addr_t ADDR_NAME; @@ -130,6 +130,7 @@ extern void pci_iommu_alloc(void); /* generic pci stuff */ #include <asm-generic/pci.h> +#define PCIBIOS_MAX_MEM_32 0xffffffff #ifdef CONFIG_NUMA /* Returns the node based on pci bus */ diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index e60fd3e14bd..b399988eee3 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -121,6 +121,9 @@ extern int __init pcibios_init(void); extern int __init pci_mmcfg_arch_init(void); extern void __init pci_mmcfg_arch_free(void); +extern struct acpi_mcfg_allocation *pci_mmcfg_config; +extern int pci_mmcfg_config_num; + /* * AMD Fam10h CPUs are buggy, and cannot access MMIO config space * on their northbrige except through the * %eax register. As such, you MUST diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 02ecb30982a..103f1ddb0d8 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -42,6 +42,7 @@ #else /* ...!ASSEMBLY */ +#include <linux/kernel.h> #include <linux/stringify.h> #ifdef CONFIG_SMP @@ -155,6 +156,15 @@ do { \ /* We can use this directly for local CPU (faster). */ DECLARE_PER_CPU(unsigned long, this_cpu_off); +#ifdef CONFIG_NEED_MULTIPLE_NODES +void *pcpu_lpage_remapped(void *kaddr); +#else +static inline void *pcpu_lpage_remapped(void *kaddr) +{ + return NULL; +} +#endif + #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h new file mode 100644 index 00000000000..fa64e401589 --- /dev/null +++ b/arch/x86/include/asm/perf_counter.h @@ -0,0 +1,98 @@ +#ifndef _ASM_X86_PERF_COUNTER_H +#define _ASM_X86_PERF_COUNTER_H + +/* + * Performance counter hw details: + */ + +#define X86_PMC_MAX_GENERIC 8 +#define X86_PMC_MAX_FIXED 3 + +#define X86_PMC_IDX_GENERIC 0 +#define X86_PMC_IDX_FIXED 32 +#define X86_PMC_IDX_MAX 64 + +#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 +#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 + +#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 +#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 + +#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) +#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) +#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) +#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) + +/* + * Includes eventsel and unit mask as well: + */ +#define ARCH_PERFMON_EVENT_MASK 0xffff + +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ + (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) + +#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 + +/* + * Intel "Architectural Performance Monitoring" CPUID + * detection/enumeration details: + */ +union cpuid10_eax { + struct { + unsigned int version_id:8; + unsigned int num_counters:8; + unsigned int bit_width:8; + unsigned int mask_length:8; + } split; + unsigned int full; +}; + +union cpuid10_edx { + struct { + unsigned int num_counters_fixed:4; + unsigned int reserved:28; + } split; + unsigned int full; +}; + + +/* + * Fixed-purpose performance counters: + */ + +/* + * All 3 fixed-mode PMCs are configured via this single MSR: + */ +#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d + +/* + * The counts are available in three separate MSRs: + */ + +/* Instr_Retired.Any: */ +#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 +#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) + +/* CPU_CLK_Unhalted.Core: */ +#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a +#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) + +/* CPU_CLK_Unhalted.Ref: */ +#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b +#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) + +#ifdef CONFIG_PERF_COUNTERS +extern void init_hw_perf_counters(void); +extern void perf_counters_lapic_init(void); + +#define PERF_COUNTER_INDEX_OFFSET 0 + +#else +static inline void init_hw_perf_counters(void) { } +static inline void perf_counters_lapic_init(void) { } +#endif + +#endif /* _ASM_X86_PERF_COUNTER_H */ diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index dd14c54ac71..0e8c2a0fd92 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -46,7 +46,13 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte) __free_page(pte); } -extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte); +extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte); + +static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte, + unsigned long address) +{ + ___pte_free_tlb(tlb, pte); +} static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) @@ -78,7 +84,13 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) free_page((unsigned long)pmd); } -extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); +extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); + +static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long adddress) +{ + ___pmd_free_tlb(tlb, pmd); +} #ifdef CONFIG_X86_PAE extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); @@ -108,7 +120,14 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) free_page((unsigned long)pud); } -extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); +extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); + +static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, + unsigned long address) +{ + ___pud_free_tlb(tlb, pud); +} + #endif /* PAGETABLE_LEVELS > 3 */ #endif /* PAGETABLE_LEVELS > 2 */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 29d96d168bc..16748077559 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -2,6 +2,7 @@ #define _ASM_X86_PGTABLE_H #include <asm/page.h> +#include <asm/e820.h> #include <asm/pgtable_types.h> @@ -81,6 +82,8 @@ static inline void __init paravirt_pagetable_setup_done(pgd_t *base) #define pte_val(x) native_pte_val(x) #define __pte(x) native_make_pte(x) +#define arch_end_context_switch(prev) do {} while(0) + #endif /* CONFIG_PARAVIRT */ /* @@ -267,10 +270,17 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define canon_pgprot(p) __pgprot(massage_pgprot(p)) -static inline int is_new_memtype_allowed(unsigned long flags, - unsigned long new_flags) +static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, + unsigned long flags, + unsigned long new_flags) { /* + * PAT type is always WB for ISA. So no need to check. + */ + if (is_ISA_range(paddr, paddr + size - 1)) + return 1; + + /* * Certain new memtypes are not allowed with certain * requested memtype: * - request is uncached, return cannot be write-back @@ -315,6 +325,11 @@ static inline int pte_present(pte_t a) return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); } +static inline int pte_hidden(pte_t pte) +{ + return pte_flags(pte) & _PAGE_HIDDEN; +} + static inline int pmd_present(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_PRESENT; @@ -503,6 +518,8 @@ static inline int pgd_none(pgd_t pgd) #ifndef __ASSEMBLY__ +extern int direct_gbpages; + /* local pte updates need not use xchg for locking */ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) { diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 31bd120cf2a..01fd9461d32 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -49,13 +49,17 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t); #endif #if defined(CONFIG_HIGHPTE) +#define __KM_PTE \ + (in_nmi() ? KM_NMI_PTE : \ + in_irq() ? KM_IRQ_PTE : \ + KM_PTE0) #define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \ + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) + \ pte_index((address))) #define pte_offset_map_nested(dir, address) \ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \ pte_index((address))) -#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0) +#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE) #define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) #else #define pte_offset_map(dir, address) \ diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index 2733fad45f9..5e67c153231 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -46,6 +46,10 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ # define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) #endif +#define MODULES_VADDR VMALLOC_START +#define MODULES_END VMALLOC_END +#define MODULES_LEN (MODULES_VADDR - MODULES_END) + #define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE) #endif /* _ASM_X86_PGTABLE_32_DEFS_H */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 6b87bc6d501..c57a3011714 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -25,10 +25,6 @@ extern pgd_t init_level4_pgt[]; extern void paging_init(void); -#endif /* !__ASSEMBLY__ */ - -#ifndef __ASSEMBLY__ - #define pte_ERROR(e) \ printk("%s:%d: bad pte %p(%016lx).\n", \ __FILE__, __LINE__, &(e), pte_val(e)) @@ -135,8 +131,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define update_mmu_cache(vma, address, pte) do { } while (0) -extern int direct_gbpages; - /* Encode and de-code a swap entry */ #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) @@ -171,10 +165,7 @@ extern void cleanup_highmap(void); /* fs/proc/kcore.c */ #define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK) -#define kc_offset_to_vaddr(o) \ - (((o) & (1UL << (__VIRTUAL_MASK_SHIFT - 1))) \ - ? ((o) | ~__VIRTUAL_MASK) \ - : (o)) +#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK) #define __HAVE_ARCH_PTE_SAME #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index fbf42b8e038..766ea16fbbb 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -51,11 +51,11 @@ typedef struct { pteval_t pte; } pte_t; #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) - +/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) -#define VMALLOC_START _AC(0xffffc20000000000, UL) -#define VMALLOC_END _AC(0xffffe1ffffffffff, UL) -#define VMEMMAP_START _AC(0xffffe20000000000, UL) +#define VMALLOC_START _AC(0xffffc90000000000, UL) +#define VMALLOC_END _AC(0xffffe8ffffffffff, UL) +#define VMEMMAP_START _AC(0xffffea0000000000, UL) #define MODULES_VADDR _AC(0xffffffffa0000000, UL) #define MODULES_END _AC(0xffffffffff000000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index b8238dc8786..54cb697f490 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -18,7 +18,7 @@ #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ #define _PAGE_BIT_UNUSED1 9 /* available for programmer */ #define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ -#define _PAGE_BIT_UNUSED3 11 +#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ #define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 @@ -41,13 +41,18 @@ #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) #define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) -#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) #define __HAVE_ARCH_PTE_SPECIAL +#ifdef CONFIG_KMEMCHECK +#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) +#else +#define _PAGE_HIDDEN (_AT(pteval_t, 0)) +#endif + #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #else @@ -273,7 +278,6 @@ typedef struct page *pgtable_t; extern pteval_t __supported_pte_mask; extern int nx_enabled; -extern void set_nx(void); #define pgprot_writecombine pgprot_writecombine extern pgprot_t pgprot_writecombine(pgprot_t prot); diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c2cceae709c..c7768269b1c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -135,7 +135,8 @@ extern struct cpuinfo_x86 boot_cpu_data; extern struct cpuinfo_x86 new_cpu_data; extern struct tss_struct doublefault_tss; -extern __u32 cleared_cpu_caps[NCAPINTS]; +extern __u32 cpu_caps_cleared[NCAPINTS]; +extern __u32 cpu_caps_set[NCAPINTS]; #ifdef CONFIG_SMP DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); @@ -409,9 +410,6 @@ DECLARE_PER_CPU(unsigned long, stack_canary); extern unsigned int xstate_size; extern void free_thread_xstate(struct task_struct *); extern struct kmem_cache *task_xstate_cachep; -extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); -extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); -extern unsigned short num_cache_leaves; struct thread_struct { /* Cached TLS descriptors: */ @@ -427,8 +425,12 @@ struct thread_struct { unsigned short fsindex; unsigned short gsindex; #endif +#ifdef CONFIG_X86_32 unsigned long ip; +#endif +#ifdef CONFIG_X86_64 unsigned long fs; +#endif unsigned long gs; /* Hardware debugging registers: */ unsigned long debugreg0; @@ -460,14 +462,8 @@ struct thread_struct { unsigned io_bitmap_max; /* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */ unsigned long debugctlmsr; -#ifdef CONFIG_X86_DS -/* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */ + /* Debug Store context; see asm/ds.h */ struct ds_context *ds_ctx; -#endif /* CONFIG_X86_DS */ -#ifdef CONFIG_X86_PTRACE_BTS -/* the signal to send on a bts buffer overflow */ - unsigned int bts_ovfl_signal; -#endif /* CONFIG_X86_PTRACE_BTS */ }; static inline unsigned long native_get_debugreg(int regno) @@ -795,6 +791,21 @@ static inline unsigned long get_debugctlmsr(void) return debugctlmsr; } +static inline unsigned long get_debugctlmsr_on_cpu(int cpu) +{ + u64 debugctlmsr = 0; + u32 val1, val2; + +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return 0; +#endif + rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2); + debugctlmsr = val1 | ((u64)val2 << 32); + + return debugctlmsr; +} + static inline void update_debugctlmsr(unsigned long debugctlmsr) { #ifndef CONFIG_X86_DEBUGCTLMSR @@ -804,6 +815,18 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr) wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); } +static inline void update_debugctlmsr_on_cpu(int cpu, + unsigned long debugctlmsr) +{ +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return; +#endif + wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, + (u32)((u64)debugctlmsr), + (u32)((u64)debugctlmsr >> 32)); +} + /* * from system description table in BIOS. Mostly for MCA use, but * others may find it useful: @@ -814,6 +837,7 @@ extern unsigned int BIOS_revision; /* Boot loader type from the setup header: */ extern int bootloader_type; +extern int bootloader_version; extern char ignore_fpu_irq; @@ -874,7 +898,6 @@ static inline void spin_lock_prefetch(const void *x) .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ - .fs = __KERNEL_PERCPU, \ } /* diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 49fb3ecf3bb..621f56d7312 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -22,7 +22,14 @@ extern int reboot_force; long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); -#define round_up(x, y) (((x) + (y) - 1) & ~((y) - 1)) -#define round_down(x, y) ((x) & ~((y) - 1)) +/* + * This looks more complex than it should be. But we need to + * get the type for the ~ right in round_down (it needs to be + * as wide as the result!), and we want to evaluate the macro + * arguments just once each. + */ +#define __round_mask(x,y) ((__typeof__(x))((y)-1)) +#define round_up(x,y) ((((x)-1) | __round_mask(x,y))+1) +#define round_down(x,y) ((x) & ~__round_mask(x,y)) #endif /* _ASM_X86_PROTO_H */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 624f133943e..0f0d908349a 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -236,12 +236,11 @@ extern int do_get_thread_area(struct task_struct *p, int idx, extern int do_set_thread_area(struct task_struct *p, int idx, struct user_desc __user *info, int can_allocate); -extern void x86_ptrace_untrace(struct task_struct *); -extern void x86_ptrace_fork(struct task_struct *child, - unsigned long clone_flags); +#ifdef CONFIG_X86_PTRACE_BTS +extern void ptrace_bts_untrace(struct task_struct *tsk); -#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk) -#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags) +#define arch_ptrace_untrace(tsk) ptrace_bts_untrace(tsk) +#endif /* CONFIG_X86_PTRACE_BTS */ #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index a4737dddfd5..64cf2d24fad 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -48,9 +48,15 @@ #endif #ifdef CONFIG_X86_64 +#ifdef CONFIG_PARAVIRT +/* Paravirtualized systems may not have PSE or PGE available */ #define NEED_PSE 0 -#define NEED_MSR (1<<(X86_FEATURE_MSR & 31)) #define NEED_PGE 0 +#else +#define NEED_PSE (1<<(X86_FEATURE_PSE) & 31) +#define NEED_PGE (1<<(X86_FEATURE_PGE) & 31) +#endif +#define NEED_MSR (1<<(X86_FEATURE_MSR & 31)) #define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31)) #define NEED_XMM (1<<(X86_FEATURE_XMM & 31)) #define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31)) diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index bdc2ada05ae..4093d1ed6db 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -33,7 +33,6 @@ struct x86_quirks { int (*setup_ioapic_ids)(void); }; -extern void x86_quirk_pre_intr_init(void); extern void x86_quirk_intr_init(void); extern void x86_quirk_trap_init(void); diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 7761a5d554b..598457cbd0f 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -117,7 +117,7 @@ typedef unsigned long sigset_t; #define MINSIGSTKSZ 2048 #define SIGSTKSZ 8192 -#include <asm-generic/signal.h> +#include <asm-generic/signal-defs.h> #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 19e0d88b966..6a84ed166ae 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -180,7 +180,7 @@ extern int safe_smp_processor_id(void); static inline int logical_smp_processor_id(void) { /* we don't want to mark this access volatile - bad code generation */ - return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR)); + return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); } #endif diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index e3cc3c063ec..4517d6b9318 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -27,7 +27,7 @@ #else /* CONFIG_X86_32 */ # define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ # define MAX_PHYSADDR_BITS 44 -# define MAX_PHYSMEM_BITS 44 /* Can be max 45 bits */ +# define MAX_PHYSMEM_BITS 46 #endif #endif /* CONFIG_SPARSEMEM */ diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index b7e5db87639..4e77853321d 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -302,4 +302,8 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw) #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() +/* The {read|write|spin}_lock() on x86 are full memory barriers. */ +static inline void smp_mb__after_lock(void) { } +#define ARCH_HAS_SMP_MB_AFTER_LOCK + #endif /* _ASM_X86_SPINLOCK_H */ diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index f517944b2b1..cf86a5e7381 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -3,6 +3,8 @@ extern int kstack_depth_to_print; +int x86_is_stack_id(int id, char *name); + /* Generic stack tracer with callbacks */ struct stacktrace_ops { diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index 0e0e3ba827f..c86f452256d 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h @@ -177,10 +177,18 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len) * No 3D Now! */ +#ifndef CONFIG_KMEMCHECK #define memcpy(t, f, n) \ (__builtin_constant_p((n)) \ ? __constant_memcpy((t), (f), (n)) \ : __memcpy((t), (f), (n))) +#else +/* + * kmemcheck becomes very happy if we use the REP instructions unconditionally, + * because it means that we know both memory operands in advance. + */ +#define memcpy(t, f, n) __memcpy((t), (f), (n)) +#endif #endif diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 2afe164bf1e..19e2c468fc2 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -27,6 +27,7 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t function. */ #define __HAVE_ARCH_MEMCPY 1 +#ifndef CONFIG_KMEMCHECK #if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4 extern void *memcpy(void *to, const void *from, size_t len); #else @@ -42,6 +43,13 @@ extern void *__memcpy(void *to, const void *from, size_t len); __ret; \ }) #endif +#else +/* + * kmemcheck becomes very happy if we use the REP instructions unconditionally, + * because it means that we know both memory operands in advance. + */ +#define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len)) +#endif #define __HAVE_ARCH_MEMSET void *memset(void *s, int c, size_t n); diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 82ada75f3eb..85574b7c1bc 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -225,6 +225,7 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_EVTINJ_VALID_ERR (1 << 11) #define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK +#define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK #define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR #define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 7043408f690..372b76edd63 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -1,7 +1,7 @@ /* * syscalls.h - Linux syscall interfaces (arch-specific) * - * Copyright (c) 2008 Jaswinder Singh + * Copyright (c) 2008 Jaswinder Singh Rajput * * This file is released under the GPLv2. * See the file COPYING for more details. @@ -12,50 +12,55 @@ #include <linux/compiler.h> #include <linux/linkage.h> -#include <linux/types.h> #include <linux/signal.h> +#include <linux/types.h> /* Common in X86_32 and X86_64 */ /* kernel/ioport.c */ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); +/* kernel/process.c */ +int sys_fork(struct pt_regs *); +int sys_vfork(struct pt_regs *); + /* kernel/ldt.c */ asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); +/* kernel/signal.c */ +long sys_rt_sigreturn(struct pt_regs *); + /* kernel/tls.c */ asmlinkage int sys_set_thread_area(struct user_desc __user *); asmlinkage int sys_get_thread_area(struct user_desc __user *); /* X86_32 only */ #ifdef CONFIG_X86_32 +/* kernel/ioport.c */ +long sys_iopl(struct pt_regs *); + /* kernel/process_32.c */ -int sys_fork(struct pt_regs *); int sys_clone(struct pt_regs *); -int sys_vfork(struct pt_regs *); int sys_execve(struct pt_regs *); -/* kernel/signal_32.c */ +/* kernel/signal.c */ asmlinkage int sys_sigsuspend(int, int, old_sigset_t); asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, struct old_sigaction __user *); int sys_sigaltstack(struct pt_regs *); unsigned long sys_sigreturn(struct pt_regs *); -long sys_rt_sigreturn(struct pt_regs *); - -/* kernel/ioport.c */ -long sys_iopl(struct pt_regs *); /* kernel/sys_i386_32.c */ +struct mmap_arg_struct; +struct sel_arg_struct; +struct oldold_utsname; +struct old_utsname; + asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); -struct mmap_arg_struct; asmlinkage int old_mmap(struct mmap_arg_struct __user *); -struct sel_arg_struct; asmlinkage int old_select(struct sel_arg_struct __user *); asmlinkage int sys_ipc(uint, int, int, int, void __user *, long); -struct old_utsname; asmlinkage int sys_uname(struct old_utsname __user *); -struct oldold_utsname; asmlinkage int sys_olduname(struct oldold_utsname __user *); /* kernel/vm86_32.c */ @@ -65,29 +70,27 @@ int sys_vm86(struct pt_regs *); #else /* CONFIG_X86_32 */ /* X86_64 only */ +/* kernel/ioport.c */ +asmlinkage long sys_iopl(unsigned int, struct pt_regs *); + /* kernel/process_64.c */ -asmlinkage long sys_fork(struct pt_regs *); asmlinkage long sys_clone(unsigned long, unsigned long, void __user *, void __user *, struct pt_regs *); -asmlinkage long sys_vfork(struct pt_regs *); asmlinkage long sys_execve(char __user *, char __user * __user *, char __user * __user *, struct pt_regs *); long sys_arch_prctl(int, unsigned long); -/* kernel/ioport.c */ -asmlinkage long sys_iopl(unsigned int, struct pt_regs *); - -/* kernel/signal_64.c */ +/* kernel/signal.c */ asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *, struct pt_regs *); -long sys_rt_sigreturn(struct pt_regs *); /* kernel/sys_x86_64.c */ +struct new_utsname; + asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); -struct new_utsname; asmlinkage long sys_uname(struct new_utsname __user *); #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h index f72956331c4..c4ee8056bac 100644 --- a/arch/x86/include/asm/termios.h +++ b/arch/x86/include/asm/termios.h @@ -67,6 +67,7 @@ static inline int user_termio_to_kernel_termios(struct ktermios *termios, SET_LOW_TERMIOS_BITS(termios, termio, c_oflag); SET_LOW_TERMIOS_BITS(termios, termio, c_cflag); SET_LOW_TERMIOS_BITS(termios, termio, c_lflag); + get_user(termios->c_line, &termio->c_line); return copy_from_user(termios->c_cc, termio->c_cc, NCC); } diff --git a/arch/x86/include/asm/therm_throt.h b/arch/x86/include/asm/therm_throt.h deleted file mode 100644 index c62349ee786..00000000000 --- a/arch/x86/include/asm/therm_throt.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef _ASM_X86_THERM_THROT_H -#define _ASM_X86_THERM_THROT_H - -#include <asm/atomic.h> - -extern atomic_t therm_throt_en; -int therm_throt_process(int curr); - -#endif /* _ASM_X86_THERM_THROT_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 8820a73ae09..fad7d40b75f 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -49,7 +49,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ @@ -94,7 +94,8 @@ struct thread_info { #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ -#define TIF_SYSCALL_FTRACE 27 /* for ftrace syscall instrumentation */ +#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ +#define TIF_SYSCALL_FTRACE 28 /* for ftrace syscall instrumentation */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -116,6 +117,7 @@ struct thread_info { #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) +#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) #define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE) /* work to do in syscall_trace_enter() */ @@ -152,9 +154,9 @@ struct thread_info { /* thread information allocation */ #ifdef CONFIG_DEBUG_STACK_USAGE -#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO) +#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) #else -#define THREAD_FLAGS GFP_KERNEL +#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK) #endif #define __HAVE_ARCH_THREAD_INFO_ALLOCATOR diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index bd37ed444a2..20ca9c4d468 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -45,12 +45,16 @@ extern int no_timer_check; */ DECLARE_PER_CPU(unsigned long, cyc2ns); +DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ static inline unsigned long long __cycles_2_ns(unsigned long long cyc) { - return cyc * per_cpu(cyc2ns, smp_processor_id()) >> CYC2NS_SCALE_FACTOR; + int cpu = smp_processor_id(); + unsigned long long ns = per_cpu(cyc2ns_offset, cpu); + ns += cyc * per_cpu(cyc2ns, cpu) >> CYC2NS_SCALE_FACTOR; + return ns; } static inline unsigned long long cycles_2_ns(unsigned long long cyc) diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h index b5c9d45c981..1375cfc9396 100644 --- a/arch/x86/include/asm/timex.h +++ b/arch/x86/include/asm/timex.h @@ -4,9 +4,7 @@ #include <asm/processor.h> #include <asm/tsc.h> -/* The PIT ticks at this frequency (in HZ): */ -#define PIT_TICK_RATE 1193182 - +/* Assume we use the PIT time source for the clock tick */ #define CLOCK_TICK_RATE PIT_TICK_RATE #define ARCH_HAS_READ_CURRENT_TIMER diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 16a5c84b032..7f3eba08e7d 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -17,7 +17,7 @@ static inline void __native_flush_tlb(void) { - write_cr3(read_cr3()); + native_write_cr3(native_read_cr3()); } static inline void __native_flush_tlb_global(void) @@ -32,11 +32,11 @@ static inline void __native_flush_tlb_global(void) */ raw_local_irq_save(flags); - cr4 = read_cr4(); + cr4 = native_read_cr4(); /* clear PGE */ - write_cr4(cr4 & ~X86_CR4_PGE); + native_write_cr4(cr4 & ~X86_CR4_PGE); /* write old PGE again and flush TLBs */ - write_cr4(cr4); + native_write_cr4(cr4); raw_local_irq_restore(flags); } @@ -172,6 +172,6 @@ static inline void flush_tlb_kernel_range(unsigned long start, flush_tlb_all(); } -extern void zap_low_mappings(void); +extern void zap_low_mappings(bool early); #endif /* _ASM_X86_TLBFLUSH_H */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index f44b49abca4..066ef590d7e 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -203,7 +203,8 @@ struct pci_bus; void x86_pci_root_bus_res_quirks(struct pci_bus *b); #ifdef CONFIG_SMP -#define mc_capable() (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids) +#define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \ + (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids)) #define smt_capable() (smp_num_siblings > 1) #endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 0d5342515b8..bfd74c032fc 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -2,6 +2,7 @@ #define _ASM_X86_TRAPS_H #include <asm/debugreg.h> +#include <asm/siginfo.h> /* TRAP_TRACE, ... */ #ifdef CONFIG_X86_32 #define dotraplinkage @@ -13,6 +14,9 @@ asmlinkage void divide_error(void); asmlinkage void debug(void); asmlinkage void nmi(void); asmlinkage void int3(void); +asmlinkage void xen_debug(void); +asmlinkage void xen_int3(void); +asmlinkage void xen_stack_segment(void); asmlinkage void overflow(void); asmlinkage void bounds(void); asmlinkage void invalid_op(void); @@ -74,7 +78,6 @@ static inline int get_si_code(unsigned long condition) } extern int panic_on_unrecovered_nmi; -extern int kstack_depth_to_print; void math_error(void __user *); void math_emulate(struct math_emu_info *); diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h index e6f73632007..09b97745772 100644 --- a/arch/x86/include/asm/types.h +++ b/arch/x86/include/asm/types.h @@ -14,12 +14,6 @@ typedef unsigned short umode_t; */ #ifdef __KERNEL__ -#ifdef CONFIG_X86_32 -# define BITS_PER_LONG 32 -#else -# define BITS_PER_LONG 64 -#endif - #ifndef __ASSEMBLY__ typedef u64 dma64_addr_t; diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index b685ece89d5..d2c6c930b49 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -25,7 +25,7 @@ #define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) #define KERNEL_DS MAKE_MM_SEG(-1UL) -#define USER_DS MAKE_MM_SEG(PAGE_OFFSET) +#define USER_DS MAKE_MM_SEG(TASK_SIZE_MAX) #define get_ds() (KERNEL_DS) #define get_fs() (current_thread_info()->addr_limit) @@ -212,9 +212,9 @@ extern int __get_user_bad(void); : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") #else #define __put_user_asm_u64(x, ptr, retval, errret) \ - __put_user_asm(x, ptr, retval, "q", "", "Zr", errret) + __put_user_asm(x, ptr, retval, "q", "", "er", errret) #define __put_user_asm_ex_u64(x, addr) \ - __put_user_asm_ex(x, addr, "q", "", "Zr") + __put_user_asm_ex(x, addr, "q", "", "er") #define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu) #endif diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 8cc687326eb..db24b215fc5 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -88,11 +88,11 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size) ret, "l", "k", "ir", 4); return ret; case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst, - ret, "q", "", "ir", 8); + ret, "q", "", "er", 8); return ret; case 10: __put_user_asm(*(u64 *)src, (u64 __user *)dst, - ret, "q", "", "ir", 10); + ret, "q", "", "er", 10); if (unlikely(ret)) return ret; asm("":::"memory"); @@ -101,12 +101,12 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size) return ret; case 16: __put_user_asm(*(u64 *)src, (u64 __user *)dst, - ret, "q", "", "ir", 16); + ret, "q", "", "er", 16); if (unlikely(ret)) return ret; asm("":::"memory"); __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, - ret, "q", "", "ir", 8); + ret, "q", "", "er", 8); return ret; default: return copy_user_generic((__force void *)dst, src, size); @@ -157,7 +157,7 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) ret, "q", "", "=r", 8); if (likely(!ret)) __put_user_asm(tmp, (u64 __user *)dst, - ret, "q", "", "ir", 8); + ret, "q", "", "er", 8); return ret; } default: diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 6e72d74cf8d..732a3070615 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -340,6 +340,8 @@ #define __NR_inotify_init1 332 #define __NR_preadv 333 #define __NR_pwritev 334 +#define __NR_rt_tgsigqueueinfo 335 +#define __NR_perf_counter_open 336 #ifdef __KERNEL__ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index f8182946232..900e1617e67 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -657,7 +657,10 @@ __SYSCALL(__NR_inotify_init1, sys_inotify_init1) __SYSCALL(__NR_preadv, sys_preadv) #define __NR_pwritev 296 __SYSCALL(__NR_pwritev, sys_pwritev) - +#define __NR_rt_tgsigqueueinfo 297 +__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) +#define __NR_perf_counter_open 298 +__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 9b0e61bf7a8..80e2984f521 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -37,7 +37,7 @@ #define UV_CPUS_PER_ACT_STATUS 32 #define UV_ACT_STATUS_MASK 0x3 #define UV_ACT_STATUS_SIZE 2 -#define UV_ACTIVATION_DESCRIPTOR_SIZE 32 +#define UV_ADP_SIZE 32 #define UV_DISTRIBUTION_SIZE 256 #define UV_SW_ACK_NPENDING 8 #define UV_NET_ENDPOINT_INTD 0x38 @@ -133,7 +133,7 @@ struct bau_msg_payload { * see table 4.2.3.0.1 in broacast_assist spec. */ struct bau_msg_header { - unsigned int dest_subnodeid:6; /* must be zero */ + unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ /* bits 5:0 */ unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */ /* bits 20:6 */ /* first bit in node_map */ diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index d3a98ea1062..77a68505419 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -133,6 +133,7 @@ struct uv_scir_s { struct uv_hub_info_s { unsigned long global_mmr_base; unsigned long gpa_mask; + unsigned int gnode_extra; unsigned long gnode_upper; unsigned long lowmem_remap_top; unsigned long lowmem_remap_base; @@ -159,7 +160,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); * p - PNODE (local part of nsids, right shifted 1) */ #define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask) -#define UV_PNODE_TO_NASID(p) (((p) << 1) | uv_hub_info->gnode_upper) +#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra) +#define UV_PNODE_TO_NASID(p) (UV_PNODE_TO_GNODE(p) << 1) #define UV_LOCAL_MMR_BASE 0xf4000000UL #define UV_GLOBAL_MMR32_BASE 0xf8000000UL @@ -173,7 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); #define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) #define UV_GLOBAL_MMR64_PNODE_BITS(p) \ - ((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT) + (((unsigned long)(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT) #define UV_APIC_PNODE_SHIFT 6 @@ -325,6 +327,7 @@ struct uv_blade_info { unsigned short nr_possible_cpus; unsigned short nr_online_cpus; unsigned short pnode; + short memory_nid; }; extern struct uv_blade_info *uv_blade_info; extern short *uv_node_to_blade; @@ -361,6 +364,12 @@ static inline int uv_blade_to_pnode(int bid) return uv_blade_info[bid].pnode; } +/* Nid of memory node on blade. -1 if no blade-local memory */ +static inline int uv_blade_to_memory_nid(int bid) +{ + return uv_blade_info[bid].memory_nid; +} + /* Determine the number of possible cpus on a blade */ static inline int uv_blade_nr_possible_cpus(int bid) { diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 498f944010b..11be5ad2e0e 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -247,6 +247,7 @@ enum vmcs_field { #define EXIT_REASON_MSR_READ 31 #define EXIT_REASON_MSR_WRITE 32 #define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_MCE_DURING_VMENTRY 41 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 #define EXIT_REASON_APIC_ACCESS 44 #define EXIT_REASON_EPT_VIOLATION 48 diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h index 11b3bb86e17..7fcf6f3dbcc 100644 --- a/arch/x86/include/asm/xor.h +++ b/arch/x86/include/asm/xor.h @@ -1,5 +1,10 @@ +#ifdef CONFIG_KMEMCHECK +/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */ +# include <asm-generic/xor.h> +#else #ifdef CONFIG_X86_32 # include "xor_32.h" #else # include "xor_64.h" #endif +#endif diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 88d1bfc847d..430d5b24af7 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -24,11 +24,15 @@ CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) CFLAGS_hpet.o := $(nostackp) CFLAGS_tsc.o := $(nostackp) CFLAGS_paravirt.o := $(nostackp) +GCOV_PROFILE_vsyscall_64.o := n +GCOV_PROFILE_hpet.o := n +GCOV_PROFILE_tsc.o := n +GCOV_PROFILE_paravirt.o := n obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o -obj-y += setup.o i8259.o irqinit_$(BITS).o +obj-y += setup.o i8259.o irqinit.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o @@ -44,6 +48,7 @@ obj-y += process.o obj-y += i387.o xsave.o obj-y += ptrace.o obj-$(CONFIG_X86_DS) += ds.o +obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o @@ -72,7 +77,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_MODULES) += module_$(BITS).o +obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 723989d7f80..6b8ca3a0285 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -33,6 +33,7 @@ #include <linux/irq.h> #include <linux/bootmem.h> #include <linux/ioport.h> +#include <linux/pci.h> #include <asm/pgtable.h> #include <asm/io_apic.h> @@ -43,11 +44,7 @@ static int __initdata acpi_force = 0; u32 acpi_rsdt_forced; -#ifdef CONFIG_ACPI -int acpi_disabled = 0; -#else -int acpi_disabled = 1; -#endif +int acpi_disabled; EXPORT_SYMBOL(acpi_disabled); #ifdef CONFIG_X86_64 @@ -121,72 +118,6 @@ void __init __acpi_unmap_table(char *map, unsigned long size) early_iounmap(map, size); } -#ifdef CONFIG_PCI_MMCONFIG - -static int acpi_mcfg_64bit_base_addr __initdata = FALSE; - -/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ -struct acpi_mcfg_allocation *pci_mmcfg_config; -int pci_mmcfg_config_num; - -static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg) -{ - if (!strcmp(mcfg->header.oem_id, "SGI")) - acpi_mcfg_64bit_base_addr = TRUE; - - return 0; -} - -int __init acpi_parse_mcfg(struct acpi_table_header *header) -{ - struct acpi_table_mcfg *mcfg; - unsigned long i; - int config_size; - - if (!header) - return -EINVAL; - - mcfg = (struct acpi_table_mcfg *)header; - - /* how many config structures do we have */ - pci_mmcfg_config_num = 0; - i = header->length - sizeof(struct acpi_table_mcfg); - while (i >= sizeof(struct acpi_mcfg_allocation)) { - ++pci_mmcfg_config_num; - i -= sizeof(struct acpi_mcfg_allocation); - }; - if (pci_mmcfg_config_num == 0) { - printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); - return -ENODEV; - } - - config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config); - pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL); - if (!pci_mmcfg_config) { - printk(KERN_WARNING PREFIX - "No memory for MCFG config tables\n"); - return -ENOMEM; - } - - memcpy(pci_mmcfg_config, &mcfg[1], config_size); - - acpi_mcfg_oem_check(mcfg); - - for (i = 0; i < pci_mmcfg_config_num; ++i) { - if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) && - !acpi_mcfg_64bit_base_addr) { - printk(KERN_ERR PREFIX - "MMCONFIG not in low 4GB of memory\n"); - kfree(pci_mmcfg_config); - pci_mmcfg_config_num = 0; - return -ENODEV; - } - } - - return 0; -} -#endif /* CONFIG_PCI_MMCONFIG */ - #ifdef CONFIG_X86_LOCAL_APIC static int __init acpi_parse_madt(struct acpi_table_header *table) { @@ -522,7 +453,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) * success: return IRQ number (>=0) * failure: return < 0 */ -int acpi_register_gsi(u32 gsi, int triggering, int polarity) +int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) { unsigned int irq; unsigned int plat_gsi = gsi; @@ -532,14 +463,14 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity) * Make sure all (legacy) PCI IRQs are set as level-triggered. */ if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { - if (triggering == ACPI_LEVEL_SENSITIVE) + if (trigger == ACPI_LEVEL_SENSITIVE) eisa_set_level_irq(gsi); } #endif #ifdef CONFIG_X86_IO_APIC if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { - plat_gsi = mp_register_gsi(gsi, triggering, polarity); + plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity); } #endif acpi_gsi_to_irq(plat_gsi, &irq); @@ -903,10 +834,8 @@ extern int es7000_plat; #endif static struct { - int apic_id; int gsi_base; int gsi_end; - DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); } mp_ioapic_routing[MAX_IO_APICS]; int mp_find_ioapic(int gsi) @@ -986,16 +915,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); mp_ioapics[idx].apicid = uniq_ioapic_id(id); -#ifdef CONFIG_X86_32 mp_ioapics[idx].apicver = io_apic_get_version(idx); -#else - mp_ioapics[idx].apicver = 0; -#endif + /* * Build basic GSI lookup table to facilitate gsi->io_apic lookups * and to prevent reprogramming of IOAPIC pins (PCI GSIs). */ - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid; mp_ioapic_routing[idx].gsi_base = gsi_base; mp_ioapic_routing[idx].gsi_end = gsi_base + io_apic_get_redir_entries(idx); @@ -1158,26 +1083,52 @@ void __init mp_config_acpi_legacy_irqs(void) } } -int mp_register_gsi(u32 gsi, int triggering, int polarity) +static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, + int polarity) { +#ifdef CONFIG_X86_MPPARSE + struct mpc_intsrc mp_irq; + struct pci_dev *pdev; + unsigned char number; + unsigned int devfn; int ioapic; - int ioapic_pin; -#ifdef CONFIG_X86_32 -#define MAX_GSI_NUM 4096 -#define IRQ_COMPRESSION_START 64 + u8 pin; - static int pci_irq = IRQ_COMPRESSION_START; - /* - * Mapping between Global System Interrupts, which - * represent all possible interrupts, and IRQs - * assigned to actual devices. - */ - static int gsi_to_irq[MAX_GSI_NUM]; -#else + if (!acpi_ioapic) + return 0; + if (!dev) + return 0; + if (dev->bus != &pci_bus_type) + return 0; + + pdev = to_pci_dev(dev); + number = pdev->bus->number; + devfn = pdev->devfn; + pin = pdev->pin; + /* print the entry should happen on mptable identically */ + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | + (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); + mp_irq.srcbus = number; + mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); + ioapic = mp_find_ioapic(gsi); + mp_irq.dstapic = mp_ioapics[ioapic].apicid; + mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); + + save_mp_irq(&mp_irq); +#endif + return 0; +} + +int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) +{ + int ioapic; + int ioapic_pin; + struct io_apic_irq_attr irq_attr; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; -#endif /* Don't set up the ACPI SCI because it's already set up */ if (acpi_gbl_FADT.sci_interrupt == gsi) @@ -1196,93 +1147,22 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) gsi = ioapic_renumber_irq(ioapic, gsi); #endif - /* - * Avoid pin reprogramming. PRTs typically include entries - * with redundant pin->gsi mappings (but unique PCI devices); - * we only program the IOAPIC on the first. - */ if (ioapic_pin > MP_MAX_IOAPIC_PIN) { printk(KERN_ERR "Invalid reference to IOAPIC pin " - "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, + "%d-%d\n", mp_ioapics[ioapic].apicid, ioapic_pin); return gsi; } - if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { - pr_debug("Pin %d-%d already programmed\n", - mp_ioapic_routing[ioapic].apic_id, ioapic_pin); -#ifdef CONFIG_X86_32 - return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); -#else - return gsi; -#endif - } - set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed); -#ifdef CONFIG_X86_32 - /* - * For GSI >= 64, use IRQ compression - */ - if ((gsi >= IRQ_COMPRESSION_START) - && (triggering == ACPI_LEVEL_SENSITIVE)) { - /* - * For PCI devices assign IRQs in order, avoiding gaps - * due to unused I/O APIC pins. - */ - int irq = gsi; - if (gsi < MAX_GSI_NUM) { - /* - * Retain the VIA chipset work-around (gsi > 15), but - * avoid a problem where the 8254 timer (IRQ0) is setup - * via an override (so it's not on pin 0 of the ioapic), - * and at the same time, the pin 0 interrupt is a PCI - * type. The gsi > 15 test could cause these two pins - * to be shared as IRQ0, and they are not shareable. - * So test for this condition, and if necessary, avoid - * the pin collision. - */ - gsi = pci_irq++; - /* - * Don't assign IRQ used by ACPI SCI - */ - if (gsi == acpi_gbl_FADT.sci_interrupt) - gsi = pci_irq++; - gsi_to_irq[irq] = gsi; - } else { - printk(KERN_ERR "GSI %u is too high\n", gsi); - return gsi; - } - } -#endif - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, - triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, - polarity == ACPI_ACTIVE_HIGH ? 0 : 1); - return gsi; -} + if (enable_update_mptable) + mp_config_acpi_gsi(dev, gsi, trigger, polarity); -int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, - u32 gsi, int triggering, int polarity) -{ -#ifdef CONFIG_X86_MPPARSE - struct mpc_intsrc mp_irq; - int ioapic; - - if (!acpi_ioapic) - return 0; + set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin, + trigger == ACPI_EDGE_SENSITIVE ? 0 : 1, + polarity == ACPI_ACTIVE_HIGH ? 0 : 1); + io_apic_set_pci_routing(dev, gsi, &irq_attr); - /* print the entry should happen on mptable identically */ - mp_irq.type = MP_INTSRC; - mp_irq.irqtype = mp_INT; - mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | - (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); - mp_irq.srcbus = number; - mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); - ioapic = mp_find_ioapic(gsi); - mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id; - mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); - - save_mp_irq(&mp_irq); -#endif - return 0; + return gsi; } /* @@ -1569,14 +1449,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { }, { .callback = force_acpi_ht, - .ident = "ASUS P4B266", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), - DMI_MATCH(DMI_BOARD_NAME, "P4B266"), - }, - }, - { - .callback = force_acpi_ht, .ident = "ASUS P2B-DS", .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index bbbe4bbb6f3..8c44c232efc 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -34,12 +34,22 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, flags->bm_check = 1; else if (c->x86_vendor == X86_VENDOR_INTEL) { /* - * Today all CPUs that support C3 share cache. - * TBD: This needs to look at cache shared map, once - * multi-core detection patch makes to the base. + * Today all MP CPUs that support C3 share cache. + * And caches should not be flushed by software while + * entering C3 type state. */ flags->bm_check = 1; } + + /* + * On all recent Intel platforms, ARB_DISABLE is a nop. + * So, set bm_control to zero to indicate that ARB_DISABLE + * is not required while entering C3 type state on + * P4, Core and beyond CPUs + */ + if (c->x86_vendor == X86_VENDOR_INTEL && + (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14))) + flags->bm_control = 0; } EXPORT_SYMBOL(acpi_processor_power_init_bm_check); diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index 7c074eec39f..d296f4a195c 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -72,6 +72,7 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) return; } + /* Initialize _PDC data based on the CPU vendor */ void arch_acpi_processor_init_pdc(struct acpi_processor *pr) { @@ -85,3 +86,15 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr) } EXPORT_SYMBOL(arch_acpi_processor_init_pdc); + +void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr) +{ + if (pr->pdc) { + kfree(pr->pdc->pointer->buffer.pointer); + kfree(pr->pdc->pointer); + kfree(pr->pdc); + pr->pdc = NULL; + } +} + +EXPORT_SYMBOL(arch_acpi_processor_cleanup_pdc); diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile index 1c31cc0e9de..6a564ac67ef 100644 --- a/arch/x86/kernel/acpi/realmode/Makefile +++ b/arch/x86/kernel/acpi/realmode/Makefile @@ -9,7 +9,7 @@ always := wakeup.bin targets := wakeup.elf wakeup.lds -wakeup-y += wakeup.o wakemain.o video-mode.o copy.o +wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o # The link order of the video-*.o modules can matter. In particular, # video-vga.o *must* be listed first, followed by video-vesa.o. @@ -42,6 +42,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D_WAKEUP -D__KERNEL__ \ $(call cc-option, -mpreferred-stack-boundary=2) KBUILD_CFLAGS += $(call cc-option, -m32) KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ +GCOV_PROFILE := n WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y)) diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S new file mode 100644 index 00000000000..f51eb0bb56c --- /dev/null +++ b/arch/x86/kernel/acpi/realmode/bioscall.S @@ -0,0 +1 @@ +#include "../../../boot/bioscall.S" diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c new file mode 100644 index 00000000000..6206033ba20 --- /dev/null +++ b/arch/x86/kernel/acpi/realmode/regs.c @@ -0,0 +1 @@ +#include "../../../boot/regs.c" diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 7c243a2c511..ca93638ba43 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -104,7 +104,7 @@ int acpi_save_state_mem(void) initial_gs = per_cpu_offset(smp_processor_id()); #endif initial_code = (unsigned long)wakeup_long64; - saved_magic = 0x123456789abcdef0; + saved_magic = 0x123456789abcdef0L; #endif /* CONFIG_64BIT */ return 0; diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a97db99dad5..6c99f503780 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -55,7 +55,16 @@ struct iommu_cmd { static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); static struct dma_ops_domain *find_protection_domain(u16 devid); +static u64* alloc_pte(struct protection_domain *dom, + unsigned long address, u64 + **pte_page, gfp_t gfp); +static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, + unsigned long start_page, + unsigned int pages); +#ifndef BUS_NOTIFY_UNBOUND_DRIVER +#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005 +#endif #ifdef CONFIG_AMD_IOMMU_STATS @@ -213,7 +222,7 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data) { struct amd_iommu *iommu; - list_for_each_entry(iommu, &amd_iommu_list, list) + for_each_iommu(iommu) iommu_poll_events(iommu); return IRQ_HANDLED; @@ -425,6 +434,16 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); } +/* Flush the whole IO/TLB for a given protection domain - including PDE */ +static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) +{ + u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; + + INC_STATS_COUNTER(domain_flush_single); + + iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1); +} + /* * This function is used to flush the IO/TLB for a given protection domain * on every IOMMU in the system @@ -440,7 +459,7 @@ static void iommu_flush_domain(u16 domid) __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, domid, 1, 1); - list_for_each_entry(iommu, &amd_iommu_list, list) { + for_each_iommu(iommu) { spin_lock_irqsave(&iommu->lock, flags); __iommu_queue_command(iommu, &cmd); __iommu_completion_wait(iommu); @@ -449,6 +468,35 @@ static void iommu_flush_domain(u16 domid) } } +void amd_iommu_flush_all_domains(void) +{ + int i; + + for (i = 1; i < MAX_DOMAIN_ID; ++i) { + if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) + continue; + iommu_flush_domain(i); + } +} + +void amd_iommu_flush_all_devices(void) +{ + struct amd_iommu *iommu; + int i; + + for (i = 0; i <= amd_iommu_last_bdf; ++i) { + if (amd_iommu_pd_table[i] == NULL) + continue; + + iommu = amd_iommu_rlookup_table[i]; + if (!iommu) + continue; + + iommu_queue_inv_dev_entry(iommu, i); + iommu_completion_wait(iommu); + } +} + /**************************************************************************** * * The functions below are used the create the page table mappings for @@ -468,7 +516,7 @@ static int iommu_map_page(struct protection_domain *dom, unsigned long phys_addr, int prot) { - u64 __pte, *pte, *page; + u64 __pte, *pte; bus_addr = PAGE_ALIGN(bus_addr); phys_addr = PAGE_ALIGN(phys_addr); @@ -477,27 +525,7 @@ static int iommu_map_page(struct protection_domain *dom, if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) return -EINVAL; - pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; - - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - *pte = IOMMU_L2_PDE(virt_to_phys(page)); - } - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; - - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - *pte = IOMMU_L1_PDE(virt_to_phys(page)); - } - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)]; + pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL); if (IOMMU_PTE_PRESENT(*pte)) return -EBUSY; @@ -595,7 +623,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, * as allocated in the aperture */ if (addr < dma_dom->aperture_size) - __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap); + __set_bit(addr >> PAGE_SHIFT, + dma_dom->aperture[0]->bitmap); } return 0; @@ -632,42 +661,191 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, ****************************************************************************/ /* - * The address allocator core function. + * The address allocator core functions. * * called with domain->lock held */ + +/* + * This function checks if there is a PTE for a given dma address. If + * there is one, it returns the pointer to it. + */ +static u64* fetch_pte(struct protection_domain *domain, + unsigned long address) +{ + u64 *pte; + + pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return NULL; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return NULL; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L0_INDEX(address)]; + + return pte; +} + +/* + * This function is used to add a new aperture range to an existing + * aperture in case of dma_ops domain allocation or address allocation + * failure. + */ +static int alloc_new_range(struct amd_iommu *iommu, + struct dma_ops_domain *dma_dom, + bool populate, gfp_t gfp) +{ + int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; + int i; + +#ifdef CONFIG_IOMMU_STRESS + populate = false; +#endif + + if (index >= APERTURE_MAX_RANGES) + return -ENOMEM; + + dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp); + if (!dma_dom->aperture[index]) + return -ENOMEM; + + dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp); + if (!dma_dom->aperture[index]->bitmap) + goto out_free; + + dma_dom->aperture[index]->offset = dma_dom->aperture_size; + + if (populate) { + unsigned long address = dma_dom->aperture_size; + int i, num_ptes = APERTURE_RANGE_PAGES / 512; + u64 *pte, *pte_page; + + for (i = 0; i < num_ptes; ++i) { + pte = alloc_pte(&dma_dom->domain, address, + &pte_page, gfp); + if (!pte) + goto out_free; + + dma_dom->aperture[index]->pte_pages[i] = pte_page; + + address += APERTURE_RANGE_SIZE / 64; + } + } + + dma_dom->aperture_size += APERTURE_RANGE_SIZE; + + /* Intialize the exclusion range if necessary */ + if (iommu->exclusion_start && + iommu->exclusion_start >= dma_dom->aperture[index]->offset && + iommu->exclusion_start < dma_dom->aperture_size) { + unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; + int pages = iommu_num_pages(iommu->exclusion_start, + iommu->exclusion_length, + PAGE_SIZE); + dma_ops_reserve_addresses(dma_dom, startpage, pages); + } + + /* + * Check for areas already mapped as present in the new aperture + * range and mark those pages as reserved in the allocator. Such + * mappings may already exist as a result of requested unity + * mappings for devices. + */ + for (i = dma_dom->aperture[index]->offset; + i < dma_dom->aperture_size; + i += PAGE_SIZE) { + u64 *pte = fetch_pte(&dma_dom->domain, i); + if (!pte || !IOMMU_PTE_PRESENT(*pte)) + continue; + + dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1); + } + + return 0; + +out_free: + free_page((unsigned long)dma_dom->aperture[index]->bitmap); + + kfree(dma_dom->aperture[index]); + dma_dom->aperture[index] = NULL; + + return -ENOMEM; +} + +static unsigned long dma_ops_area_alloc(struct device *dev, + struct dma_ops_domain *dom, + unsigned int pages, + unsigned long align_mask, + u64 dma_mask, + unsigned long start) +{ + unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE; + int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT; + int i = start >> APERTURE_RANGE_SHIFT; + unsigned long boundary_size; + unsigned long address = -1; + unsigned long limit; + + next_bit >>= PAGE_SHIFT; + + boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, + PAGE_SIZE) >> PAGE_SHIFT; + + for (;i < max_index; ++i) { + unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT; + + if (dom->aperture[i]->offset >= dma_mask) + break; + + limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset, + dma_mask >> PAGE_SHIFT); + + address = iommu_area_alloc(dom->aperture[i]->bitmap, + limit, next_bit, pages, 0, + boundary_size, align_mask); + if (address != -1) { + address = dom->aperture[i]->offset + + (address << PAGE_SHIFT); + dom->next_address = address + (pages << PAGE_SHIFT); + break; + } + + next_bit = 0; + } + + return address; +} + static unsigned long dma_ops_alloc_addresses(struct device *dev, struct dma_ops_domain *dom, unsigned int pages, unsigned long align_mask, u64 dma_mask) { - unsigned long limit; unsigned long address; - unsigned long boundary_size; - boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, - PAGE_SIZE) >> PAGE_SHIFT; - limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0, - dma_mask >> PAGE_SHIFT); +#ifdef CONFIG_IOMMU_STRESS + dom->next_address = 0; + dom->need_flush = true; +#endif - if (dom->next_bit >= limit) { - dom->next_bit = 0; - dom->need_flush = true; - } + address = dma_ops_area_alloc(dev, dom, pages, align_mask, + dma_mask, dom->next_address); - address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, - 0 , boundary_size, align_mask); if (address == -1) { - address = iommu_area_alloc(dom->bitmap, limit, 0, pages, - 0, boundary_size, align_mask); + dom->next_address = 0; + address = dma_ops_area_alloc(dev, dom, pages, align_mask, + dma_mask, 0); dom->need_flush = true; } - if (likely(address != -1)) { - dom->next_bit = address + pages; - address <<= PAGE_SHIFT; - } else + if (unlikely(address == -1)) address = bad_dma_address; WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); @@ -684,11 +862,23 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, unsigned long address, unsigned int pages) { - address >>= PAGE_SHIFT; - iommu_area_free(dom->bitmap, address, pages); + unsigned i = address >> APERTURE_RANGE_SHIFT; + struct aperture_range *range = dom->aperture[i]; + + BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL); - if (address >= dom->next_bit) +#ifdef CONFIG_IOMMU_STRESS + if (i < 4) + return; +#endif + + if (address >= dom->next_address) dom->need_flush = true; + + address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT; + + iommu_area_free(range->bitmap, address, pages); + } /**************************************************************************** @@ -736,12 +926,16 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, unsigned long start_page, unsigned int pages) { - unsigned int last_page = dom->aperture_size >> PAGE_SHIFT; + unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT; if (start_page + pages > last_page) pages = last_page - start_page; - iommu_area_reserve(dom->bitmap, start_page, pages); + for (i = start_page; i < start_page + pages; ++i) { + int index = i / APERTURE_RANGE_PAGES; + int page = i % APERTURE_RANGE_PAGES; + __set_bit(page, dom->aperture[index]->bitmap); + } } static void free_pagetable(struct protection_domain *domain) @@ -780,14 +974,19 @@ static void free_pagetable(struct protection_domain *domain) */ static void dma_ops_domain_free(struct dma_ops_domain *dom) { + int i; + if (!dom) return; free_pagetable(&dom->domain); - kfree(dom->pte_pages); - - kfree(dom->bitmap); + for (i = 0; i < APERTURE_MAX_RANGES; ++i) { + if (!dom->aperture[i]) + continue; + free_page((unsigned long)dom->aperture[i]->bitmap); + kfree(dom->aperture[i]); + } kfree(dom); } @@ -797,19 +996,9 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) * It also intializes the page table and the address allocator data * structures required for the dma_ops interface */ -static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, - unsigned order) +static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) { struct dma_ops_domain *dma_dom; - unsigned i, num_pte_pages; - u64 *l2_pde; - u64 address; - - /* - * Currently the DMA aperture must be between 32 MB and 1GB in size - */ - if ((order < 25) || (order > 30)) - return NULL; dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL); if (!dma_dom) @@ -826,55 +1015,20 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->domain.priv = dma_dom; if (!dma_dom->domain.pt_root) goto free_dma_dom; - dma_dom->aperture_size = (1ULL << order); - dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8), - GFP_KERNEL); - if (!dma_dom->bitmap) - goto free_dma_dom; - /* - * mark the first page as allocated so we never return 0 as - * a valid dma-address. So we can use 0 as error value - */ - dma_dom->bitmap[0] = 1; - dma_dom->next_bit = 0; dma_dom->need_flush = false; dma_dom->target_dev = 0xffff; - /* Intialize the exclusion range if necessary */ - if (iommu->exclusion_start && - iommu->exclusion_start < dma_dom->aperture_size) { - unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; - int pages = iommu_num_pages(iommu->exclusion_start, - iommu->exclusion_length, - PAGE_SIZE); - dma_ops_reserve_addresses(dma_dom, startpage, pages); - } + if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) + goto free_dma_dom; /* - * At the last step, build the page tables so we don't need to - * allocate page table pages in the dma_ops mapping/unmapping - * path. + * mark the first page as allocated so we never return 0 as + * a valid dma-address. So we can use 0 as error value */ - num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); - dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), - GFP_KERNEL); - if (!dma_dom->pte_pages) - goto free_dma_dom; + dma_dom->aperture[0]->bitmap[0] = 1; + dma_dom->next_address = 0; - l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL); - if (l2_pde == NULL) - goto free_dma_dom; - - dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde)); - - for (i = 0; i < num_pte_pages; ++i) { - dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL); - if (!dma_dom->pte_pages[i]) - goto free_dma_dom; - address = virt_to_phys(dma_dom->pte_pages[i]); - l2_pde[i] = IOMMU_L1_PDE(address); - } return dma_dom; @@ -934,7 +1088,13 @@ static void attach_device(struct amd_iommu *iommu, amd_iommu_pd_table[devid] = domain; write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + /* + * We might boot into a crash-kernel here. The crashed kernel + * left the caches in the IOMMU dirty. So we have to flush + * here to evict all dirty stuff. + */ iommu_queue_inv_dev_entry(iommu, devid); + iommu_flush_tlb_pde(iommu, domain->id); } /* @@ -983,7 +1143,6 @@ static int device_change_notifier(struct notifier_block *nb, struct protection_domain *domain; struct dma_ops_domain *dma_domain; struct amd_iommu *iommu; - int order = amd_iommu_aperture_order; unsigned long flags; if (devid > amd_iommu_last_bdf) @@ -1002,17 +1161,7 @@ static int device_change_notifier(struct notifier_block *nb, "to a non-dma-ops domain\n", dev_name(dev)); switch (action) { - case BUS_NOTIFY_BOUND_DRIVER: - if (domain) - goto out; - dma_domain = find_protection_domain(devid); - if (!dma_domain) - dma_domain = iommu->default_dom; - attach_device(iommu, &dma_domain->domain, devid); - printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " - "device %s\n", dma_domain->domain.id, dev_name(dev)); - break; - case BUS_NOTIFY_UNBIND_DRIVER: + case BUS_NOTIFY_UNBOUND_DRIVER: if (!domain) goto out; detach_device(domain, devid); @@ -1022,7 +1171,7 @@ static int device_change_notifier(struct notifier_block *nb, dma_domain = find_protection_domain(devid); if (dma_domain) goto out; - dma_domain = dma_ops_domain_alloc(iommu, order); + dma_domain = dma_ops_domain_alloc(iommu); if (!dma_domain) goto out; dma_domain->target_dev = devid; @@ -1043,7 +1192,7 @@ out: return 0; } -struct notifier_block device_nb = { +static struct notifier_block device_nb = { .notifier_call = device_change_notifier, }; @@ -1133,8 +1282,8 @@ static int get_device_resources(struct device *dev, dma_dom = (*iommu)->default_dom; *domain = &dma_dom->domain; attach_device(*iommu, *domain, *bdf); - printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " - "device %s\n", (*domain)->id, dev_name(dev)); + DUMP_printk("Using protection domain %d for device %s\n", + (*domain)->id, dev_name(dev)); } if (domain_for_device(_bdf) == NULL) @@ -1144,6 +1293,66 @@ static int get_device_resources(struct device *dev, } /* + * If the pte_page is not yet allocated this function is called + */ +static u64* alloc_pte(struct protection_domain *dom, + unsigned long address, u64 **pte_page, gfp_t gfp) +{ + u64 *pte, *page; + + pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(gfp); + if (!page) + return NULL; + *pte = IOMMU_L2_PDE(virt_to_phys(page)); + } + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(gfp); + if (!page) + return NULL; + *pte = IOMMU_L1_PDE(virt_to_phys(page)); + } + + pte = IOMMU_PTE_PAGE(*pte); + + if (pte_page) + *pte_page = pte; + + pte = &pte[IOMMU_PTE_L0_INDEX(address)]; + + return pte; +} + +/* + * This function fetches the PTE for a given address in the aperture + */ +static u64* dma_ops_get_pte(struct dma_ops_domain *dom, + unsigned long address) +{ + struct aperture_range *aperture; + u64 *pte, *pte_page; + + aperture = dom->aperture[APERTURE_RANGE_INDEX(address)]; + if (!aperture) + return NULL; + + pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; + if (!pte) { + pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC); + aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; + } else + pte += IOMMU_PTE_L0_INDEX(address); + + return pte; +} + +/* * This is the generic map function. It maps one 4kb page at paddr to * the given address in the DMA address space for the domain. */ @@ -1159,8 +1368,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, paddr &= PAGE_MASK; - pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; - pte += IOMMU_PTE_L0_INDEX(address); + pte = dma_ops_get_pte(dom, address); + if (!pte) + return bad_dma_address; __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; @@ -1185,14 +1395,20 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, struct dma_ops_domain *dom, unsigned long address) { + struct aperture_range *aperture; u64 *pte; if (address >= dom->aperture_size) return; - WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size); + aperture = dom->aperture[APERTURE_RANGE_INDEX(address)]; + if (!aperture) + return; + + pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; + if (!pte) + return; - pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; pte += IOMMU_PTE_L0_INDEX(address); WARN_ON(!*pte); @@ -1216,7 +1432,7 @@ static dma_addr_t __map_single(struct device *dev, u64 dma_mask) { dma_addr_t offset = paddr & ~PAGE_MASK; - dma_addr_t address, start; + dma_addr_t address, start, ret; unsigned int pages; unsigned long align_mask = 0; int i; @@ -1232,14 +1448,33 @@ static dma_addr_t __map_single(struct device *dev, if (align) align_mask = (1UL << get_order(size)) - 1; +retry: address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, dma_mask); - if (unlikely(address == bad_dma_address)) - goto out; + if (unlikely(address == bad_dma_address)) { + /* + * setting next_address here will let the address + * allocator only scan the new allocated range in the + * first run. This is a small optimization. + */ + dma_dom->next_address = dma_dom->aperture_size; + + if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC)) + goto out; + + /* + * aperture was sucessfully enlarged by 128 MB, try + * allocation again + */ + goto retry; + } start = address; for (i = 0; i < pages; ++i) { - dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); + ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); + if (ret == bad_dma_address) + goto out_unmap; + paddr += PAGE_SIZE; start += PAGE_SIZE; } @@ -1255,6 +1490,17 @@ static dma_addr_t __map_single(struct device *dev, out: return address; + +out_unmap: + + for (--i; i >= 0; --i) { + start -= PAGE_SIZE; + dma_ops_domain_unmap(iommu, dma_dom, start); + } + + dma_ops_free_addresses(dma_dom, address, pages); + + return bad_dma_address; } /* @@ -1517,7 +1763,7 @@ static void *alloc_coherent(struct device *dev, size_t size, flag |= __GFP_ZERO; virt_addr = (void *)__get_free_pages(flag, get_order(size)); if (!virt_addr) - return 0; + return NULL; paddr = virt_to_phys(virt_addr); @@ -1537,8 +1783,10 @@ static void *alloc_coherent(struct device *dev, size_t size, *dma_addr = __map_single(dev, iommu, domain->priv, paddr, size, DMA_BIDIRECTIONAL, true, dma_mask); - if (*dma_addr == bad_dma_address) + if (*dma_addr == bad_dma_address) { + spin_unlock_irqrestore(&domain->lock, flags); goto out_free; + } iommu_completion_wait(iommu); @@ -1625,7 +1873,6 @@ static void prealloc_protection_domains(void) struct pci_dev *dev = NULL; struct dma_ops_domain *dma_dom; struct amd_iommu *iommu; - int order = amd_iommu_aperture_order; u16 devid; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { @@ -1638,7 +1885,7 @@ static void prealloc_protection_domains(void) iommu = amd_iommu_rlookup_table[devid]; if (!iommu) continue; - dma_dom = dma_ops_domain_alloc(iommu, order); + dma_dom = dma_ops_domain_alloc(iommu); if (!dma_dom) continue; init_unity_mappings_for_device(dma_dom, devid); @@ -1664,7 +1911,6 @@ static struct dma_map_ops amd_iommu_dma_ops = { int __init amd_iommu_init_dma_ops(void) { struct amd_iommu *iommu; - int order = amd_iommu_aperture_order; int ret; /* @@ -1672,8 +1918,8 @@ int __init amd_iommu_init_dma_ops(void) * found in the system. Devices not assigned to any other * protection domain will be assigned to the default one. */ - list_for_each_entry(iommu, &amd_iommu_list, list) { - iommu->default_dom = dma_ops_domain_alloc(iommu, order); + for_each_iommu(iommu) { + iommu->default_dom = dma_ops_domain_alloc(iommu); if (iommu->default_dom == NULL) return -ENOMEM; iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; @@ -1710,7 +1956,7 @@ int __init amd_iommu_init_dma_ops(void) free_domains: - list_for_each_entry(iommu, &amd_iommu_list, list) { + for_each_iommu(iommu) { if (iommu->default_dom) dma_ops_domain_free(iommu->default_dom); } @@ -1842,7 +2088,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, old_domain = domain_for_device(devid); if (old_domain) - return -EBUSY; + detach_device(old_domain, devid); attach_device(iommu, domain, devid); diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8c0be0902da..c1b17e97252 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -115,15 +115,21 @@ struct ivmd_header { u64 range_length; } __attribute__((packed)); +bool amd_iommu_dump; + static int __initdata amd_iommu_detected; u16 amd_iommu_last_bdf; /* largest PCI device id we have to handle */ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ -unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ +#ifdef CONFIG_IOMMU_STRESS +bool amd_iommu_isolate = false; +#else bool amd_iommu_isolate = true; /* if true, device isolation is enabled */ +#endif + bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the @@ -175,7 +181,7 @@ static inline void update_last_devid(u16 devid) static inline unsigned long tbl_size(int entry_size) { unsigned shift = PAGE_SHIFT + - get_order(amd_iommu_last_bdf * entry_size); + get_order(((int)amd_iommu_last_bdf + 1) * entry_size); return 1UL << shift; } @@ -193,7 +199,7 @@ static inline unsigned long tbl_size(int entry_size) * This function set the exclusion range in the IOMMU. DMA accesses to the * exclusion range are passed through untranslated */ -static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) +static void iommu_set_exclusion_range(struct amd_iommu *iommu) { u64 start = iommu->exclusion_start & PAGE_MASK; u64 limit = (start + iommu->exclusion_length) & PAGE_MASK; @@ -225,7 +231,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu) } /* Generic functions to enable/disable certain features of the IOMMU. */ -static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) +static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit) { u32 ctrl; @@ -244,7 +250,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) } /* Function to enable the hardware */ -static void __init iommu_enable(struct amd_iommu *iommu) +static void iommu_enable(struct amd_iommu *iommu) { printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", dev_name(&iommu->dev->dev), iommu->cap_ptr); @@ -252,11 +258,17 @@ static void __init iommu_enable(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } -/* Function to enable IOMMU event logging and event interrupts */ -static void __init iommu_enable_event_logging(struct amd_iommu *iommu) +static void iommu_disable(struct amd_iommu *iommu) { - iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); - iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); + /* Disable command buffer */ + iommu_feature_disable(iommu, CONTROL_CMDBUF_EN); + + /* Disable event logging and event interrupts */ + iommu_feature_disable(iommu, CONTROL_EVT_INT_EN); + iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN); + + /* Disable IOMMU hardware itself */ + iommu_feature_disable(iommu, CONTROL_IOMMU_EN); } /* @@ -413,25 +425,36 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) { u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(CMD_BUFFER_SIZE)); - u64 entry; if (cmd_buf == NULL) return NULL; iommu->cmd_buf_size = CMD_BUFFER_SIZE; - entry = (u64)virt_to_phys(cmd_buf); + return cmd_buf; +} + +/* + * This function writes the command buffer address to the hardware and + * enables it. + */ +static void iommu_enable_command_buffer(struct amd_iommu *iommu) +{ + u64 entry; + + BUG_ON(iommu->cmd_buf == NULL); + + entry = (u64)virt_to_phys(iommu->cmd_buf); entry |= MMIO_CMD_SIZE_512; + memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, - &entry, sizeof(entry)); + &entry, sizeof(entry)); /* set head and tail to zero manually */ writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); - - return cmd_buf; } static void __init free_command_buffer(struct amd_iommu *iommu) @@ -443,20 +466,33 @@ static void __init free_command_buffer(struct amd_iommu *iommu) /* allocates the memory where the IOMMU will log its events to */ static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) { - u64 entry; iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(EVT_BUFFER_SIZE)); if (iommu->evt_buf == NULL) return NULL; + iommu->evt_buf_size = EVT_BUFFER_SIZE; + + return iommu->evt_buf; +} + +static void iommu_enable_event_buffer(struct amd_iommu *iommu) +{ + u64 entry; + + BUG_ON(iommu->evt_buf == NULL); + entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; + memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, &entry, sizeof(entry)); - iommu->evt_buf_size = EVT_BUFFER_SIZE; + /* set head and tail to zero manually */ + writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); + writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); - return iommu->evt_buf; + iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); } static void __init free_event_buffer(struct amd_iommu *iommu) @@ -596,32 +632,84 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, p += sizeof(struct ivhd_header); end += h->length; + while (p < end) { e = (struct ivhd_entry *)p; switch (e->type) { case IVHD_DEV_ALL: + + DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x" + " last device %02x:%02x.%x flags: %02x\n", + PCI_BUS(iommu->first_device), + PCI_SLOT(iommu->first_device), + PCI_FUNC(iommu->first_device), + PCI_BUS(iommu->last_device), + PCI_SLOT(iommu->last_device), + PCI_FUNC(iommu->last_device), + e->flags); + for (dev_i = iommu->first_device; dev_i <= iommu->last_device; ++dev_i) set_dev_entry_from_acpi(iommu, dev_i, e->flags, 0); break; case IVHD_DEV_SELECT: + + DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x " + "flags: %02x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags); + devid = e->devid; set_dev_entry_from_acpi(iommu, devid, e->flags, 0); break; case IVHD_DEV_SELECT_RANGE_START: + + DUMP_printk(" DEV_SELECT_RANGE_START\t " + "devid: %02x:%02x.%x flags: %02x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags); + devid_start = e->devid; flags = e->flags; ext_flags = 0; alias = false; break; case IVHD_DEV_ALIAS: + + DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x " + "flags: %02x devid_to: %02x:%02x.%x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, + PCI_BUS(e->ext >> 8), + PCI_SLOT(e->ext >> 8), + PCI_FUNC(e->ext >> 8)); + devid = e->devid; devid_to = e->ext >> 8; - set_dev_entry_from_acpi(iommu, devid, e->flags, 0); + set_dev_entry_from_acpi(iommu, devid , e->flags, 0); + set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0); amd_iommu_alias_table[devid] = devid_to; break; case IVHD_DEV_ALIAS_RANGE: + + DUMP_printk(" DEV_ALIAS_RANGE\t\t " + "devid: %02x:%02x.%x flags: %02x " + "devid_to: %02x:%02x.%x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, + PCI_BUS(e->ext >> 8), + PCI_SLOT(e->ext >> 8), + PCI_FUNC(e->ext >> 8)); + devid_start = e->devid; flags = e->flags; devid_to = e->ext >> 8; @@ -629,24 +717,48 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, alias = true; break; case IVHD_DEV_EXT_SELECT: + + DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x " + "flags: %02x ext: %08x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, e->ext); + devid = e->devid; set_dev_entry_from_acpi(iommu, devid, e->flags, e->ext); break; case IVHD_DEV_EXT_SELECT_RANGE: + + DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: " + "%02x:%02x.%x flags: %02x ext: %08x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, e->ext); + devid_start = e->devid; flags = e->flags; ext_flags = e->ext; alias = false; break; case IVHD_DEV_RANGE_END: + + DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid)); + devid = e->devid; for (dev_i = devid_start; dev_i <= devid; ++dev_i) { - if (alias) + if (alias) { amd_iommu_alias_table[dev_i] = devid_to; - set_dev_entry_from_acpi(iommu, - amd_iommu_alias_table[dev_i], - flags, ext_flags); + set_dev_entry_from_acpi(iommu, + devid_to, flags, ext_flags); + } + set_dev_entry_from_acpi(iommu, dev_i, + flags, ext_flags); } break; default: @@ -679,7 +791,7 @@ static void __init free_iommu_all(void) { struct amd_iommu *iommu, *next; - list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) { + for_each_iommu_safe(iommu, next) { list_del(&iommu->list); free_iommu_one(iommu); kfree(iommu); @@ -710,7 +822,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) if (!iommu->mmio_base) return -ENOMEM; - iommu_set_device_table(iommu); iommu->cmd_buf = alloc_command_buffer(iommu); if (!iommu->cmd_buf) return -ENOMEM; @@ -746,6 +857,15 @@ static int __init init_iommu_all(struct acpi_table_header *table) h = (struct ivhd_header *)p; switch (*p) { case ACPI_IVHD_TYPE: + + DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x " + "seg: %d flags: %01x info %04x\n", + PCI_BUS(h->devid), PCI_SLOT(h->devid), + PCI_FUNC(h->devid), h->cap_ptr, + h->pci_seg, h->flags, h->info); + DUMP_printk(" mmio-addr: %016llx\n", + h->mmio_phys); + iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); if (iommu == NULL) return -ENOMEM; @@ -773,56 +893,9 @@ static int __init init_iommu_all(struct acpi_table_header *table) * ****************************************************************************/ -static int __init iommu_setup_msix(struct amd_iommu *iommu) -{ - struct amd_iommu *curr; - struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */ - int nvec = 0, i; - - list_for_each_entry(curr, &amd_iommu_list, list) { - if (curr->dev == iommu->dev) { - entries[nvec].entry = curr->evt_msi_num; - entries[nvec].vector = 0; - curr->int_enabled = true; - nvec++; - } - } - - if (pci_enable_msix(iommu->dev, entries, nvec)) { - pci_disable_msix(iommu->dev); - return 1; - } - - for (i = 0; i < nvec; ++i) { - int r = request_irq(entries->vector, amd_iommu_int_handler, - IRQF_SAMPLE_RANDOM, - "AMD IOMMU", - NULL); - if (r) - goto out_free; - } - - return 0; - -out_free: - for (i -= 1; i >= 0; --i) - free_irq(entries->vector, NULL); - - pci_disable_msix(iommu->dev); - - return 1; -} - static int __init iommu_setup_msi(struct amd_iommu *iommu) { int r; - struct amd_iommu *curr; - - list_for_each_entry(curr, &amd_iommu_list, list) { - if (curr->dev == iommu->dev) - curr->int_enabled = true; - } - if (pci_enable_msi(iommu->dev)) return 1; @@ -837,17 +910,18 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu) return 1; } + iommu->int_enabled = true; + iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); + return 0; } -static int __init iommu_init_msi(struct amd_iommu *iommu) +static int iommu_init_msi(struct amd_iommu *iommu) { if (iommu->int_enabled) return 0; - if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX)) - return iommu_setup_msix(iommu); - else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI)) + if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI)) return iommu_setup_msi(iommu); return 1; @@ -899,6 +973,7 @@ static int __init init_exclusion_range(struct ivmd_header *m) static int __init init_unity_map_range(struct ivmd_header *m) { struct unity_map_entry *e = 0; + char *s; e = kzalloc(sizeof(*e), GFP_KERNEL); if (e == NULL) @@ -906,14 +981,19 @@ static int __init init_unity_map_range(struct ivmd_header *m) switch (m->type) { default: + kfree(e); + return 0; case ACPI_IVMD_TYPE: + s = "IVMD_TYPEi\t\t\t"; e->devid_start = e->devid_end = m->devid; break; case ACPI_IVMD_TYPE_ALL: + s = "IVMD_TYPE_ALL\t\t"; e->devid_start = 0; e->devid_end = amd_iommu_last_bdf; break; case ACPI_IVMD_TYPE_RANGE: + s = "IVMD_TYPE_RANGE\t\t"; e->devid_start = m->devid; e->devid_end = m->aux; break; @@ -922,6 +1002,13 @@ static int __init init_unity_map_range(struct ivmd_header *m) e->address_end = e->address_start + PAGE_ALIGN(m->range_length); e->prot = m->flags >> 1; + DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x" + " range_start: %016llx range_end: %016llx flags: %x\n", s, + PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start), + PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end), + PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end), + e->address_start, e->address_end, m->flags); + list_add_tail(&e->list, &amd_iommu_unity_map); return 0; @@ -967,18 +1054,29 @@ static void init_device_table(void) * This function finally enables all IOMMUs found in the system after * they have been initialized */ -static void __init enable_iommus(void) +static void enable_iommus(void) { struct amd_iommu *iommu; - list_for_each_entry(iommu, &amd_iommu_list, list) { + for_each_iommu(iommu) { + iommu_disable(iommu); + iommu_set_device_table(iommu); + iommu_enable_command_buffer(iommu); + iommu_enable_event_buffer(iommu); iommu_set_exclusion_range(iommu); iommu_init_msi(iommu); - iommu_enable_event_logging(iommu); iommu_enable(iommu); } } +static void disable_iommus(void) +{ + struct amd_iommu *iommu; + + for_each_iommu(iommu) + iommu_disable(iommu); +} + /* * Suspend/Resume support * disable suspend until real resume implemented @@ -986,12 +1084,25 @@ static void __init enable_iommus(void) static int amd_iommu_resume(struct sys_device *dev) { + /* re-load the hardware */ + enable_iommus(); + + /* + * we have to flush after the IOMMUs are enabled because a + * disabled IOMMU will never execute the commands we send + */ + amd_iommu_flush_all_devices(); + amd_iommu_flush_all_domains(); + return 0; } static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) { - return -EINVAL; + /* disable IOMMUs to go out of the way for BIOS */ + disable_iommus(); + + return 0; } static struct sysdev_class amd_iommu_sysdev_class = { @@ -1137,9 +1248,6 @@ int __init amd_iommu_init(void) enable_iommus(); - printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n", - (1 << (amd_iommu_aperture_order-20))); - printk(KERN_INFO "AMD IOMMU: device isolation "); if (amd_iommu_isolate) printk("enabled\n"); @@ -1177,6 +1285,11 @@ free: goto out; } +void amd_iommu_shutdown(void) +{ + disable_iommus(); +} + /**************************************************************************** * * Early detect code. This code runs at IOMMU detection time in the DMA @@ -1211,6 +1324,13 @@ void __init amd_iommu_detect(void) * ****************************************************************************/ +static int __init parse_amd_iommu_dump(char *str) +{ + amd_iommu_dump = true; + + return 1; +} + static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { @@ -1225,15 +1345,5 @@ static int __init parse_amd_iommu_options(char *str) return 1; } -static int __init parse_amd_iommu_size_options(char *str) -{ - unsigned order = PAGE_SHIFT + get_order(memparse(str, &str)); - - if ((order > 24) && (order < 31)) - amd_iommu_aperture_order = order; - - return 1; -} - +__setup("amd_iommu_dump", parse_amd_iommu_dump); __setup("amd_iommu=", parse_amd_iommu_options); -__setup("amd_iommu_size=", parse_amd_iommu_size_options); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f2870920f24..0a1c2830ec6 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -14,6 +14,7 @@ * Mikael Pettersson : PM converted to driver model. */ +#include <linux/perf_counter.h> #include <linux/kernel_stat.h> #include <linux/mc146818rtc.h> #include <linux/acpi_pmtmr.h> @@ -34,6 +35,7 @@ #include <linux/smp.h> #include <linux/mm.h> +#include <asm/perf_counter.h> #include <asm/pgalloc.h> #include <asm/atomic.h> #include <asm/mpspec.h> @@ -98,6 +100,29 @@ early_param("lapic", parse_lapic); /* Local APIC was disabled by the BIOS and enabled by the kernel */ static int enabled_via_apicbase; +/* + * Handle interrupt mode configuration register (IMCR). + * This register controls whether the interrupt signals + * that reach the BSP come from the master PIC or from the + * local APIC. Before entering Symmetric I/O Mode, either + * the BIOS or the operating system must switch out of + * PIC Mode by changing the IMCR. + */ +static inline void imcr_pic_to_apic(void) +{ + /* select IMCR register */ + outb(0x70, 0x22); + /* NMI and 8259 INTR go through APIC */ + outb(0x01, 0x23); +} + +static inline void imcr_apic_to_pic(void) +{ + /* select IMCR register */ + outb(0x70, 0x22); + /* NMI and 8259 INTR go directly to BSP */ + outb(0x00, 0x23); +} #endif #ifdef CONFIG_X86_64 @@ -111,14 +136,18 @@ static __init int setup_apicpmtimer(char *s) __setup("apicpmtimer", setup_apicpmtimer); #endif +int x2apic_mode; #ifdef CONFIG_X86_X2APIC -int x2apic; /* x2apic enabled before OS handover */ static int x2apic_preenabled; -static int disable_x2apic; static __init int setup_nox2apic(char *str) { - disable_x2apic = 1; + if (x2apic_enabled()) { + pr_warning("Bios already enabled x2apic, " + "can't enforce nox2apic"); + return 0; + } + setup_clear_cpu_cap(X86_FEATURE_X2APIC); return 0; } @@ -209,6 +238,31 @@ static int modern_apic(void) return lapic_get_version() >= 0x14; } +/* + * bare function to substitute write operation + * and it's _that_ fast :) + */ +static void native_apic_write_dummy(u32 reg, u32 v) +{ + WARN_ON_ONCE((cpu_has_apic || !disable_apic)); +} + +static u32 native_apic_read_dummy(u32 reg) +{ + WARN_ON_ONCE((cpu_has_apic && !disable_apic)); + return 0; +} + +/* + * right after this call apic->write/read doesn't do anything + * note that there is no restore operation it works one way + */ +void apic_disable(void) +{ + apic->read = native_apic_read_dummy; + apic->write = native_apic_write_dummy; +} + void native_apic_wait_icr_idle(void) { while (apic_read(APIC_ICR) & APIC_ICR_BUSY) @@ -348,7 +402,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) { - unsigned long reg = (lvt_off << 4) + APIC_EILVT0; + unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0); unsigned int v = (mask << 16) | (msg_type << 8) | vector; apic_write(reg, v); @@ -815,7 +869,7 @@ void clear_local_APIC(void) u32 v; /* APIC hasn't been mapped yet */ - if (!x2apic && !apic_phys) + if (!x2apic_mode && !apic_phys) return; maxlvt = lapic_get_maxlvt(); @@ -843,7 +897,7 @@ void clear_local_APIC(void) } /* lets not touch this if we didn't frob it */ -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) +#ifdef CONFIG_X86_THERMAL_VECTOR if (maxlvt >= 5) { v = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); @@ -1133,6 +1187,7 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif + perf_counters_lapic_init(); preempt_disable(); @@ -1287,7 +1342,7 @@ void check_x2apic(void) { if (x2apic_enabled()) { pr_info("x2apic enabled by BIOS, switching to x2apic ops\n"); - x2apic_preenabled = x2apic = 1; + x2apic_preenabled = x2apic_mode = 1; } } @@ -1295,7 +1350,7 @@ void enable_x2apic(void) { int msr, msr2; - if (!x2apic) + if (!x2apic_mode) return; rdmsr(MSR_IA32_APICBASE, msr, msr2); @@ -1304,6 +1359,7 @@ void enable_x2apic(void) wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); } } +#endif /* CONFIG_X86_X2APIC */ void __init enable_IR_x2apic(void) { @@ -1312,32 +1368,21 @@ void __init enable_IR_x2apic(void) unsigned long flags; struct IO_APIC_route_entry **ioapic_entries = NULL; - if (!cpu_has_x2apic) - return; - - if (!x2apic_preenabled && disable_x2apic) { - pr_info("Skipped enabling x2apic and Interrupt-remapping " - "because of nox2apic\n"); - return; + ret = dmar_table_init(); + if (ret) { + pr_debug("dmar_table_init() failed with %d:\n", ret); + goto ir_failed; } - if (x2apic_preenabled && disable_x2apic) - panic("Bios already enabled x2apic, can't enforce nox2apic"); - - if (!x2apic_preenabled && skip_ioapic_setup) { - pr_info("Skipped enabling x2apic and Interrupt-remapping " - "because of skipping io-apic setup\n"); - return; + if (!intr_remapping_supported()) { + pr_debug("intr-remapping not supported\n"); + goto ir_failed; } - ret = dmar_table_init(); - if (ret) { - pr_info("dmar_table_init() failed with %d:\n", ret); - if (x2apic_preenabled) - panic("x2apic enabled by bios. But IR enabling failed"); - else - pr_info("Not enabling x2apic,Intr-remapping\n"); + if (!x2apic_preenabled && skip_ioapic_setup) { + pr_info("Skipped enabling intr-remap because of skipping " + "io-apic setup\n"); return; } @@ -1357,19 +1402,16 @@ void __init enable_IR_x2apic(void) mask_IO_APIC_setup(ioapic_entries); mask_8259A(); - ret = enable_intr_remapping(EIM_32BIT_APIC_ID); - - if (ret && x2apic_preenabled) { - local_irq_restore(flags); - panic("x2apic enabled by bios. But IR enabling failed"); - } - + ret = enable_intr_remapping(x2apic_supported()); if (ret) goto end_restore; - if (!x2apic) { - x2apic = 1; + pr_info("Enabled Interrupt-remapping\n"); + + if (x2apic_supported() && !x2apic_mode) { + x2apic_mode = 1; enable_x2apic(); + pr_info("Enabled x2apic\n"); } end_restore: @@ -1378,37 +1420,34 @@ end_restore: * IR enabling failed */ restore_IO_APIC_setup(ioapic_entries); - else - reinit_intr_remapped_IO_APIC(x2apic_preenabled, ioapic_entries); unmask_8259A(); local_irq_restore(flags); end: - if (!ret) { - if (!x2apic_preenabled) - pr_info("Enabled x2apic and interrupt-remapping\n"); - else - pr_info("Enabled Interrupt-remapping\n"); - } else - pr_err("Failed to enable Interrupt-remapping and x2apic\n"); if (ioapic_entries) free_ioapic_entries(ioapic_entries); + + if (!ret) + return; + +ir_failed: + if (x2apic_preenabled) + panic("x2apic enabled by bios. But IR enabling failed"); + else if (cpu_has_x2apic) + pr_info("Not enabling x2apic,Intr-remapping\n"); #else if (!cpu_has_x2apic) return; if (x2apic_preenabled) panic("x2apic enabled prior OS handover," - " enable CONFIG_INTR_REMAP"); - - pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping " - " and x2apic\n"); + " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP"); #endif return; } -#endif /* CONFIG_X86_X2APIC */ + #ifdef CONFIG_X86_64 /* @@ -1425,7 +1464,6 @@ static int __init detect_init_APIC(void) } mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - boot_cpu_physical_apicid = 0; return 0; } #else @@ -1539,32 +1577,49 @@ void __init early_init_lapic_mapping(void) */ void __init init_apic_mappings(void) { - if (x2apic) { + unsigned int new_apicid; + + if (x2apic_mode) { boot_cpu_physical_apicid = read_apic_id(); return; } - /* - * If no local APIC can be found then set up a fake all - * zeroes page to simulate the local APIC and another - * one for the IO-APIC. - */ + /* If no local APIC can be found return early */ if (!smp_found_config && detect_init_APIC()) { - apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); - apic_phys = __pa(apic_phys); - } else + /* lets NOP'ify apic operations */ + pr_info("APIC: disable apic facility\n"); + apic_disable(); + } else { apic_phys = mp_lapic_addr; - set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", - APIC_BASE, apic_phys); + /* + * acpi lapic path already maps that address in + * acpi_register_lapic_address() + */ + if (!acpi_lapic) + set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + + apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", + APIC_BASE, apic_phys); + } /* * Fetch the APIC ID of the BSP in case we have a * default configuration (or the MP table is broken). */ - if (boot_cpu_physical_apicid == -1U) - boot_cpu_physical_apicid = read_apic_id(); + new_apicid = read_apic_id(); + if (boot_cpu_physical_apicid != new_apicid) { + boot_cpu_physical_apicid = new_apicid; + /* + * yeah -- we lie about apic_version + * in case if apic was disabled via boot option + * but it's not a problem for SMP compiled kernel + * since smp_sanity_check is prepared for such a case + * and disable smp mode + */ + apic_version[new_apicid] = + GET_APIC_VERSION(apic_read(APIC_LVR)); + } } /* @@ -1733,8 +1788,7 @@ void __init connect_bsp_APIC(void) */ apic_printk(APIC_VERBOSE, "leaving PIC mode, " "enabling APIC mode.\n"); - outb(0x70, 0x22); - outb(0x01, 0x23); + imcr_pic_to_apic(); } #endif if (apic->enable_apic_mode) @@ -1762,8 +1816,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) */ apic_printk(APIC_VERBOSE, "disabling APIC mode, " "entering PIC mode.\n"); - outb(0x70, 0x22); - outb(0x00, 0x23); + imcr_apic_to_pic(); return; } #endif @@ -1962,17 +2015,17 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) +#ifdef CONFIG_X86_THERMAL_VECTOR if (maxlvt >= 5) apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); #endif local_irq_save(flags); disable_local_APIC(); -#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) disable_intr_remapping(); -#endif + local_irq_restore(flags); return 0; } @@ -1982,42 +2035,34 @@ static int lapic_resume(struct sys_device *dev) unsigned int l, h; unsigned long flags; int maxlvt; - -#ifdef CONFIG_INTR_REMAP - int ret; + int ret = 0; struct IO_APIC_route_entry **ioapic_entries = NULL; if (!apic_pm_state.active) return 0; local_irq_save(flags); - if (x2apic) { + if (intr_remapping_enabled) { ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { WARN(1, "Alloc ioapic_entries in lapic resume failed."); - return -ENOMEM; + ret = -ENOMEM; + goto restore; } ret = save_IO_APIC_setup(ioapic_entries); if (ret) { WARN(1, "Saving IO-APIC state failed: %d\n", ret); free_ioapic_entries(ioapic_entries); - return ret; + goto restore; } mask_IO_APIC_setup(ioapic_entries); mask_8259A(); - enable_x2apic(); } -#else - if (!apic_pm_state.active) - return 0; - local_irq_save(flags); - if (x2apic) + if (x2apic_mode) enable_x2apic(); -#endif - else { /* * Make sure the APICBASE points to the right address @@ -2055,21 +2100,16 @@ static int lapic_resume(struct sys_device *dev) apic_write(APIC_ESR, 0); apic_read(APIC_ESR); -#ifdef CONFIG_INTR_REMAP - if (intr_remapping_enabled) - reenable_intr_remapping(EIM_32BIT_APIC_ID); - - if (x2apic) { + if (intr_remapping_enabled) { + reenable_intr_remapping(x2apic_mode); unmask_8259A(); restore_IO_APIC_setup(ioapic_entries); free_ioapic_entries(ioapic_entries); } -#endif - +restore: local_irq_restore(flags); - - return 0; + return ret; } /* @@ -2117,31 +2157,14 @@ static void apic_pm_activate(void) { } #endif /* CONFIG_PM */ #ifdef CONFIG_X86_64 -/* - * apic_is_clustered_box() -- Check if we can expect good TSC - * - * Thus far, the major user of this is IBM's Summit2 series: - * - * Clustered boxes may have unsynced TSC problems if they are - * multi-chassis. Use available data to take a good guess. - * If in doubt, go HPET. - */ -__cpuinit int apic_is_clustered_box(void) + +static int __cpuinit apic_cluster_num(void) { int i, clusters, zeros; unsigned id; u16 *bios_cpu_apicid; DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); - /* - * there is not this kind of box with AMD CPU yet. - * Some AMD box with quadcore cpu and 8 sockets apicid - * will be [4, 0x23] or [8, 0x27] could be thought to - * vsmp box still need checking... - */ - if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) - return 0; - bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); bitmap_zero(clustermap, NUM_APIC_CLUSTERS); @@ -2177,18 +2200,67 @@ __cpuinit int apic_is_clustered_box(void) ++zeros; } - /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are - * not guaranteed to be synced between boards - */ - if (is_vsmp_box() && clusters > 1) + return clusters; +} + +static int __cpuinitdata multi_checked; +static int __cpuinitdata multi; + +static int __cpuinit set_multi(const struct dmi_system_id *d) +{ + if (multi) + return 0; + pr_info("APIC: %s detected, Multi Chassis\n", d->ident); + multi = 1; + return 0; +} + +static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = { + { + .callback = set_multi, + .ident = "IBM System Summit2", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"), + }, + }, + {} +}; + +static void __cpuinit dmi_check_multi(void) +{ + if (multi_checked) + return; + + dmi_check_system(multi_dmi_table); + multi_checked = 1; +} + +/* + * apic_is_clustered_box() -- Check if we can expect good TSC + * + * Thus far, the major user of this is IBM's Summit2 series: + * Clustered boxes may have unsynced TSC problems if they are + * multi-chassis. + * Use DMI to check them + */ +__cpuinit int apic_is_clustered_box(void) +{ + dmi_check_multi(); + if (multi) return 1; + if (!is_vsmp_box()) + return 0; + /* - * If clusters > 2, then should be multi-chassis. - * May have to revisit this when multi-core + hyperthreaded CPUs come - * out, but AFAIK this will work even for them. + * ScaleMP vSMPowered boxes have one cluster per board and TSCs are + * not guaranteed to be synced between boards */ - return (clusters > 2); + if (apic_cluster_num() > 1) + return 1; + + return 0; } #endif diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 306e5e88fb6..d0c99abc26c 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -161,7 +161,7 @@ static int flat_apic_id_registered(void) static int flat_phys_pkg_id(int initial_apic_id, int index_msb) { - return hard_smp_processor_id() >> index_msb; + return initial_apic_id >> index_msb; } struct apic apic_flat = { @@ -235,7 +235,7 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) * regardless of how many processors are present (x86_64 ES7000 * is an example). */ - if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && + if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { printk(KERN_DEBUG "system APIC only can use physical flat"); return 1; diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 30294777557..8952a589028 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -145,7 +145,7 @@ es7000_rename_gsi(int ioapic, int gsi) return gsi; } -static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) +static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) { unsigned long vect = 0, psaival = 0; @@ -652,7 +652,8 @@ static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem, return ret && es7000_apic_is_cluster(); } -struct apic apic_es7000_cluster = { +/* We've been warned by a false positive warning.Use __refdata to keep calm. */ +struct apic __refdata apic_es7000_cluster = { .name = "es7000", .probe = probe_es7000, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 30da617d18e..d2ed6c5ddc8 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -59,6 +59,7 @@ #include <asm/setup.h> #include <asm/irq_remapping.h> #include <asm/hpet.h> +#include <asm/hw_irq.h> #include <asm/uv/uv_hub.h> #include <asm/uv/uv_irq.h> @@ -129,12 +130,9 @@ struct irq_pin_list { struct irq_pin_list *next; }; -static struct irq_pin_list *get_one_free_irq_2_pin(int cpu) +static struct irq_pin_list *get_one_free_irq_2_pin(int node) { struct irq_pin_list *pin; - int node; - - node = cpu_to_node(cpu); pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); @@ -148,9 +146,6 @@ struct irq_cfg { unsigned move_cleanup_count; u8 vector; u8 move_in_progress : 1; -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - u8 move_desc_pending : 1; -#endif }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ @@ -182,16 +177,18 @@ int __init arch_early_irq_init(void) struct irq_cfg *cfg; struct irq_desc *desc; int count; + int node; int i; cfg = irq_cfgx; count = ARRAY_SIZE(irq_cfgx); + node= cpu_to_node(boot_cpu_id); for (i = 0; i < count; i++) { desc = irq_to_desc(i); desc->chip_data = &cfg[i]; - alloc_bootmem_cpumask_var(&cfg[i].domain); - alloc_bootmem_cpumask_var(&cfg[i].old_domain); + zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); + zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); if (i < NR_IRQS_LEGACY) cpumask_setall(cfg[i].domain); } @@ -212,12 +209,9 @@ static struct irq_cfg *irq_cfg(unsigned int irq) return cfg; } -static struct irq_cfg *get_one_free_irq_cfg(int cpu) +static struct irq_cfg *get_one_free_irq_cfg(int node) { struct irq_cfg *cfg; - int node; - - node = cpu_to_node(cpu); cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); if (cfg) { @@ -238,13 +232,13 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu) return cfg; } -int arch_init_chip_data(struct irq_desc *desc, int cpu) +int arch_init_chip_data(struct irq_desc *desc, int node) { struct irq_cfg *cfg; cfg = desc->chip_data; if (!cfg) { - desc->chip_data = get_one_free_irq_cfg(cpu); + desc->chip_data = get_one_free_irq_cfg(node); if (!desc->chip_data) { printk(KERN_ERR "can not alloc irq_cfg\n"); BUG_ON(1); @@ -254,10 +248,9 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu) return 0; } -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - +/* for move_irq_desc */ static void -init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) +init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node) { struct irq_pin_list *old_entry, *head, *tail, *entry; @@ -266,7 +259,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) if (!old_entry) return; - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) return; @@ -276,7 +269,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) tail = entry; old_entry = old_entry->next; while (old_entry) { - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) { entry = head; while (entry) { @@ -316,12 +309,12 @@ static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg) } void arch_init_copy_chip_data(struct irq_desc *old_desc, - struct irq_desc *desc, int cpu) + struct irq_desc *desc, int node) { struct irq_cfg *cfg; struct irq_cfg *old_cfg; - cfg = get_one_free_irq_cfg(cpu); + cfg = get_one_free_irq_cfg(node); if (!cfg) return; @@ -332,7 +325,7 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc, memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); - init_copy_irq_2_pin(old_cfg, cfg, cpu); + init_copy_irq_2_pin(old_cfg, cfg, node); } static void free_irq_cfg(struct irq_cfg *old_cfg) @@ -356,19 +349,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) old_desc->chip_data = NULL; } } - -static void -set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) -{ - struct irq_cfg *cfg = desc->chip_data; - - if (!cfg->move_in_progress) { - /* it means that domain is not changed */ - if (!cpumask_intersects(desc->affinity, mask)) - cfg->move_desc_pending = 1; - } -} -#endif +/* end for move_irq_desc */ #else static struct irq_cfg *irq_cfg(unsigned int irq) @@ -378,13 +359,6 @@ static struct irq_cfg *irq_cfg(unsigned int irq) #endif -#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC -static inline void -set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) -{ -} -#endif - struct io_apic { unsigned int index; unsigned int unused[3]; @@ -488,7 +462,8 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { - union entry_union eu; + union entry_union eu = {{0, 0}}; + eu.entry = e; io_apic_write(apic, 0x11 + 2*pin, eu.w2); io_apic_write(apic, 0x10 + 2*pin, eu.w1); @@ -518,132 +493,18 @@ static void ioapic_mask_entry(int apic, int pin) spin_unlock_irqrestore(&ioapic_lock, flags); } -#ifdef CONFIG_SMP -static void send_cleanup_vector(struct irq_cfg *cfg) -{ - cpumask_var_t cleanup_mask; - - if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { - unsigned int i; - cfg->move_cleanup_count = 0; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - cfg->move_cleanup_count++; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); - } else { - cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); - cfg->move_cleanup_count = cpumask_weight(cleanup_mask); - apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - free_cpumask_var(cleanup_mask); - } - cfg->move_in_progress = 0; -} - -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) -{ - int apic, pin; - struct irq_pin_list *entry; - u8 vector = cfg->vector; - - entry = cfg->irq_2_pin; - for (;;) { - unsigned int reg; - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; - /* - * With interrupt-remapping, destination information comes - * from interrupt-remapping table entry. - */ - if (!irq_remapped(irq)) - io_apic_write(apic, 0x11 + pin*2, dest); - reg = io_apic_read(apic, 0x10 + pin*2); - reg &= ~IO_APIC_REDIR_VECTOR_MASK; - reg |= vector; - io_apic_modify(apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; - } -} - -static int -assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); - -/* - * Either sets desc->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that, or returns BAD_APICID and - * leaves desc->affinity untouched. - */ -static unsigned int -set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) -{ - struct irq_cfg *cfg; - unsigned int irq; - - if (!cpumask_intersects(mask, cpu_online_mask)) - return BAD_APICID; - - irq = desc->irq; - cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, mask)) - return BAD_APICID; - - /* check that before desc->addinity get updated */ - set_extra_move_desc(desc, mask); - - cpumask_copy(desc->affinity, mask); - - return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); -} - -static void -set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int dest; - unsigned int irq; - - irq = desc->irq; - cfg = desc->chip_data; - - spin_lock_irqsave(&ioapic_lock, flags); - dest = set_desc_affinity(desc, mask); - if (dest != BAD_APICID) { - /* Only the high 8 bits are valid. */ - dest = SET_APIC_LOGICAL_ID(dest); - __target_IO_APIC_irq(irq, dest, cfg); - } - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void -set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - - set_ioapic_affinity_irq_desc(desc, mask); -} -#endif /* CONFIG_SMP */ - /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) +static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { struct irq_pin_list *entry; entry = cfg->irq_2_pin; if (!entry) { - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) { printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", apic, pin); @@ -663,7 +524,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) entry = entry->next; } - entry->next = get_one_free_irq_2_pin(cpu); + entry->next = get_one_free_irq_2_pin(node); entry = entry->next; entry->apic = apic; entry->pin = pin; @@ -672,7 +533,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) /* * Reroute an IRQ to a different pin. */ -static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, +static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, int oldapic, int oldpin, int newapic, int newpin) { @@ -692,7 +553,7 @@ static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, /* why? call replace before add? */ if (!replaced) - add_pin_to_irq_cpu(cfg, cpu, newapic, newpin); + add_pin_to_irq_node(cfg, node, newapic, newpin); } static inline void io_apic_modify_irq(struct irq_cfg *cfg, @@ -850,7 +711,6 @@ static int __init ioapic_pirq_setup(char *str) __setup("pirq=", ioapic_pirq_setup); #endif /* CONFIG_X86_32 */ -#ifdef CONFIG_INTR_REMAP struct IO_APIC_route_entry **alloc_ioapic_entries(void) { int apic; @@ -948,20 +808,6 @@ int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) return 0; } -void reinit_intr_remapped_IO_APIC(int intr_remapping, - struct IO_APIC_route_entry **ioapic_entries) - -{ - /* - * for now plain restore of previous settings. - * TBD: In the case of OS enabling interrupt-remapping, - * IO-APIC RTE's need to be setup to point to interrupt-remapping - * table entries. for now, do a plain restore, and wait for - * the setup_IO_APIC_irqs() to do proper initialization. - */ - restore_IO_APIC_setup(ioapic_entries); -} - void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) { int apic; @@ -971,7 +817,6 @@ void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) kfree(ioapic_entries); } -#endif /* * Find the IRQ entry number of a certain pin. @@ -1032,54 +877,6 @@ static int __init find_isa_irq_apic(int irq, int type) return -1; } -/* - * Find a specific PCI IRQ entry. - * Not an __init, possibly needed by modules - */ -static int pin_2_irq(int idx, int apic, int pin); - -int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) -{ - int apic, i, best_guess = -1; - - apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", - bus, slot, pin); - if (test_bit(bus, mp_bus_not_pci)) { - apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); - return -1; - } - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].srcbus; - - for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || - mp_irqs[i].dstapic == MP_APIC_ALL) - break; - - if (!test_bit(lbus, mp_bus_not_pci) && - !mp_irqs[i].irqtype && - (bus == lbus) && - (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq); - - if (!(apic || IO_APIC_IRQ(irq))) - continue; - - if (pin == (mp_irqs[i].srcbusirq & 3)) - return irq; - /* - * Use the first all-but-pin matching entry as a - * best-guess fuzzy result for broken mptables. - */ - if (best_guess < 0) - best_guess = irq; - } - } - return best_guess; -} - -EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); - #if defined(CONFIG_EISA) || defined(CONFIG_MCA) /* * EISA Edge/Level control register, ELCR @@ -1298,6 +1095,64 @@ static int pin_2_irq(int idx, int apic, int pin) return irq; } +/* + * Find a specific PCI IRQ entry. + * Not an __init, possibly needed by modules + */ +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, + struct io_apic_irq_attr *irq_attr) +{ + int apic, i, best_guess = -1; + + apic_printk(APIC_DEBUG, + "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", + bus, slot, pin); + if (test_bit(bus, mp_bus_not_pci)) { + apic_printk(APIC_VERBOSE, + "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + return -1; + } + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].srcbus; + + for (apic = 0; apic < nr_ioapics; apic++) + if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || + mp_irqs[i].dstapic == MP_APIC_ALL) + break; + + if (!test_bit(lbus, mp_bus_not_pci) && + !mp_irqs[i].irqtype && + (bus == lbus) && + (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq); + + if (!(apic || IO_APIC_IRQ(irq))) + continue; + + if (pin == (mp_irqs[i].srcbusirq & 3)) { + set_io_apic_irq_attr(irq_attr, apic, + mp_irqs[i].dstirq, + irq_trigger(i), + irq_polarity(i)); + return irq; + } + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. + */ + if (best_guess < 0) { + set_io_apic_irq_attr(irq_attr, apic, + mp_irqs[i].dstirq, + irq_trigger(i), + irq_polarity(i)); + best_guess = irq; + } + } + } + return best_guess; +} +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); + void lock_vector_lock(void) { /* Used to the online set of cpus does not change @@ -1559,6 +1414,9 @@ int setup_ioapic_entry(int apic_id, int irq, irte.vector = vector; irte.dest_id = IRTE_DEST(destination); + /* Set source-id of interrupt request */ + set_ioapic_sid(&irte, apic_id); + modify_irte(irq, &irte); ir_entry->index2 = (index >> 15) & 0x1; @@ -1628,58 +1486,70 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq ioapic_write_entry(apic_id, pin, entry); } +static struct { + DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); +} mp_ioapic_routing[MAX_IO_APICS]; + static void __init setup_IO_APIC_irqs(void) { - int apic_id, pin, idx, irq; + int apic_id = 0, pin, idx, irq; int notcon = 0; struct irq_desc *desc; struct irq_cfg *cfg; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { - for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { - - idx = find_irq_entry(apic_id, pin, mp_INT); - if (idx == -1) { - if (!notcon) { - notcon = 1; - apic_printk(APIC_VERBOSE, - KERN_DEBUG " %d-%d", - mp_ioapics[apic_id].apicid, pin); - } else - apic_printk(APIC_VERBOSE, " %d-%d", - mp_ioapics[apic_id].apicid, pin); - continue; - } - if (notcon) { +#ifdef CONFIG_ACPI + if (!acpi_disabled && acpi_ioapic) { + apic_id = mp_find_ioapic(0); + if (apic_id < 0) + apic_id = 0; + } +#endif + + for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { + idx = find_irq_entry(apic_id, pin, mp_INT); + if (idx == -1) { + if (!notcon) { + notcon = 1; apic_printk(APIC_VERBOSE, - " (apicid-pin) not connected\n"); - notcon = 0; - } + KERN_DEBUG " %d-%d", + mp_ioapics[apic_id].apicid, pin); + } else + apic_printk(APIC_VERBOSE, " %d-%d", + mp_ioapics[apic_id].apicid, pin); + continue; + } + if (notcon) { + apic_printk(APIC_VERBOSE, + " (apicid-pin) not connected\n"); + notcon = 0; + } - irq = pin_2_irq(idx, apic_id, pin); + irq = pin_2_irq(idx, apic_id, pin); - /* - * Skip the timer IRQ if there's a quirk handler - * installed and if it returns 1: - */ - if (apic->multi_timer_check && - apic->multi_timer_check(apic_id, irq)) - continue; - - desc = irq_to_desc_alloc_cpu(irq, cpu); - if (!desc) { - printk(KERN_INFO "can not get irq_desc for %d\n", irq); - continue; - } - cfg = desc->chip_data; - add_pin_to_irq_cpu(cfg, cpu, apic_id, pin); + /* + * Skip the timer IRQ if there's a quirk handler + * installed and if it returns 1: + */ + if (apic->multi_timer_check && + apic->multi_timer_check(apic_id, irq)) + continue; - setup_IO_APIC_irq(apic_id, pin, irq, desc, - irq_trigger(idx), irq_polarity(idx)); + desc = irq_to_desc_alloc_node(irq, node); + if (!desc) { + printk(KERN_INFO "can not get irq_desc for %d\n", irq); + continue; } + cfg = desc->chip_data; + add_pin_to_irq_node(cfg, node, apic_id, pin); + /* + * don't mark it in pin_programmed, so later acpi could + * set it correctly when irq < 16 + */ + setup_IO_APIC_irq(apic_id, pin, irq, desc, + irq_trigger(idx), irq_polarity(idx)); } if (notcon) @@ -1846,36 +1716,30 @@ __apicdebuginit(void) print_IO_APIC(void) return; } -__apicdebuginit(void) print_APIC_bitfield(int base) +__apicdebuginit(void) print_APIC_field(int base) { - unsigned int v; - int i, j; + int i; if (apic_verbosity == APIC_QUIET) return; - printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); - for (i = 0; i < 8; i++) { - v = apic_read(base + i*0x10); - for (j = 0; j < 32; j++) { - if (v & (1<<j)) - printk("1"); - else - printk("0"); - } - printk("\n"); - } + printk(KERN_DEBUG); + + for (i = 0; i < 8; i++) + printk(KERN_CONT "%08x", apic_read(base + i*0x10)); + + printk(KERN_CONT "\n"); } __apicdebuginit(void) print_local_APIC(void *dummy) { - unsigned int v, ver, maxlvt; + unsigned int i, v, ver, maxlvt; u64 icr; if (apic_verbosity == APIC_QUIET) return; - printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", + printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", smp_processor_id(), hard_smp_processor_id()); v = apic_read(APIC_ID); printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id()); @@ -1916,11 +1780,11 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); printk(KERN_DEBUG "... APIC ISR field:\n"); - print_APIC_bitfield(APIC_ISR); + print_APIC_field(APIC_ISR); printk(KERN_DEBUG "... APIC TMR field:\n"); - print_APIC_bitfield(APIC_TMR); + print_APIC_field(APIC_TMR); printk(KERN_DEBUG "... APIC IRR field:\n"); - print_APIC_bitfield(APIC_IRR); + print_APIC_field(APIC_IRR); if (APIC_INTEGRATED(ver)) { /* !82489DX */ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ @@ -1957,6 +1821,18 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); v = apic_read(APIC_TDCR); printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); + + if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { + v = apic_read(APIC_EFEAT); + maxlvt = (v >> 16) & 0xff; + printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v); + v = apic_read(APIC_ECTRL); + printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v); + for (i = 0; i < maxlvt; i++) { + v = apic_read(APIC_EILVTn(i)); + printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v); + } + } printk("\n"); } @@ -2005,6 +1881,11 @@ __apicdebuginit(void) print_PIC(void) __apicdebuginit(int) print_all_ICs(void) { print_PIC(); + + /* don't print out if apic is not there */ + if (!cpu_has_apic || disable_apic) + return 0; + print_all_local_APICs(); print_IO_APIC(); @@ -2120,7 +2001,9 @@ void disable_IO_APIC(void) /* * Use virtual wire A mode when interrupt remapping is enabled. */ - disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1); + if (cpu_has_apic) + disconnect_bsp_APIC(!intr_remapping_enabled && + ioapic_i8259.pin != -1); } #ifdef CONFIG_X86_32 @@ -2360,6 +2243,118 @@ static int ioapic_retrigger_irq(unsigned int irq) */ #ifdef CONFIG_SMP +static void send_cleanup_vector(struct irq_cfg *cfg) +{ + cpumask_var_t cleanup_mask; + + if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { + unsigned int i; + cfg->move_cleanup_count = 0; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + cfg->move_cleanup_count++; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); + } else { + cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + cfg->move_cleanup_count = cpumask_weight(cleanup_mask); + apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + free_cpumask_var(cleanup_mask); + } + cfg->move_in_progress = 0; +} + +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + u8 vector = cfg->vector; + + entry = cfg->irq_2_pin; + for (;;) { + unsigned int reg; + + if (!entry) + break; + + apic = entry->apic; + pin = entry->pin; + /* + * With interrupt-remapping, destination information comes + * from interrupt-remapping table entry. + */ + if (!irq_remapped(irq)) + io_apic_write(apic, 0x11 + pin*2, dest); + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~IO_APIC_REDIR_VECTOR_MASK; + reg |= vector; + io_apic_modify(apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = entry->next; + } +} + +static int +assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); + +/* + * Either sets desc->affinity to a valid value, and returns + * ->cpu_mask_to_apicid of that, or returns BAD_APICID and + * leaves desc->affinity untouched. + */ +static unsigned int +set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg; + unsigned int irq; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return BAD_APICID; + + irq = desc->irq; + cfg = desc->chip_data; + if (assign_irq_vector(irq, cfg, mask)) + return BAD_APICID; + + cpumask_copy(desc->affinity, mask); + + return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); +} + +static int +set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int dest; + unsigned int irq; + int ret = -1; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + dest = set_desc_affinity(desc, mask); + if (dest != BAD_APICID) { + /* Only the high 8 bits are valid. */ + dest = SET_APIC_LOGICAL_ID(dest); + __target_IO_APIC_irq(irq, dest, cfg); + ret = 0; + } + spin_unlock_irqrestore(&ioapic_lock, flags); + + return ret; +} + +static int +set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + + return set_ioapic_affinity_irq_desc(desc, mask); +} #ifdef CONFIG_INTR_REMAP @@ -2374,26 +2369,25 @@ static int ioapic_retrigger_irq(unsigned int irq) * Real vector that is used for interrupting cpu will be coming from * the interrupt-remapping table entry. */ -static void +static int migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; struct irte irte; unsigned int dest; unsigned int irq; + int ret = -1; if (!cpumask_intersects(mask, cpu_online_mask)) - return; + return ret; irq = desc->irq; if (get_irte(irq, &irte)) - return; + return ret; cfg = desc->chip_data; if (assign_irq_vector(irq, cfg, mask)) - return; - - set_extra_move_desc(desc, mask); + return ret; dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); @@ -2409,27 +2403,30 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) send_cleanup_vector(cfg); cpumask_copy(desc->affinity, mask); + + return 0; } /* * Migrates the IRQ destination in the process context. */ -static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, +static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { - migrate_ioapic_irq_desc(desc, mask); + return migrate_ioapic_irq_desc(desc, mask); } -static void set_ir_ioapic_affinity_irq(unsigned int irq, +static int set_ir_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); - set_ir_ioapic_affinity_irq_desc(desc, mask); + return set_ir_ioapic_affinity_irq_desc(desc, mask); } #else -static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, +static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { + return 0; } #endif @@ -2491,86 +2488,19 @@ static void irq_complete_move(struct irq_desc **descp) struct irq_cfg *cfg = desc->chip_data; unsigned vector, me; - if (likely(!cfg->move_in_progress)) { -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - if (likely(!cfg->move_desc_pending)) - return; - - /* domain has not changed, but affinity did */ - me = smp_processor_id(); - if (cpumask_test_cpu(me, desc->affinity)) { - *descp = desc = move_irq_desc(desc, me); - /* get the new one */ - cfg = desc->chip_data; - cfg->move_desc_pending = 0; - } -#endif + if (likely(!cfg->move_in_progress)) return; - } vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); - if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) { -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - *descp = desc = move_irq_desc(desc, me); - /* get the new one */ - cfg = desc->chip_data; -#endif + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) send_cleanup_vector(cfg); - } } #else static inline void irq_complete_move(struct irq_desc **descp) {} #endif -static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) -{ - int apic, pin; - struct irq_pin_list *entry; - - entry = cfg->irq_2_pin; - for (;;) { - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; - io_apic_eoi(apic, pin); - entry = entry->next; - } -} - -static void -eoi_ioapic_irq(struct irq_desc *desc) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int irq; - - irq = desc->irq; - cfg = desc->chip_data; - - spin_lock_irqsave(&ioapic_lock, flags); - __eoi_ioapic_irq(irq, cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -#ifdef CONFIG_X86_X2APIC -static void ack_x2apic_level(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - ack_x2APIC_irq(); - eoi_ioapic_irq(desc); -} - -static void ack_x2apic_edge(unsigned int irq) -{ - ack_x2APIC_irq(); -} -#endif - static void ack_apic_edge(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -2634,9 +2564,6 @@ static void ack_apic_level(unsigned int irq) */ ack_APIC_irq(); - if (irq_remapped(irq)) - eoi_ioapic_irq(desc); - /* Now we can move and renable the irq */ if (unlikely(do_unmask_irq)) { /* Only migrate the irq if the ack has been received. @@ -2683,22 +2610,50 @@ static void ack_apic_level(unsigned int irq) } #ifdef CONFIG_INTR_REMAP +static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + + entry = cfg->irq_2_pin; + for (;;) { + + if (!entry) + break; + + apic = entry->apic; + pin = entry->pin; + io_apic_eoi(apic, pin); + entry = entry->next; + } +} + +static void +eoi_ioapic_irq(struct irq_desc *desc) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int irq; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + __eoi_ioapic_irq(irq, cfg); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + static void ir_ack_apic_edge(unsigned int irq) { -#ifdef CONFIG_X86_X2APIC - if (x2apic_enabled()) - return ack_x2apic_edge(irq); -#endif - return ack_apic_edge(irq); + ack_APIC_irq(); } static void ir_ack_apic_level(unsigned int irq) { -#ifdef CONFIG_X86_X2APIC - if (x2apic_enabled()) - return ack_x2apic_level(irq); -#endif - return ack_apic_level(irq); + struct irq_desc *desc = irq_to_desc(irq); + + ack_APIC_irq(); + eoi_ioapic_irq(desc); } #endif /* CONFIG_INTR_REMAP */ @@ -2903,7 +2858,7 @@ static inline void __init check_timer(void) { struct irq_desc *desc = irq_to_desc(0); struct irq_cfg *cfg = desc->chip_data; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); int apic1, pin1, apic2, pin2; unsigned long flags; int no_pin1 = 0; @@ -2969,7 +2924,7 @@ static inline void __init check_timer(void) * Ok, does IRQ0 through the IOAPIC work? */ if (no_pin1) { - add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); + add_pin_to_irq_node(cfg, node, apic1, pin1); setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } else { /* for edge trigger, setup_IO_APIC_irq already @@ -3006,7 +2961,7 @@ static inline void __init check_timer(void) /* * legacy devices should be connected to IO APIC #0 */ - replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); + replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); enable_8259A_irq(0); if (timer_irq_works()) { @@ -3218,14 +3173,13 @@ static int nr_irqs_gsi = NR_IRQS_LEGACY; /* * Dynamic irq allocate and deallocation */ -unsigned int create_irq_nr(unsigned int irq_want) +unsigned int create_irq_nr(unsigned int irq_want, int node) { /* Allocate an unused irq */ unsigned int irq; unsigned int new; unsigned long flags; struct irq_cfg *cfg_new = NULL; - int cpu = boot_cpu_id; struct irq_desc *desc_new = NULL; irq = 0; @@ -3234,7 +3188,7 @@ unsigned int create_irq_nr(unsigned int irq_want) spin_lock_irqsave(&vector_lock, flags); for (new = irq_want; new < nr_irqs; new++) { - desc_new = irq_to_desc_alloc_cpu(new, cpu); + desc_new = irq_to_desc_alloc_node(new, node); if (!desc_new) { printk(KERN_INFO "can not get irq_desc for %d\n", new); continue; @@ -3243,6 +3197,9 @@ unsigned int create_irq_nr(unsigned int irq_want) if (cfg_new->vector != 0) continue; + + desc_new = move_irq_desc(desc_new, node); + if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; break; @@ -3260,11 +3217,12 @@ unsigned int create_irq_nr(unsigned int irq_want) int create_irq(void) { + int node = cpu_to_node(boot_cpu_id); unsigned int irq_want; int irq; irq_want = nr_irqs_gsi; - irq = create_irq_nr(irq_want); + irq = create_irq_nr(irq_want, node); if (irq == 0) irq = -1; @@ -3329,6 +3287,9 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); + /* Set source-id of interrupt request */ + set_msi_sid(&irte, pdev); + modify_irte(irq, &irte); msg->address_hi = MSI_ADDR_BASE_HI; @@ -3366,7 +3327,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms } #ifdef CONFIG_SMP -static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3375,7 +3336,7 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3387,13 +3348,15 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg_desc(desc, &msg); + + return 0; } #ifdef CONFIG_INTR_REMAP /* * Migrate the MSI irq to another cpumask. This migration is * done in the process context using interrupt-remapping hardware. */ -static void +static int ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); @@ -3402,11 +3365,11 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) struct irte irte; if (get_irte(irq, &irte)) - return; + return -1; dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); @@ -3423,6 +3386,8 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) */ if (cfg->move_in_progress) send_cleanup_vector(cfg); + + return 0; } #endif @@ -3518,15 +3483,17 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) unsigned int irq_want; struct intel_iommu *iommu = NULL; int index = 0; + int node; /* x86 doesn't support multiple MSI yet */ if (type == PCI_CAP_ID_MSI && nvec > 1) return 1; + node = dev_to_node(&dev->dev); irq_want = nr_irqs_gsi; sub_handle = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = create_irq_nr(irq_want); + irq = create_irq_nr(irq_want, node); if (irq == 0) return -1; irq_want = irq + 1; @@ -3576,7 +3543,7 @@ void arch_teardown_msi_irq(unsigned int irq) #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) #ifdef CONFIG_SMP -static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3585,7 +3552,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3597,11 +3564,13 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); dmar_msi_write(irq, &msg); + + return 0; } #endif /* CONFIG_SMP */ -struct irq_chip dmar_msi_type = { +static struct irq_chip dmar_msi_type = { .name = "DMAR_MSI", .unmask = dmar_msi_unmask, .mask = dmar_msi_mask, @@ -3630,7 +3599,7 @@ int arch_setup_dmar_msi(unsigned int irq) #ifdef CONFIG_HPET_TIMER #ifdef CONFIG_SMP -static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3639,7 +3608,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3651,6 +3620,8 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); hpet_msi_write(irq, &msg); + + return 0; } #endif /* CONFIG_SMP */ @@ -3707,7 +3678,7 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) write_ht_irq_msg(irq, &msg); } -static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3715,11 +3686,13 @@ static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; target_ht_irq(irq, dest, cfg->vector); + + return 0; } #endif @@ -3794,6 +3767,8 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long flags; int err; + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + cfg = irq_cfg(irq); err = assign_irq_vector(irq, cfg, eligible_cpu); @@ -3807,19 +3782,20 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + return irq; } @@ -3833,10 +3809,10 @@ void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) struct uv_IO_APIC_route_entry *entry; int mmr_pnode; + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - entry->mask = 1; mmr_pnode = uv_blade_to_pnode(mmr_blade); @@ -3900,6 +3876,71 @@ int __init arch_probe_nr_irqs(void) } #endif +static int __io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) +{ + struct irq_desc *desc; + struct irq_cfg *cfg; + int node; + int ioapic, pin; + int trigger, polarity; + + ioapic = irq_attr->ioapic; + if (!IO_APIC_IRQ(irq)) { + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", + ioapic); + return -EINVAL; + } + + if (dev) + node = dev_to_node(dev); + else + node = cpu_to_node(boot_cpu_id); + + desc = irq_to_desc_alloc_node(irq, node); + if (!desc) { + printk(KERN_INFO "can not get irq_desc %d\n", irq); + return 0; + } + + pin = irq_attr->ioapic_pin; + trigger = irq_attr->trigger; + polarity = irq_attr->polarity; + + /* + * IRQs < 16 are already in the irq_2_pin[] map + */ + if (irq >= NR_IRQS_LEGACY) { + cfg = desc->chip_data; + add_pin_to_irq_node(cfg, node, ioapic, pin); + } + + setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity); + + return 0; +} + +int io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) +{ + int ioapic, pin; + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. + */ + ioapic = irq_attr->ioapic; + pin = irq_attr->ioapic_pin; + if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) { + pr_debug("Pin %d-%d already programmed\n", + mp_ioapics[ioapic].apicid, pin); + return 0; + } + set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed); + + return __io_apic_set_pci_routing(dev, irq, irq_attr); +} + /* -------------------------------------------------------------------------- ACPI-based IOAPIC Configuration -------------------------------------------------------------------------- */ @@ -3980,6 +4021,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) return apic_id; } +#endif int __init io_apic_get_version(int ioapic) { @@ -3992,39 +4034,6 @@ int __init io_apic_get_version(int ioapic) return reg_01.bits.version; } -#endif - -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) -{ - struct irq_desc *desc; - struct irq_cfg *cfg; - int cpu = boot_cpu_id; - - if (!IO_APIC_IRQ(irq)) { - apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); - return -EINVAL; - } - - desc = irq_to_desc_alloc_cpu(irq, cpu); - if (!desc) { - printk(KERN_INFO "can not get irq_desc %d\n", irq); - return 0; - } - - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= NR_IRQS_LEGACY) { - cfg = desc->chip_data; - add_pin_to_irq_cpu(cfg, cpu, ioapic, pin); - } - - setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); - - return 0; -} - int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) { @@ -4055,51 +4064,44 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) #ifdef CONFIG_SMP void __init setup_ioapic_dest(void) { - int pin, ioapic, irq, irq_entry; + int pin, ioapic = 0, irq, irq_entry; struct irq_desc *desc; - struct irq_cfg *cfg; const struct cpumask *mask; if (skip_ioapic_setup == 1) return; - for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - irq = pin_2_irq(irq_entry, ioapic, pin); - - /* setup_IO_APIC_irqs could fail to get vector for some device - * when you have too many devices, because at that time only boot - * cpu is online. - */ - desc = irq_to_desc(irq); - cfg = desc->chip_data; - if (!cfg->vector) { - setup_IO_APIC_irq(ioapic, pin, irq, desc, - irq_trigger(irq_entry), - irq_polarity(irq_entry)); - continue; +#ifdef CONFIG_ACPI + if (!acpi_disabled && acpi_ioapic) { + ioapic = mp_find_ioapic(0); + if (ioapic < 0) + ioapic = 0; + } +#endif - } + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); - /* - * Honour affinities which have been set in early boot - */ - if (desc->status & - (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = desc->affinity; - else - mask = apic->target_cpus(); + desc = irq_to_desc(irq); - if (intr_remapping_enabled) - set_ir_ioapic_affinity_irq_desc(desc, mask); - else - set_ioapic_affinity_irq_desc(desc, mask); - } + /* + * Honour affinities which have been set in early boot + */ + if (desc->status & + (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) + mask = desc->affinity; + else + mask = apic->target_cpus(); + if (intr_remapping_enabled) + set_ir_ioapic_affinity_irq_desc(desc, mask); + else + set_ioapic_affinity_irq_desc(desc, mask); } + } #endif @@ -4182,28 +4184,20 @@ fake_ioapic_page: } } -static int __init ioapic_insert_resources(void) +void __init ioapic_insert_resources(void) { int i; struct resource *r = ioapic_resources; if (!r) { - if (nr_ioapics > 0) { + if (nr_ioapics > 0) printk(KERN_ERR "IO APIC resources couldn't be allocated.\n"); - return -1; - } - return 0; + return; } for (i = 0; i < nr_ioapics; i++) { insert_resource(&iomem_resource, r); r++; } - - return 0; } - -/* Insert the IO APIC resources after PCI initialization has occured to handle - * IO APICS that are mapped in on a BAR in PCI space. */ -late_initcall(ioapic_insert_resources); diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index dbf5445727a..6ef00ba4c88 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -106,6 +106,9 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) unsigned long mask = cpumask_bits(cpumask)[0]; unsigned long flags; + if (WARN_ONCE(!mask, "empty IPI mask")) + return; + local_irq_save(flags); WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); __default_send_IPI_dest_field(mask, vector, apic->dest_logical); diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index ce4fbfa315a..b3025b43b63 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu) static inline int mce_in_progress(void) { -#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) +#if defined(CONFIG_X86_NEW_MCE) return atomic_read(&mce_entry) > 0; #endif return 0; @@ -104,7 +104,7 @@ static __init void nmi_cpu_busy(void *data) } #endif -static void report_broken_nmi(int cpu, int *prev_nmi_count) +static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count) { printk(KERN_CONT "\n"); diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 533e59c6fc8..ca96e68f0d2 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -493,7 +493,8 @@ static void numaq_setup_portio_remap(void) (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); } -struct apic apic_numaq = { +/* Use __refdata to keep false positive warning calm. */ +struct apic __refdata apic_numaq = { .name = "NUMAQ", .probe = probe_numaq, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 01eda2ac65e..0c0182cc947 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -20,23 +20,12 @@ #include <asm/apic.h> #include <asm/setup.h> -#include <linux/threads.h> -#include <linux/cpumask.h> -#include <asm/mpspec.h> -#include <asm/fixmap.h> -#include <asm/apicdef.h> -#include <linux/kernel.h> -#include <linux/string.h> #include <linux/smp.h> -#include <linux/init.h> #include <asm/ipi.h> -#include <linux/smp.h> -#include <linux/init.h> #include <linux/interrupt.h> #include <asm/acpi.h> #include <asm/e820.h> -#include <asm/setup.h> #ifdef CONFIG_HOTPLUG_CPU #define DEFAULT_SEND_IPI (1) @@ -160,7 +149,6 @@ extern struct apic apic_summit; extern struct apic apic_bigsmp; extern struct apic apic_es7000; extern struct apic apic_es7000_cluster; -extern struct apic apic_default; struct apic *apic = &apic_default; EXPORT_SYMBOL_GPL(apic); diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 1783652bb0e..bc3e880f9b8 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -50,7 +50,7 @@ static struct apic *apic_probe[] __initdata = { void __init default_setup_apic_routing(void) { #ifdef CONFIG_X86_X2APIC - if (x2apic && (apic != &apic_x2apic_phys && + if (x2apic_mode && (apic != &apic_x2apic_phys && #ifdef CONFIG_X86_UV apic != &apic_x2apic_uv_x && #endif diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 9cfe1f415d8..eafdfbd1ea9 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -44,7 +44,6 @@ #include <asm/ipi.h> #include <linux/kernel.h> #include <linux/string.h> -#include <linux/init.h> #include <linux/gfp.h> #include <linux/smp.h> @@ -173,13 +172,6 @@ static inline int is_WPEG(struct rio_detail *rio){ rio->type == LookOutAWPEG || rio->type == LookOutBWPEG); } - -/* In clustered mode, the high nibble of APIC ID is a cluster number. - * The low nibble is a 4-bit bitmap. */ -#define XAPIC_DEST_CPUS_SHIFT 4 -#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1) -#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT) - #define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER) static const struct cpumask *summit_target_cpus(void) diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 4a903e2f0d1..a5371ec3677 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -10,18 +10,20 @@ #include <asm/apic.h> #include <asm/ipi.h> -DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); +static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return x2apic_enabled(); } -/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ - +/* + * need to use more than cpu 0, because we need more vectors when + * MSI-X are used. + */ static const struct cpumask *x2apic_target_cpus(void) { - return cpumask_of(0); + return cpu_online_mask; } /* @@ -170,7 +172,7 @@ static unsigned long set_apic_id(unsigned int id) static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb) { - return current_cpu_data.initial_apicid >> index_msb; + return initial_apicid >> index_msb; } static void x2apic_send_IPI_self(int vector) diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index a284359627e..a8989aadc99 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -27,11 +27,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ - +/* + * need to use more than cpu 0, because we need more vectors when + * MSI-X are used. + */ static const struct cpumask *x2apic_target_cpus(void) { - return cpumask_of(0); + return cpu_online_mask; } static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) @@ -162,7 +164,7 @@ static unsigned long set_apic_id(unsigned int id) static int x2apic_phys_pkg_id(int initial_apicid, int index_msb) { - return current_cpu_data.initial_apicid >> index_msb; + return initial_apicid >> index_msb; } static void x2apic_send_IPI_self(int vector) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 2bda6935297..601159374e8 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -46,7 +46,7 @@ static int early_get_nodeid(void) return node_id.s.node_id; } -static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { if (!strcmp(oem_id, "SGI")) { if (!strcmp(oem_table_id, "UVL")) @@ -105,7 +105,7 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) cpumask_set_cpu(cpu, retmask); } -static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) +static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) { #ifdef CONFIG_SMP unsigned long val; @@ -253,7 +253,7 @@ static void uv_send_IPI_self(int vector) apic_write(APIC_SELF_IPI, vector); } -struct apic apic_x2apic_uv_x = { +struct apic __refdata apic_x2apic_uv_x = { .name = "UV large system", .probe = NULL, @@ -261,7 +261,7 @@ struct apic apic_x2apic_uv_x = { .apic_id_registered = uv_apic_id_registered, .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 1, /* logical */ + .irq_dest_mode = 0, /* physical */ .target_cpus = uv_target_cpus, .disable_esr = 0, @@ -362,12 +362,6 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) BUG(); } -static __init void map_low_mmrs(void) -{ - init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); - init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); -} - enum map_type {map_wb, map_uc}; static __init void map_high(char *id, unsigned long base, int shift, @@ -395,26 +389,6 @@ static __init void map_gru_high(int max_pnode) map_high("GRU", gru.s.base, shift, max_pnode, map_wb); } -static __init void map_config_high(int max_pnode) -{ - union uvh_rh_gam_cfg_overlay_config_mmr_u cfg; - int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT; - - cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); - if (cfg.s.enable) - map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc); -} - -static __init void map_mmr_high(int max_pnode) -{ - union uvh_rh_gam_mmr_overlay_config_mmr_u mmr; - int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT; - - mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); - if (mmr.s.enable) - map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); -} - static __init void map_mmioh_high(int max_pnode) { union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; @@ -463,7 +437,7 @@ static void uv_heartbeat(unsigned long ignored) uv_set_scir_bits(bits); /* enable next timer period */ - mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); + mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL); } static void __cpuinit uv_heartbeat_enable(int cpu) @@ -562,18 +536,23 @@ void __init uv_system_init(void) union uvh_node_id_u node_id; unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; - int max_pnode = 0; + int gnode_extra, max_pnode = 0; unsigned long mmr_base, present, paddr; unsigned short pnode_mask; - map_low_mmrs(); - m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; + pnode_mask = (1 << n_val) - 1; + node_id.v = uv_read_local_mmr(UVH_NODE_ID); + gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; + gnode_upper = ((unsigned long)gnode_extra << m_val); + printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n", + n_val, m_val, gnode_upper, gnode_extra); + printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) @@ -583,15 +562,20 @@ void __init uv_system_init(void) bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); uv_blade_info = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!uv_blade_info); + for (blade = 0; blade < uv_num_possible_blades(); blade++) + uv_blade_info[blade].memory_nid = -1; get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); uv_node_to_blade = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!uv_node_to_blade); memset(uv_node_to_blade, 255, bytes); bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus(); uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!uv_cpu_to_blade); memset(uv_cpu_to_blade, 255, bytes); blade = 0; @@ -607,11 +591,6 @@ void __init uv_system_init(void) } } - pnode_mask = (1 << n_val) - 1; - node_id.v = uv_read_local_mmr(UVH_NODE_ID); - gnode_upper = (((unsigned long)node_id.s.node_id) & - ~((1 << n_val) - 1)) << m_val; - uv_bios_init(); uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, &sn_region_size); @@ -624,6 +603,9 @@ void __init uv_system_init(void) lcpu = uv_blade_info[blade].nr_possible_cpus; uv_blade_info[blade].nr_possible_cpus++; + /* Any node on the blade, else will contain -1. */ + uv_blade_info[blade].memory_nid = nid; + uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; uv_cpu_hub_info(cpu)->m_val = m_val; @@ -634,6 +616,7 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; + uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; @@ -656,11 +639,10 @@ void __init uv_system_init(void) pnode = (paddr >> m_val) & pnode_mask; blade = boot_pnode_to_blade(pnode); uv_node_to_blade[nid] = blade; + max_pnode = max(pnode, max_pnode); } map_gru_high(max_pnode); - map_mmr_high(max_pnode); - map_config_high(max_pnode); map_mmioh_high(max_pnode); uv_cpu_init(); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 49e0939bac4..442b5508893 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -811,7 +811,7 @@ static int apm_do_idle(void) u8 ret = 0; int idled = 0; int polling; - int err; + int err = 0; polling = !!(current_thread_info()->status & TS_POLLING); if (polling) { @@ -1233,9 +1233,9 @@ static int suspend(int vetoable) int err; struct apm_user *as; - device_suspend(PMSG_SUSPEND); + dpm_suspend_start(PMSG_SUSPEND); - device_power_down(PMSG_SUSPEND); + dpm_suspend_noirq(PMSG_SUSPEND); local_irq_disable(); sysdev_suspend(PMSG_SUSPEND); @@ -1259,9 +1259,9 @@ static int suspend(int vetoable) sysdev_resume(); local_irq_enable(); - device_power_up(PMSG_RESUME); + dpm_resume_noirq(PMSG_RESUME); - device_resume(PMSG_RESUME); + dpm_resume_end(PMSG_RESUME); queue_event(APM_NORMAL_RESUME, NULL); spin_lock(&user_list_lock); for (as = user_list; as != NULL; as = as->next) { @@ -1277,7 +1277,7 @@ static void standby(void) { int err; - device_power_down(PMSG_SUSPEND); + dpm_suspend_noirq(PMSG_SUSPEND); local_irq_disable(); sysdev_suspend(PMSG_SUSPEND); @@ -1291,7 +1291,7 @@ static void standby(void) sysdev_resume(); local_irq_enable(); - device_power_up(PMSG_RESUME); + dpm_resume_noirq(PMSG_RESUME); } static apm_event_t get_event(void) @@ -1376,7 +1376,7 @@ static void check_events(void) ignore_bounce = 1; if ((event != APM_NORMAL_RESUME) || (ignore_normal_resume == 0)) { - device_resume(PMSG_RESUME); + dpm_resume_end(PMSG_RESUME); queue_event(event, NULL); } ignore_normal_resume = 0; diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 5a6aa1c1162..dfdbf640389 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -126,6 +126,7 @@ void foo(void) #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); + OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); BLANK(); @@ -146,4 +147,5 @@ void foo(void) OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); + OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index e72f062fb4b..898ecc47e12 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -125,6 +125,7 @@ int main(void) OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); + OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); BLANK(); DEFINE(PAGE_SIZE_asm, PAGE_SIZE); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4e242f9a06e..c1f253dac15 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -1,5 +1,5 @@ # -# Makefile for x86-compatible CPU details and quirks +# Makefile for x86-compatible CPU details, features and quirks # # Don't trace early stages of a secondary CPU boot @@ -7,6 +7,10 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_common.o = -pg endif +# Make sure load_percpu_segment has no stackprotector +nostackp := $(call cc-option, -fno-stack-protector) +CFLAGS_common.o := $(nostackp) + obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o obj-y += vmware.o hypervisor.o @@ -23,11 +27,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_X86_MCE) += mcheck/ -obj-$(CONFIG_MTRR) += mtrr/ -obj-$(CONFIG_CPU_FREQ) += cpufreq/ +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o + +obj-$(CONFIG_X86_MCE) += mcheck/ +obj-$(CONFIG_MTRR) += mtrr/ +obj-$(CONFIG_CPU_FREQ) += cpufreq/ -obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o +obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7e4a459daa6..63fddcd082c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -6,6 +6,7 @@ #include <asm/processor.h> #include <asm/apic.h> #include <asm/cpu.h> +#include <asm/pci-direct.h> #ifdef CONFIG_X86_64 # include <asm/numa_64.h> @@ -257,13 +258,15 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_HT unsigned bits; + int cpu = smp_processor_id(); bits = c->x86_coreid_bits; - /* Low order bits define the core id (index of core in socket) */ c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); /* Convert the initial APIC ID into the socket ID */ c->phys_proc_id = c->initial_apicid >> bits; + /* use socket ID also for last level cache */ + per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; #endif } @@ -272,7 +275,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) int cpu = smp_processor_id(); int node; - unsigned apicid = hard_smp_processor_id(); + unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; node = c->phys_proc_id; if (apicid_to_node[apicid] != NUMA_NO_NODE) @@ -351,6 +354,15 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) (c->x86_model == 8 && c->x86_mask >= 8)) set_cpu_cap(c, X86_FEATURE_K6_MTRR); #endif +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) + /* check CPU config space for extended APIC ID */ + if (cpu_has_apic && c->x86 >= 0xf) { + unsigned int val; + val = read_pci_config(0, 24, 0, 0x68); + if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) + set_cpu_cap(c, X86_FEATURE_EXTD_APICID); + } +#endif } static void __cpuinit init_amd(struct cpuinfo_x86 *c) @@ -388,6 +400,13 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) level = cpuid_eax(1); if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* + * Some BIOSes incorrectly force this feature, but only K8 + * revision D (model = 0x14) and later actually support it. + */ + if (c->x86_model < 0x14) + clear_cpu_cap(c, X86_FEATURE_LAHF_LM); } if (c->x86 == 0x10 || c->x86 == 0x11) set_cpu_cap(c, X86_FEATURE_REP_GOOD); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 77848d9fca6..5ce60a88027 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -13,6 +13,7 @@ #include <linux/io.h> #include <asm/stackprotector.h> +#include <asm/perf_counter.h> #include <asm/mmu_context.h> #include <asm/hypervisor.h> #include <asm/processor.h> @@ -58,7 +59,30 @@ void __init setup_cpu_local_masks(void) alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); } -static const struct cpu_dev *this_cpu __cpuinitdata; +static void __cpuinit default_init(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 + display_cacheinfo(c); +#else + /* Not much we can do here... */ + /* Check if at least it has cpuid */ + if (c->cpuid_level == -1) { + /* No cpuid. It must be an ancient CPU */ + if (c->x86 == 4) + strcpy(c->x86_model_id, "486"); + else if (c->x86 == 3) + strcpy(c->x86_model_id, "386"); + } +#endif +} + +static const struct cpu_dev __cpuinitconst default_cpu = { + .c_init = default_init, + .c_vendor = "Unknown", + .c_x86_vendor = X86_VENDOR_UNKNOWN, +}; + +static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { #ifdef CONFIG_X86_64 @@ -107,7 +131,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { /* data */ [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, - [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, + [GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } }, [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, GDT_STACK_CANARY_INIT #endif @@ -299,7 +323,8 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c) return NULL; /* Not found */ } -__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; +__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata; +__u32 cpu_caps_set[NCAPINTS] __cpuinitdata; void load_percpu_segment(int cpu) { @@ -330,29 +355,6 @@ void switch_to_new_gdt(int cpu) static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; -static void __cpuinit default_init(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_X86_64 - display_cacheinfo(c); -#else - /* Not much we can do here... */ - /* Check if at least it has cpuid */ - if (c->cpuid_level == -1) { - /* No cpuid. It must be an ancient CPU */ - if (c->x86 == 4) - strcpy(c->x86_model_id, "486"); - else if (c->x86 == 3) - strcpy(c->x86_model_id, "386"); - } -#endif -} - -static const struct cpu_dev __cpuinitconst default_cpu = { - .c_init = default_init, - .c_vendor = "Unknown", - .c_x86_vendor = X86_VENDOR_UNKNOWN, -}; - static void __cpuinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; @@ -485,7 +487,6 @@ out: static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; - static int printed; int i; for (i = 0; i < X86_VENDOR_NUM; i++) { @@ -502,13 +503,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) } } - if (!printed) { - printed++; - printk(KERN_ERR - "CPU: vendor_id '%s' unknown, using generic init.\n", v); - - printk(KERN_ERR "CPU: Your system may be unstable.\n"); - } + printk_once(KERN_ERR + "CPU: vendor_id '%s' unknown, using generic init.\n" \ + "CPU: Your system may be unstable.\n", v); c->x86_vendor = X86_VENDOR_UNKNOWN; this_cpu = &default_cpu; @@ -768,6 +765,12 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_identify) this_cpu->c_identify(c); + /* Clear/Set all flags overriden by options, after probe */ + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } + #ifdef CONFIG_X86_64 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); #endif @@ -813,6 +816,16 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) #endif init_hypervisor(c); + + /* + * Clear/Set all flags overriden by options, need do it + * before following smp all cpus cap AND. + */ + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } + /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are @@ -825,10 +838,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } - /* Clear all flags overriden by options */ - for (i = 0; i < NCAPINTS; i++) - c->x86_capability[i] &= ~cleared_cpu_caps[i]; - #ifdef CONFIG_X86_MCE /* Init Machine Check Exception if available. */ mcheck_init(c); @@ -861,6 +870,7 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif + init_hw_perf_counters(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 46e29ab96c6..6b2a52dd040 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -32,9 +32,7 @@ static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); -static DEFINE_PER_CPU(unsigned, cpu_modelflag); static DEFINE_PER_CPU(int, cpu_priv_count); -static DEFINE_PER_CPU(unsigned, cpu_model); static DEFINE_MUTEX(cpu_debug_lock); @@ -80,302 +78,102 @@ static struct cpu_file_base cpu_file[] = { { "value", CPU_REG_ALL, 1 }, }; -/* Intel Registers Range */ -static struct cpu_debug_range cpu_intel_range[] = { - { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL }, - { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE }, - { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL }, - { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM }, - { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE }, - { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE }, - - { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE }, - { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON }, - { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON }, - { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE }, - - { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE }, - { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT }, - { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT }, - { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM }, - - { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE }, - { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 }, - { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE }, - { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON }, - - { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT }, - { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT }, - { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT }, - { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE }, - - { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 }, - { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 }, - { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX }, - { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 }, - { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT }, - - { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE }, - { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE }, - { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE }, - { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT }, - { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE }, - { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE }, - { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE }, - { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE }, - - { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT }, - { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON }, - { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE }, - { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON }, - { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE }, - { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 }, - { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE }, - { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 }, - - { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE }, - { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE }, - - { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON }, - { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE }, - { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON }, - { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT }, - { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON }, - { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT }, - { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE }, - { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON }, - - { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON }, - { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON }, - { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON }, - { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE }, - { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON }, - { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE }, - { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE }, - - { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE }, - { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON }, - { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON }, - - { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP }, - - { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON }, - { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON }, - { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON }, - { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON }, +/* CPU Registers Range */ +static struct cpu_debug_range cpu_reg_range[] = { + { 0x00000000, 0x00000001, CPU_MC, }, + { 0x00000006, 0x00000007, CPU_MONITOR, }, + { 0x00000010, 0x00000010, CPU_TIME, }, + { 0x00000011, 0x00000013, CPU_PMC, }, + { 0x00000017, 0x00000017, CPU_PLATFORM, }, + { 0x0000001B, 0x0000001B, CPU_APIC, }, + { 0x0000002A, 0x0000002B, CPU_POWERON, }, + { 0x0000002C, 0x0000002C, CPU_FREQ, }, + { 0x0000003A, 0x0000003A, CPU_CONTROL, }, + { 0x00000040, 0x00000047, CPU_LBRANCH, }, + { 0x00000060, 0x00000067, CPU_LBRANCH, }, + { 0x00000079, 0x00000079, CPU_BIOS, }, + { 0x00000088, 0x0000008A, CPU_CACHE, }, + { 0x0000008B, 0x0000008B, CPU_BIOS, }, + { 0x0000009B, 0x0000009B, CPU_MONITOR, }, + { 0x000000C1, 0x000000C4, CPU_PMC, }, + { 0x000000CD, 0x000000CD, CPU_FREQ, }, + { 0x000000E7, 0x000000E8, CPU_PERF, }, + { 0x000000FE, 0x000000FE, CPU_MTRR, }, + + { 0x00000116, 0x0000011E, CPU_CACHE, }, + { 0x00000174, 0x00000176, CPU_SYSENTER, }, + { 0x00000179, 0x0000017B, CPU_MC, }, + { 0x00000186, 0x00000189, CPU_PMC, }, + { 0x00000198, 0x00000199, CPU_PERF, }, + { 0x0000019A, 0x0000019A, CPU_TIME, }, + { 0x0000019B, 0x0000019D, CPU_THERM, }, + { 0x000001A0, 0x000001A0, CPU_MISC, }, + { 0x000001C9, 0x000001C9, CPU_LBRANCH, }, + { 0x000001D7, 0x000001D8, CPU_LBRANCH, }, + { 0x000001D9, 0x000001D9, CPU_DEBUG, }, + { 0x000001DA, 0x000001E0, CPU_LBRANCH, }, + + { 0x00000200, 0x0000020F, CPU_MTRR, }, + { 0x00000250, 0x00000250, CPU_MTRR, }, + { 0x00000258, 0x00000259, CPU_MTRR, }, + { 0x00000268, 0x0000026F, CPU_MTRR, }, + { 0x00000277, 0x00000277, CPU_PAT, }, + { 0x000002FF, 0x000002FF, CPU_MTRR, }, + + { 0x00000300, 0x00000311, CPU_PMC, }, + { 0x00000345, 0x00000345, CPU_PMC, }, + { 0x00000360, 0x00000371, CPU_PMC, }, + { 0x0000038D, 0x00000390, CPU_PMC, }, + { 0x000003A0, 0x000003BE, CPU_PMC, }, + { 0x000003C0, 0x000003CD, CPU_PMC, }, + { 0x000003E0, 0x000003E1, CPU_PMC, }, + { 0x000003F0, 0x000003F2, CPU_PMC, }, + + { 0x00000400, 0x00000417, CPU_MC, }, + { 0x00000480, 0x0000048B, CPU_VMX, }, + + { 0x00000600, 0x00000600, CPU_DEBUG, }, + { 0x00000680, 0x0000068F, CPU_LBRANCH, }, + { 0x000006C0, 0x000006CF, CPU_LBRANCH, }, + + { 0x000107CC, 0x000107D3, CPU_PMC, }, + + { 0xC0000080, 0xC0000080, CPU_FEATURES, }, + { 0xC0000081, 0xC0000084, CPU_CALL, }, + { 0xC0000100, 0xC0000102, CPU_BASE, }, + { 0xC0000103, 0xC0000103, CPU_TIME, }, + + { 0xC0010000, 0xC0010007, CPU_PMC, }, + { 0xC0010010, 0xC0010010, CPU_CONF, }, + { 0xC0010015, 0xC0010015, CPU_CONF, }, + { 0xC0010016, 0xC001001A, CPU_MTRR, }, + { 0xC001001D, 0xC001001D, CPU_MTRR, }, + { 0xC001001F, 0xC001001F, CPU_CONF, }, + { 0xC0010030, 0xC0010035, CPU_BIOS, }, + { 0xC0010044, 0xC0010048, CPU_MC, }, + { 0xC0010050, 0xC0010056, CPU_SMM, }, + { 0xC0010058, 0xC0010058, CPU_CONF, }, + { 0xC0010060, 0xC0010060, CPU_CACHE, }, + { 0xC0010061, 0xC0010068, CPU_SMM, }, + { 0xC0010069, 0xC001006B, CPU_SMM, }, + { 0xC0010070, 0xC0010071, CPU_SMM, }, + { 0xC0010111, 0xC0010113, CPU_SMM, }, + { 0xC0010114, 0xC0010118, CPU_SVM, }, + { 0xC0010140, 0xC0010141, CPU_OSVM, }, + { 0xC0011022, 0xC0011023, CPU_CONF, }, }; -/* AMD Registers Range */ -static struct cpu_debug_range cpu_amd_range[] = { - { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, }, - { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, }, - { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, }, - { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS }, - { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS }, - { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, }, - - { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, }, - { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, }, - { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, }, - { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, }, - - { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, }, - { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, }, - - { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, }, - - { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, }, - { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, }, - { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, }, - { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, }, - - { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, }, - { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, }, - { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, }, - { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, }, - { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, }, - { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, }, - { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, }, - { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, }, - { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, }, - { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, }, - { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, }, - { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, }, - { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, }, - { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, }, - { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, }, - { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, }, - { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, }, - { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, }, -}; - - -/* Intel */ -static int get_intel_modelflag(unsigned model) -{ - int flag; - - switch (model) { - case 0x0501: - case 0x0502: - case 0x0504: - flag = CPU_INTEL_PENTIUM; - break; - case 0x0601: - case 0x0603: - case 0x0605: - case 0x0607: - case 0x0608: - case 0x060A: - case 0x060B: - flag = CPU_INTEL_P6; - break; - case 0x0609: - case 0x060D: - flag = CPU_INTEL_PENTIUM_M; - break; - case 0x060E: - flag = CPU_INTEL_CORE; - break; - case 0x060F: - case 0x0617: - flag = CPU_INTEL_CORE2; - break; - case 0x061C: - flag = CPU_INTEL_ATOM; - break; - case 0x0F00: - case 0x0F01: - case 0x0F02: - case 0x0F03: - case 0x0F04: - flag = CPU_INTEL_XEON_P4; - break; - case 0x0F06: - flag = CPU_INTEL_XEON_MP; - break; - default: - flag = CPU_NONE; - break; - } - - return flag; -} - -/* AMD */ -static int get_amd_modelflag(unsigned model) -{ - int flag; - - switch (model >> 8) { - case 0x6: - flag = CPU_AMD_K6; - break; - case 0x7: - flag = CPU_AMD_K7; - break; - case 0x8: - flag = CPU_AMD_K8; - break; - case 0xf: - flag = CPU_AMD_0F; - break; - case 0x10: - flag = CPU_AMD_10; - break; - case 0x11: - flag = CPU_AMD_11; - break; - default: - flag = CPU_NONE; - break; - } - - return flag; -} - -static int get_cpu_modelflag(unsigned cpu) -{ - int flag; - - flag = per_cpu(cpu_model, cpu); - - switch (flag >> 16) { - case X86_VENDOR_INTEL: - flag = get_intel_modelflag(flag); - break; - case X86_VENDOR_AMD: - flag = get_amd_modelflag(flag & 0xffff); - break; - default: - flag = CPU_NONE; - break; - } - - return flag; -} - -static int get_cpu_range_count(unsigned cpu) -{ - int index; - - switch (per_cpu(cpu_model, cpu) >> 16) { - case X86_VENDOR_INTEL: - index = ARRAY_SIZE(cpu_intel_range); - break; - case X86_VENDOR_AMD: - index = ARRAY_SIZE(cpu_amd_range); - break; - default: - index = 0; - break; - } - - return index; -} - static int is_typeflag_valid(unsigned cpu, unsigned flag) { - unsigned vendor, modelflag; - int i, index; + int i; /* Standard Registers should be always valid */ if (flag >= CPU_TSS) return 1; - modelflag = per_cpu(cpu_modelflag, cpu); - vendor = per_cpu(cpu_model, cpu) >> 16; - index = get_cpu_range_count(cpu); - - for (i = 0; i < index; i++) { - switch (vendor) { - case X86_VENDOR_INTEL: - if ((cpu_intel_range[i].model & modelflag) && - (cpu_intel_range[i].flag & flag)) - return 1; - break; - case X86_VENDOR_AMD: - if ((cpu_amd_range[i].model & modelflag) && - (cpu_amd_range[i].flag & flag)) - return 1; - break; - } + for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { + if (cpu_reg_range[i].flag == flag) + return 1; } /* Invalid */ @@ -385,26 +183,11 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag) static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, int index, unsigned flag) { - unsigned modelflag; - - modelflag = per_cpu(cpu_modelflag, cpu); - *max = 0; - switch (per_cpu(cpu_model, cpu) >> 16) { - case X86_VENDOR_INTEL: - if ((cpu_intel_range[index].model & modelflag) && - (cpu_intel_range[index].flag & flag)) { - *min = cpu_intel_range[index].min; - *max = cpu_intel_range[index].max; - } - break; - case X86_VENDOR_AMD: - if ((cpu_amd_range[index].model & modelflag) && - (cpu_amd_range[index].flag & flag)) { - *min = cpu_amd_range[index].min; - *max = cpu_amd_range[index].max; - } - break; - } + if (cpu_reg_range[index].flag == flag) { + *min = cpu_reg_range[index].min; + *max = cpu_reg_range[index].max; + } else + *max = 0; return *max; } @@ -434,7 +217,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag) unsigned msr, msr_min, msr_max; struct cpu_private *priv; u32 low, high; - int i, range; + int i; if (seq) { priv = seq->private; @@ -446,9 +229,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag) } } - range = get_cpu_range_count(cpu); - - for (i = 0; i < range; i++) { + for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag)) continue; @@ -588,8 +369,20 @@ static void print_apic(void *arg) seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT)); seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT)); seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR)); -#endif /* CONFIG_X86_LOCAL_APIC */ + if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { + unsigned int i, v, maxeilvt; + + v = apic_read(APIC_EFEAT); + maxeilvt = (v >> 16) & 0xff; + seq_printf(seq, " EFEAT\t\t: %08x\n", v); + seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL)); + for (i = 0; i < maxeilvt; i++) { + v = apic_read(APIC_EILVTn(i)); + seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v); + } + } +#endif /* CONFIG_X86_LOCAL_APIC */ seq_printf(seq, "\n MSR\t:\n"); } @@ -788,13 +581,11 @@ static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry) { struct dentry *cpu_dentry = NULL; unsigned reg, reg_min, reg_max; - int i, range, err = 0; + int i, err = 0; char reg_dir[12]; u32 low, high; - range = get_cpu_range_count(cpu); - - for (i = 0; i < range; i++) { + for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { if (!get_cpu_range(cpu, ®_min, ®_max, i, cpu_base[type].flag)) continue; @@ -850,10 +641,6 @@ static int cpu_init_cpu(void) cpui = &cpu_data(cpu); if (!cpu_has(cpui, X86_FEATURE_MSR)) continue; - per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) | - (cpui->x86 << 8) | - (cpui->x86_model)); - per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu); sprintf(cpu_dir, "cpu%d", cpu); cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index 52c83987547..f138c6c389b 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -220,11 +220,14 @@ config X86_LONGHAUL If in doubt, say N. config X86_E_POWERSAVER - tristate "VIA C7 Enhanced PowerSaver" + tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)" select CPU_FREQ_TABLE - depends on X86_32 + depends on X86_32 && EXPERIMENTAL help - This adds the CPUFreq driver for VIA C7 processors. + This adds the CPUFreq driver for VIA C7 processors. However, this driver + does not have any safeguards to prevent operating the CPU out of spec + and is thus considered dangerous. Please use the regular ACPI cpufreq + driver, enabled by CONFIG_X86_ACPI_CPUFREQ. If in doubt, say N. diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 752e8c6b2c7..ae9b503220c 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -90,11 +90,7 @@ static int check_est_cpu(unsigned int cpuid) { struct cpuinfo_x86 *cpu = &cpu_data(cpuid); - if (cpu->x86_vendor != X86_VENDOR_INTEL || - !cpu_has(cpu, X86_FEATURE_EST)) - return 0; - - return 1; + return cpu_has(cpu, X86_FEATURE_EST); } static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index cf52215d9eb..2a50ef89100 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1,3 +1,4 @@ + /* * (c) 2003-2006 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the @@ -117,20 +118,17 @@ static int query_current_values_with_pending_wait(struct powernow_k8_data *data) u32 i = 0; if (cpu_family == CPU_HW_PSTATE) { - if (data->currpstate == HW_PSTATE_INVALID) { - /* read (initial) hw pstate if not yet set */ - rdmsr(MSR_PSTATE_STATUS, lo, hi); - i = lo & HW_PSTATE_MASK; - - /* - * a workaround for family 11h erratum 311 might cause - * an "out-of-range Pstate if the core is in Pstate-0 - */ - if (i >= data->numps) - data->currpstate = HW_PSTATE_0; - else - data->currpstate = i; - } + rdmsr(MSR_PSTATE_STATUS, lo, hi); + i = lo & HW_PSTATE_MASK; + data->currpstate = i; + + /* + * a workaround for family 11h erratum 311 might cause + * an "out-of-range Pstate if the core is in Pstate-0 + */ + if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps)) + data->currpstate = HW_PSTATE_0; + return 0; } do { @@ -301,7 +299,7 @@ static int transition_pstate(struct powernow_k8_data *data, u32 pstate) static int transition_fid_vid(struct powernow_k8_data *data, u32 reqfid, u32 reqvid) { - if (core_voltage_pre_transition(data, reqvid)) + if (core_voltage_pre_transition(data, reqvid, reqfid)) return 1; if (core_frequency_transition(data, reqfid)) @@ -329,17 +327,20 @@ static int transition_fid_vid(struct powernow_k8_data *data, /* Phase 1 - core voltage transition ... setup voltage */ static int core_voltage_pre_transition(struct powernow_k8_data *data, - u32 reqvid) + u32 reqvid, u32 reqfid) { u32 rvosteps = data->rvo; u32 savefid = data->currfid; - u32 maxvid, lo; + u32 maxvid, lo, rvomult = 1; dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " "reqvid 0x%x, rvo 0x%x\n", smp_processor_id(), data->currfid, data->currvid, reqvid, data->rvo); + if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP)) + rvomult = 2; + rvosteps *= rvomult; rdmsr(MSR_FIDVID_STATUS, lo, maxvid); maxvid = 0x1f & (maxvid >> 16); dprintk("ph1 maxvid=0x%x\n", maxvid); @@ -353,7 +354,8 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, return 1; } - while ((rvosteps > 0) && ((data->rvo + data->currvid) > reqvid)) { + while ((rvosteps > 0) && + ((rvomult * data->rvo + data->currvid) > reqvid)) { if (data->currvid == maxvid) { rvosteps = 0; } else { @@ -386,13 +388,6 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) u32 vcoreqfid, vcocurrfid, vcofiddiff; u32 fid_interval, savevid = data->currvid; - if ((reqfid < HI_FID_TABLE_BOTTOM) && - (data->currfid < HI_FID_TABLE_BOTTOM)) { - printk(KERN_ERR PFX "ph2: illegal lo-lo transition " - "0x%x 0x%x\n", reqfid, data->currfid); - return 1; - } - if (data->currfid == reqfid) { printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", data->currfid); @@ -409,6 +404,9 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid : vcoreqfid - vcocurrfid; + if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP)) + vcofiddiff = 0; + while (vcofiddiff > 2) { (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2); @@ -510,41 +508,34 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, return 0; } -static int check_supported_cpu(unsigned int cpu) +static void check_supported_cpu(void *_rc) { - cpumask_t oldmask; u32 eax, ebx, ecx, edx; - unsigned int rc = 0; + int *rc = _rc; - oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - - if (smp_processor_id() != cpu) { - printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu); - goto out; - } + *rc = -ENODEV; if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) - goto out; + return; eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) && ((eax & CPUID_XFAM) < CPUID_XFAM_10H)) - goto out; + return; if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) { if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) || ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) { printk(KERN_INFO PFX "Processor cpuid %x not supported\n", eax); - goto out; + return; } eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES); if (eax < CPUID_FREQ_VOLT_CAPABILITIES) { printk(KERN_INFO PFX "No frequency change capabilities detected\n"); - goto out; + return; } cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); @@ -552,21 +543,17 @@ static int check_supported_cpu(unsigned int cpu) != P_STATE_TRANSITION_CAPABLE) { printk(KERN_INFO PFX "Power state transitions not supported\n"); - goto out; + return; } } else { /* must be a HW Pstate capable processor */ cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE) cpu_family = CPU_HW_PSTATE; else - goto out; + return; } - rc = 1; - -out: - set_cpus_allowed_ptr(current, &oldmask); - return rc; + *rc = 0; } static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, @@ -823,13 +810,14 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) return; - control = data->acpi_data.states[index].control; data->irt = (control - >> IRT_SHIFT) & IRT_MASK; data->rvo = (control >> - RVO_SHIFT) & RVO_MASK; data->exttype = (control - >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; - data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; data->vidmvs = 1 - << ((control >> MVS_SHIFT) & MVS_MASK); data->vstable = - (control >> VST_SHIFT) & VST_MASK; } + control = data->acpi_data.states[index].control; + data->irt = (control >> IRT_SHIFT) & IRT_MASK; + data->rvo = (control >> RVO_SHIFT) & RVO_MASK; + data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; + data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; + data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK); + data->vstable = (control >> VST_SHIFT) & VST_MASK; +} static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { @@ -1046,6 +1034,19 @@ static int get_transition_latency(struct powernow_k8_data *data) if (cur_latency > max_latency) max_latency = cur_latency; } + if (max_latency == 0) { + /* + * Fam 11h always returns 0 as transition latency. + * This is intended and means "very fast". While cpufreq core + * and governors currently can handle that gracefully, better + * set it to 1 to avoid problems in the future. + * For all others it's a BIOS bug. + */ + if (!boot_cpu_data.x86 == 0x11) + printk(KERN_ERR FW_WARN PFX "Invalid zero transition " + "latency\n"); + max_latency = 1; + } /* value in usecs, needs to be in nanoseconds */ return 1000 * max_latency; } @@ -1080,20 +1081,12 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, return 0; } - if ((fid < HI_FID_TABLE_BOTTOM) && - (data->currfid < HI_FID_TABLE_BOTTOM)) { - printk(KERN_ERR PFX - "ignoring illegal change in lo freq table-%x to 0x%x\n", - data->currfid, fid); - return 1; - } - dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n", smp_processor_id(), fid, vid); freqs.old = find_khz_freq_from_fid(data->currfid); freqs.new = find_khz_freq_from_fid(fid); - for_each_cpu_mask_nr(i, *(data->available_cores)) { + for_each_cpu(i, data->available_cores) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -1101,7 +1094,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, res = transition_fid_vid(data, fid, vid); freqs.new = find_khz_freq_from_fid(data->currfid); - for_each_cpu_mask_nr(i, *(data->available_cores)) { + for_each_cpu(i, data->available_cores) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -1126,7 +1119,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, data->currpstate); freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - for_each_cpu_mask_nr(i, *(data->available_cores)) { + for_each_cpu(i, data->available_cores) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -1134,7 +1127,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, res = transition_pstate(data, pstate); freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - for_each_cpu_mask_nr(i, *(data->available_cores)) { + for_each_cpu(i, data->available_cores) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -1235,21 +1228,47 @@ static int powernowk8_verify(struct cpufreq_policy *pol) return cpufreq_frequency_table_verify(pol, data->powernow_table); } -static const char ACPI_PSS_BIOS_BUG_MSG[] = - KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" - KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n"; +struct init_on_cpu { + struct powernow_k8_data *data; + int rc; +}; + +static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu) +{ + struct init_on_cpu *init_on_cpu = _init_on_cpu; + + if (pending_bit_stuck()) { + printk(KERN_ERR PFX "failing init, change pending bit set\n"); + init_on_cpu->rc = -ENODEV; + return; + } + + if (query_current_values_with_pending_wait(init_on_cpu->data)) { + init_on_cpu->rc = -ENODEV; + return; + } + + if (cpu_family == CPU_OPTERON) + fidvid_msr_init(); + + init_on_cpu->rc = 0; +} /* per CPU init entry point to the driver */ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) { + static const char ACPI_PSS_BIOS_BUG_MSG[] = + KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" + FW_BUG PFX "Try again with latest BIOS.\n"; struct powernow_k8_data *data; - cpumask_t oldmask; + struct init_on_cpu init_on_cpu; int rc; if (!cpu_online(pol->cpu)) return -ENODEV; - if (!check_supported_cpu(pol->cpu)) + smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1); + if (rc) return -ENODEV; data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL); @@ -1289,27 +1308,12 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) pol->cpuinfo.transition_latency = get_transition_latency(data); /* only run on specific CPU from here on */ - oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); - - if (smp_processor_id() != pol->cpu) { - printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); - goto err_out_unmask; - } - - if (pending_bit_stuck()) { - printk(KERN_ERR PFX "failing init, change pending bit set\n"); - goto err_out_unmask; - } - - if (query_current_values_with_pending_wait(data)) - goto err_out_unmask; - - if (cpu_family == CPU_OPTERON) - fidvid_msr_init(); - - /* run on any CPU again */ - set_cpus_allowed_ptr(current, &oldmask); + init_on_cpu.data = data; + smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu, + &init_on_cpu, 1); + rc = init_on_cpu.rc; + if (rc != 0) + goto err_out_exit_acpi; if (cpu_family == CPU_HW_PSTATE) cpumask_copy(pol->cpus, cpumask_of(pol->cpu)); @@ -1346,8 +1350,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) return 0; -err_out_unmask: - set_cpus_allowed_ptr(current, &oldmask); +err_out_exit_acpi: powernow_k8_cpu_exit_acpi(data); err_out: @@ -1372,28 +1375,25 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol) return 0; } +static void query_values_on_cpu(void *_err) +{ + int *err = _err; + struct powernow_k8_data *data = __get_cpu_var(powernow_data); + + *err = query_current_values_with_pending_wait(data); +} + static unsigned int powernowk8_get(unsigned int cpu) { - struct powernow_k8_data *data; - cpumask_t oldmask = current->cpus_allowed; + struct powernow_k8_data *data = per_cpu(powernow_data, cpu); unsigned int khz = 0; - unsigned int first; - - first = cpumask_first(cpu_core_mask(cpu)); - data = per_cpu(powernow_data, first); + int err; if (!data) return -EINVAL; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - if (smp_processor_id() != cpu) { - printk(KERN_ERR PFX - "limiting to CPU %d failed in powernowk8_get\n", cpu); - set_cpus_allowed_ptr(current, &oldmask); - return 0; - } - - if (query_current_values_with_pending_wait(data)) + smp_call_function_single(cpu, query_values_on_cpu, &err, true); + if (err) goto out; if (cpu_family == CPU_HW_PSTATE) @@ -1404,7 +1404,6 @@ static unsigned int powernowk8_get(unsigned int cpu) out: - set_cpus_allowed_ptr(current, &oldmask); return khz; } @@ -1430,7 +1429,9 @@ static int __cpuinit powernowk8_init(void) unsigned int i, supported_cpus = 0; for_each_online_cpu(i) { - if (check_supported_cpu(i)) + int rc; + smp_call_function_single(i, check_supported_cpu, &rc, 1); + if (rc == 0) supported_cpus++; } diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h index 6c6698feade..02ce824073c 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h @@ -215,7 +215,8 @@ struct pst_s { #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg) -static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid); +static int core_voltage_pre_transition(struct powernow_k8_data *data, + u32 reqvid, u32 regfid); static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid); @@ -223,14 +224,3 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); - -#ifdef CONFIG_SMP -static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[]) -{ -} -#else -static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[]) -{ - cpu_set(0, cpu_sharedcore_mask[0]); -} -#endif diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 55c831ed71c..8d672ef162c 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -323,14 +323,8 @@ static unsigned int get_cur_freq(unsigned int cpu) { unsigned l, h; unsigned clock_freq; - cpumask_t saved_mask; - saved_mask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - if (smp_processor_id() != cpu) - return 0; - - rdmsr(MSR_IA32_PERF_STATUS, l, h); + rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h); clock_freq = extract_clock(l, cpu, 0); if (unlikely(clock_freq == 0)) { @@ -340,11 +334,9 @@ static unsigned int get_cur_freq(unsigned int cpu) * P-state transition (like TM2). Get the last freq set * in PERF_CTL. */ - rdmsr(MSR_IA32_PERF_CTL, l, h); + rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h); clock_freq = extract_clock(l, cpu, 1); } - - set_cpus_allowed_ptr(current, &saved_mask); return clock_freq; } @@ -467,15 +459,10 @@ static int centrino_target (struct cpufreq_policy *policy, struct cpufreq_freqs freqs; int retval = 0; unsigned int j, k, first_cpu, tmp; - cpumask_var_t saved_mask, covered_cpus; + cpumask_var_t covered_cpus; - if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL))) - return -ENOMEM; - if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) { - free_cpumask_var(saved_mask); + if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) return -ENOMEM; - } - cpumask_copy(saved_mask, ¤t->cpus_allowed); if (unlikely(per_cpu(centrino_model, cpu) == NULL)) { retval = -ENODEV; @@ -493,7 +480,7 @@ static int centrino_target (struct cpufreq_policy *policy, first_cpu = 1; for_each_cpu(j, policy->cpus) { - const struct cpumask *mask; + int good_cpu; /* cpufreq holds the hotplug lock, so we are safe here */ if (!cpu_online(j)) @@ -504,32 +491,30 @@ static int centrino_target (struct cpufreq_policy *policy, * Make sure we are running on CPU that wants to change freq */ if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) - mask = policy->cpus; + good_cpu = cpumask_any_and(policy->cpus, + cpu_online_mask); else - mask = cpumask_of(j); + good_cpu = j; - set_cpus_allowed_ptr(current, mask); - preempt_disable(); - if (unlikely(!cpu_isset(smp_processor_id(), *mask))) { + if (good_cpu >= nr_cpu_ids) { dprintk("couldn't limit to CPUs in this domain\n"); retval = -EAGAIN; if (first_cpu) { /* We haven't started the transition yet. */ - goto migrate_end; + goto out; } - preempt_enable(); break; } msr = per_cpu(centrino_model, cpu)->op_points[newstate].index; if (first_cpu) { - rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); + rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h); if (msr == (oldmsr & 0xffff)) { dprintk("no change needed - msr was and needs " "to be %x\n", oldmsr); retval = 0; - goto migrate_end; + goto out; } freqs.old = extract_clock(oldmsr, cpu, 0); @@ -553,14 +538,11 @@ static int centrino_target (struct cpufreq_policy *policy, oldmsr |= msr; } - wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { - preempt_enable(); + wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h); + if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) break; - } - cpu_set(j, *covered_cpus); - preempt_enable(); + cpumask_set_cpu(j, covered_cpus); } for_each_cpu(k, policy->cpus) { @@ -578,10 +560,8 @@ static int centrino_target (struct cpufreq_policy *policy, * Best effort undo.. */ - for_each_cpu_mask_nr(j, *covered_cpus) { - set_cpus_allowed_ptr(current, &cpumask_of_cpu(j)); - wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); - } + for_each_cpu(j, covered_cpus) + wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h); tmp = freqs.new; freqs.new = freqs.old; @@ -593,15 +573,9 @@ static int centrino_target (struct cpufreq_policy *policy, cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } } - set_cpus_allowed_ptr(current, saved_mask); retval = 0; - goto out; -migrate_end: - preempt_enable(); - set_cpus_allowed_ptr(current, saved_mask); out: - free_cpumask_var(saved_mask); free_cpumask_var(covered_cpus); return retval; } diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 016c1a4fa3f..6911e91fb4f 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -89,7 +89,8 @@ static int speedstep_find_register(void) * speedstep_set_state - set the SpeedStep state * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) * - * Tries to change the SpeedStep state. + * Tries to change the SpeedStep state. Can be called from + * smp_call_function_single. */ static void speedstep_set_state(unsigned int state) { @@ -143,6 +144,11 @@ static void speedstep_set_state(unsigned int state) return; } +/* Wrapper for smp_call_function_single. */ +static void _speedstep_set_state(void *_state) +{ + speedstep_set_state(*(unsigned int *)_state); +} /** * speedstep_activate - activate SpeedStep control in the chipset @@ -226,22 +232,28 @@ static unsigned int speedstep_detect_chipset(void) return 0; } -static unsigned int _speedstep_get(const struct cpumask *cpus) -{ +struct get_freq_data { unsigned int speed; - cpumask_t cpus_allowed; - - cpus_allowed = current->cpus_allowed; - set_cpus_allowed_ptr(current, cpus); - speed = speedstep_get_frequency(speedstep_processor); - set_cpus_allowed_ptr(current, &cpus_allowed); - dprintk("detected %u kHz as current frequency\n", speed); - return speed; + unsigned int processor; +}; + +static void get_freq_data(void *_data) +{ + struct get_freq_data *data = _data; + + data->speed = speedstep_get_frequency(data->processor); } static unsigned int speedstep_get(unsigned int cpu) { - return _speedstep_get(cpumask_of(cpu)); + struct get_freq_data data = { .processor = cpu }; + + /* You're supposed to ensure CPU is online. */ + if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0) + BUG(); + + dprintk("detected %u kHz as current frequency\n", data.speed); + return data.speed; } /** @@ -257,16 +269,16 @@ static int speedstep_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { - unsigned int newstate = 0; + unsigned int newstate = 0, policy_cpu; struct cpufreq_freqs freqs; - cpumask_t cpus_allowed; int i; if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate)) return -EINVAL; - freqs.old = _speedstep_get(policy->cpus); + policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); + freqs.old = speedstep_get(policy_cpu); freqs.new = speedstep_freqs[newstate].frequency; freqs.cpu = policy->cpu; @@ -276,20 +288,13 @@ static int speedstep_target(struct cpufreq_policy *policy, if (freqs.old == freqs.new) return 0; - cpus_allowed = current->cpus_allowed; - for_each_cpu(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } - /* switch to physical CPU where state is to be changed */ - set_cpus_allowed_ptr(current, policy->cpus); - - speedstep_set_state(newstate); - - /* allow to be run on all CPUs */ - set_cpus_allowed_ptr(current, &cpus_allowed); + smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate, + true); for_each_cpu(i, policy->cpus) { freqs.cpu = i; @@ -312,33 +317,43 @@ static int speedstep_verify(struct cpufreq_policy *policy) return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); } +struct get_freqs { + struct cpufreq_policy *policy; + int ret; +}; + +static void get_freqs_on_cpu(void *_get_freqs) +{ + struct get_freqs *get_freqs = _get_freqs; + + get_freqs->ret = + speedstep_get_freqs(speedstep_processor, + &speedstep_freqs[SPEEDSTEP_LOW].frequency, + &speedstep_freqs[SPEEDSTEP_HIGH].frequency, + &get_freqs->policy->cpuinfo.transition_latency, + &speedstep_set_state); +} static int speedstep_cpu_init(struct cpufreq_policy *policy) { - int result = 0; - unsigned int speed; - cpumask_t cpus_allowed; + int result; + unsigned int policy_cpu, speed; + struct get_freqs gf; /* only run on CPU to be set, or on its sibling */ #ifdef CONFIG_SMP cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); #endif - - cpus_allowed = current->cpus_allowed; - set_cpus_allowed_ptr(current, policy->cpus); + policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); /* detect low and high frequency and transition latency */ - result = speedstep_get_freqs(speedstep_processor, - &speedstep_freqs[SPEEDSTEP_LOW].frequency, - &speedstep_freqs[SPEEDSTEP_HIGH].frequency, - &policy->cpuinfo.transition_latency, - &speedstep_set_state); - set_cpus_allowed_ptr(current, &cpus_allowed); - if (result) - return result; + gf.policy = policy; + smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1); + if (gf.ret) + return gf.ret; /* get current speed setting */ - speed = _speedstep_get(policy->cpus); + speed = speedstep_get(policy_cpu); if (!speed) return -EIO; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c index 2e3c6862657..f4c290b8482 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c @@ -226,6 +226,7 @@ static unsigned int pentium4_get_frequency(void) } +/* Warning: may get called from smp_call_function_single. */ unsigned int speedstep_get_frequency(unsigned int processor) { switch (processor) { diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 7437fa133c0..3260ab04499 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -86,6 +86,29 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) */ if (c->x86 == 6 && c->x86_model < 15) clear_cpu_cap(c, X86_FEATURE_PAT); + +#ifdef CONFIG_KMEMCHECK + /* + * P4s have a "fast strings" feature which causes single- + * stepping REP instructions to only generate a #DB on + * cache-line boundaries. + * + * Ingo Molnar reported a Pentium D (model 6) and a Xeon + * (model 2) with the same problem. + */ + if (c->x86 == 15) { + u64 misc_enable; + + rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + + if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { + printk(KERN_INFO "kmemcheck: Disabling fast string operations\n"); + + misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; + wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + } + } +#endif } #ifdef CONFIG_X86_32 @@ -229,12 +252,12 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) } #endif -static void __cpuinit srat_detect_node(void) +static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) unsigned node; int cpu = smp_processor_id(); - int apicid = hard_smp_processor_id(); + int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; /* Don't do the funky fallback heuristics the AMD version employs for now. */ @@ -400,7 +423,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) } /* Work around errata */ - srat_detect_node(); + srat_detect_node(c); if (cpu_has(c, X86_FEATURE_VMX)) detect_vmx_virtcap(c); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 483eda96e10..789efe217e1 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -17,6 +17,7 @@ #include <asm/processor.h> #include <asm/smp.h> +#include <asm/k8.h> #define LVL_1_INST 1 #define LVL_1_DATA 2 @@ -159,14 +160,6 @@ struct _cpuid4_info_regs { unsigned long can_disable; }; -#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS) -static struct pci_device_id k8_nb_id[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, - {} -}; -#endif - unsigned short num_cache_leaves; /* AMD doesn't have CPUID4. Emulate it here to report the same @@ -207,10 +200,17 @@ union l3_cache { }; static const unsigned short __cpuinitconst assocs[] = { - [1] = 1, [2] = 2, [4] = 4, [6] = 8, - [8] = 16, [0xa] = 32, [0xb] = 48, + [1] = 1, + [2] = 2, + [4] = 4, + [6] = 8, + [8] = 16, + [0xa] = 32, + [0xb] = 48, [0xc] = 64, - [0xf] = 0xffff // ?? + [0xd] = 96, + [0xe] = 128, + [0xf] = 0xffff /* fully associative - no way to show this currently */ }; static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 }; @@ -271,7 +271,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, eax->split.type = types[leaf]; eax->split.level = levels[leaf]; if (leaf == 3) - eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1; + eax->split.num_threads_sharing = + current_cpu_data.x86_max_cores - 1; else eax->split.num_threads_sharing = 0; eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; @@ -291,6 +292,14 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) { if (index < 3) return; + + if (boot_cpu_data.x86 == 0x11) + return; + + /* see erratum #382 */ + if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) + return; + this_leaf->can_disable = 1; } @@ -696,97 +705,75 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) #define to_object(k) container_of(k, struct _index_kobject, kobj) #define to_attr(a) container_of(a, struct _cache_attr, attr) -#ifdef CONFIG_PCI -static struct pci_dev *get_k8_northbridge(int node) -{ - struct pci_dev *dev = NULL; - int i; - - for (i = 0; i <= node; i++) { - do { - dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); - if (!dev) - break; - } while (!pci_match_id(&k8_nb_id[0], dev)); - if (!dev) - break; - } - return dev; -} -#else -static struct pci_dev *get_k8_northbridge(int node) -{ - return NULL; -} -#endif - -static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) +static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, + unsigned int index) { - const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); - int node = cpu_to_node(cpumask_first(mask)); - struct pci_dev *dev = NULL; - ssize_t ret = 0; - int i; + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = cpu_to_node(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned int reg = 0; if (!this_leaf->can_disable) - return sprintf(buf, "Feature not enabled\n"); - - dev = get_k8_northbridge(node); - if (!dev) { - printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); return -EINVAL; - } - for (i = 0; i < 2; i++) { - unsigned int reg; + if (!dev) + return -EINVAL; - pci_read_config_dword(dev, 0x1BC + i * 4, ®); + pci_read_config_dword(dev, 0x1BC + index * 4, ®); + return sprintf(buf, "%x\n", reg); +} - ret += sprintf(buf, "%sEntry: %d\n", buf, i); - ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n", - buf, - reg & 0x80000000 ? "Disabled" : "Allowed", - reg & 0x40000000 ? "Disabled" : "Allowed"); - ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n", - buf, (reg & 0x30000) >> 16, reg & 0xfff); - } - return ret; +#define SHOW_CACHE_DISABLE(index) \ +static ssize_t \ +show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ +{ \ + return show_cache_disable(this_leaf, buf, index); \ } +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) -static ssize_t -store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, - size_t count) +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, + const char *buf, size_t count, unsigned int index) { - const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); - int node = cpu_to_node(cpumask_first(mask)); - struct pci_dev *dev = NULL; - unsigned int ret, index, val; + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = cpu_to_node(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned long val = 0; + unsigned int scrubber = 0; if (!this_leaf->can_disable) - return 0; - - if (strlen(buf) > 15) return -EINVAL; - ret = sscanf(buf, "%x %x", &index, &val); - if (ret != 2) + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!dev) return -EINVAL; - if (index > 1) + + if (strict_strtoul(buf, 10, &val) < 0) return -EINVAL; val |= 0xc0000000; - dev = get_k8_northbridge(node); - if (!dev) { - printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); - return -EINVAL; - } + + pci_read_config_dword(dev, 0x58, &scrubber); + scrubber &= ~0x1f000000; + pci_write_config_dword(dev, 0x58, scrubber); pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); wbinvd(); pci_write_config_dword(dev, 0x1BC + index * 4, val); + return count; +} - return 1; +#define STORE_CACHE_DISABLE(index) \ +static ssize_t \ +store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ + const char *buf, size_t count) \ +{ \ + return store_cache_disable(this_leaf, buf, count, index); \ } +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1) struct _cache_attr { struct attribute attr; @@ -808,7 +795,10 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); -static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable); +static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, + show_cache_disable_0, store_cache_disable_0); +static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, + show_cache_disable_1, store_cache_disable_1); static struct attribute * default_attrs[] = { &type.attr, @@ -820,7 +810,8 @@ static struct attribute * default_attrs[] = { &size.attr, &shared_cpu_map.attr, &shared_cpu_list.attr, - &cache_disable.attr, + &cache_disable_0.attr, + &cache_disable_1.attr, NULL }; diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index b2f89829bbe..188a1ca5ad2 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,7 +1,12 @@ -obj-y = mce_$(BITS).o therm_throt.o +obj-y = mce.o -obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o -obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o -obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o +obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o +obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o +obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o +obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o +obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o +obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o + +obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c index dd3af6e7b39..b945d5dbc60 100644 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ b/arch/x86/kernel/cpu/mcheck/k7.c @@ -2,25 +2,23 @@ * Athlon specific Machine Check Exception Reporting * (C) Copyright 2002 Dave Jones <davej@redhat.com> */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h> #include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h> #include <linux/smp.h> #include <asm/processor.h> #include <asm/system.h> +#include <asm/mce.h> #include <asm/msr.h> -#include "mce.h" - -/* Machine Check Handler For AMD Athlon/Duron */ +/* Machine Check Handler For AMD Athlon/Duron: */ static void k7_machine_check(struct pt_regs *regs, long error_code) { - int recover = 1; u32 alow, ahigh, high, low; u32 mcgstl, mcgsth; + int recover = 1; int i; rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); @@ -32,15 +30,19 @@ static void k7_machine_check(struct pt_regs *regs, long error_code) for (i = 1; i < nr_mce_banks; i++) { rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high&(1<<31)) { + if (high & (1<<31)) { char misc[20]; char addr[24]; - misc[0] = addr[0] = '\0'; + + misc[0] = '\0'; + addr[0] = '\0'; + if (high & (1<<29)) recover |= 1; if (high & (1<<25)) recover |= 2; high &= ~(1<<31); + if (high & (1<<27)) { rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); snprintf(misc, 20, "[%08x%08x]", ahigh, alow); @@ -49,27 +51,31 @@ static void k7_machine_check(struct pt_regs *regs, long error_code) rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); snprintf(addr, 24, " at %08x%08x", ahigh, alow); } + printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", smp_processor_id(), i, high, low, misc, addr); - /* Clear it */ + + /* Clear it: */ wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - /* Serialize */ + /* Serialize: */ wmb(); add_taint(TAINT_MACHINE_CHECK); } } - if (recover&2) + if (recover & 2) panic("CPU context corrupt"); - if (recover&1) + if (recover & 1) panic("Unable to continue"); + printk(KERN_EMERG "Attempting to continue.\n"); + mcgstl &= ~(1<<2); wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); } -/* AMD K7 machine check is Intel like */ +/* AMD K7 machine check is Intel like: */ void amd_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; @@ -79,21 +85,26 @@ void amd_mcheck_init(struct cpuinfo_x86 *c) return; machine_check_vector = k7_machine_check; + /* Make sure the vector pointer is visible before we enable MCEs: */ wmb(); printk(KERN_INFO "Intel machine check architecture supported.\n"); + rdmsr(MSR_IA32_MCG_CAP, l, h); if (l & (1<<8)) /* Control register present ? */ wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); nr_mce_banks = l & 0xff; - /* Clear status for MC index 0 separately, we don't touch CTL, - * as some K7 Athlons cause spurious MCEs when its enabled. */ + /* + * Clear status for MC index 0 separately, we don't touch CTL, + * as some K7 Athlons cause spurious MCEs when its enabled: + */ if (boot_cpu_data.x86 == 6) { wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); i = 1; } else i = 0; + for (; i < nr_mce_banks; i++) { wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c new file mode 100644 index 00000000000..a3a235a53f0 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -0,0 +1,127 @@ +/* + * Machine check injection support. + * Copyright 2008 Intel Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Authors: + * Andi Kleen + * Ying Huang + */ +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/timer.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/fs.h> +#include <linux/smp.h> +#include <asm/mce.h> + +/* Update fake mce registers on current CPU. */ +static void inject_mce(struct mce *m) +{ + struct mce *i = &per_cpu(injectm, m->extcpu); + + /* Make sure noone reads partially written injectm */ + i->finished = 0; + mb(); + m->finished = 0; + /* First set the fields after finished */ + i->extcpu = m->extcpu; + mb(); + /* Now write record in order, finished last (except above) */ + memcpy(i, m, sizeof(struct mce)); + /* Finally activate it */ + mb(); + i->finished = 1; +} + +struct delayed_mce { + struct timer_list timer; + struct mce m; +}; + +/* Inject mce on current CPU */ +static void raise_mce(unsigned long data) +{ + struct delayed_mce *dm = (struct delayed_mce *)data; + struct mce *m = &dm->m; + int cpu = m->extcpu; + + inject_mce(m); + if (m->status & MCI_STATUS_UC) { + struct pt_regs regs; + memset(®s, 0, sizeof(struct pt_regs)); + regs.ip = m->ip; + regs.cs = m->cs; + printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); + do_machine_check(®s, 0); + printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); + } else { + mce_banks_t b; + memset(&b, 0xff, sizeof(mce_banks_t)); + printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); + machine_check_poll(0, &b); + mce_notify_irq(); + printk(KERN_INFO "Finished machine check poll on CPU %d\n", + cpu); + } + kfree(dm); +} + +/* Error injection interface */ +static ssize_t mce_write(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off) +{ + struct delayed_mce *dm; + struct mce m; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* + * There are some cases where real MSR reads could slip + * through. + */ + if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA)) + return -EIO; + + if ((unsigned long)usize > sizeof(struct mce)) + usize = sizeof(struct mce); + if (copy_from_user(&m, ubuf, usize)) + return -EFAULT; + + if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) + return -EINVAL; + + dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL); + if (!dm) + return -ENOMEM; + + /* + * Need to give user space some time to set everything up, + * so do it a jiffie or two later everywhere. + * Should we use a hrtimer here for better synchronization? + */ + memcpy(&dm->m, &m, sizeof(struct mce)); + setup_timer(&dm->timer, raise_mce, (unsigned long)dm); + dm->timer.expires = jiffies + 2; + add_timer_on(&dm->timer, m.extcpu); + return usize; +} + +static int inject_init(void) +{ + printk(KERN_INFO "Machine check injector initialized\n"); + mce_chrdev_ops.write = mce_write; + return 0; +} + +module_init(inject_init); +/* + * Cannot tolerate unloading currently because we cannot + * guarantee all openers of mce_chrdev will get a reference to us. + */ +MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h new file mode 100644 index 00000000000..54dcb8ff12e --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -0,0 +1,15 @@ +#include <asm/mce.h> + +enum severity_level { + MCE_NO_SEVERITY, + MCE_KEEP_SEVERITY, + MCE_SOME_SEVERITY, + MCE_AO_SEVERITY, + MCE_UC_SEVERITY, + MCE_AR_SEVERITY, + MCE_PANIC_SEVERITY, +}; + +int mce_severity(struct mce *a, int tolerant, char **msg); + +extern int mce_ser; diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c new file mode 100644 index 00000000000..ff0807f9705 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -0,0 +1,218 @@ +/* + * MCE grading rules. + * Copyright 2008, 2009 Intel Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Author: Andi Kleen + */ +#include <linux/kernel.h> +#include <linux/seq_file.h> +#include <linux/init.h> +#include <linux/debugfs.h> +#include <asm/mce.h> + +#include "mce-internal.h" + +/* + * Grade an mce by severity. In general the most severe ones are processed + * first. Since there are quite a lot of combinations test the bits in a + * table-driven way. The rules are simply processed in order, first + * match wins. + * + * Note this is only used for machine check exceptions, the corrected + * errors use much simpler rules. The exceptions still check for the corrected + * errors, but only to leave them alone for the CMCI handler (except for + * panic situations) + */ + +enum context { IN_KERNEL = 1, IN_USER = 2 }; +enum ser { SER_REQUIRED = 1, NO_SER = 2 }; + +static struct severity { + u64 mask; + u64 result; + unsigned char sev; + unsigned char mcgmask; + unsigned char mcgres; + unsigned char ser; + unsigned char context; + unsigned char covered; + char *msg; +} severities[] = { +#define KERNEL .context = IN_KERNEL +#define USER .context = IN_USER +#define SER .ser = SER_REQUIRED +#define NOSER .ser = NO_SER +#define SEV(s) .sev = MCE_ ## s ## _SEVERITY +#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r } +#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r } +#define MCGMASK(x, res, s, m, r...) \ + { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r } +#define MASK(x, y, s, m, r...) \ + { .mask = x, .result = y, SEV(s), .msg = m, ## r } +#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) +#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) +#define MCACOD 0xffff + + BITCLR(MCI_STATUS_VAL, NO, "Invalid"), + BITCLR(MCI_STATUS_EN, NO, "Not enabled"), + BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"), + /* When MCIP is not set something is very confused */ + MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"), + /* Neither return not error IP -- no chance to recover -> PANIC */ + MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC, + "Neither restart nor error IP"), + MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP", + KERNEL), + BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER), + MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME, + "Spurious not enabled", SER), + + /* ignore OVER for UCNA */ + MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP, + "Uncorrected no action required", SER), + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC, + "Illegal combination (UCNA with AR=1)", SER), + MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER), + + /* AR add known MCACODs here */ + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC, + "Action required with lost events", SER), + MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC, + "Action required; unknown MCACOD", SER), + + /* known AO MCACODs: */ + MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO, + "Action optional: memory scrubbing error", SER), + MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO, + "Action optional: last level cache writeback error", SER), + + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME, + "Action optional unknown MCACOD", SER), + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME, + "Action optional with lost events", SER), + BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"), + BITSET(MCI_STATUS_UC, UC, "Uncorrected"), + BITSET(0, SOME, "No match") /* always matches. keep at end */ +}; + +/* + * If the EIPV bit is set, it means the saved IP is the + * instruction which caused the MCE. + */ +static int error_context(struct mce *m) +{ + if (m->mcgstatus & MCG_STATUS_EIPV) + return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL; + /* Unknown, assume kernel */ + return IN_KERNEL; +} + +int mce_severity(struct mce *a, int tolerant, char **msg) +{ + enum context ctx = error_context(a); + struct severity *s; + + for (s = severities;; s++) { + if ((a->status & s->mask) != s->result) + continue; + if ((a->mcgstatus & s->mcgmask) != s->mcgres) + continue; + if (s->ser == SER_REQUIRED && !mce_ser) + continue; + if (s->ser == NO_SER && mce_ser) + continue; + if (s->context && ctx != s->context) + continue; + if (msg) + *msg = s->msg; + s->covered = 1; + if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) { + if (panic_on_oops || tolerant < 1) + return MCE_PANIC_SEVERITY; + } + return s->sev; + } +} + +static void *s_start(struct seq_file *f, loff_t *pos) +{ + if (*pos >= ARRAY_SIZE(severities)) + return NULL; + return &severities[*pos]; +} + +static void *s_next(struct seq_file *f, void *data, loff_t *pos) +{ + if (++(*pos) >= ARRAY_SIZE(severities)) + return NULL; + return &severities[*pos]; +} + +static void s_stop(struct seq_file *f, void *data) +{ +} + +static int s_show(struct seq_file *f, void *data) +{ + struct severity *ser = data; + seq_printf(f, "%d\t%s\n", ser->covered, ser->msg); + return 0; +} + +static const struct seq_operations severities_seq_ops = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +static int severities_coverage_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &severities_seq_ops); +} + +static ssize_t severities_coverage_write(struct file *file, + const char __user *ubuf, + size_t count, loff_t *ppos) +{ + int i; + for (i = 0; i < ARRAY_SIZE(severities); i++) + severities[i].covered = 0; + return count; +} + +static const struct file_operations severities_coverage_fops = { + .open = severities_coverage_open, + .release = seq_release, + .read = seq_read, + .write = severities_coverage_write, +}; + +static int __init severities_debugfs_init(void) +{ + struct dentry *dmce = NULL, *fseverities_coverage = NULL; + + dmce = debugfs_create_dir("mce", NULL); + if (dmce == NULL) + goto err_out; + fseverities_coverage = debugfs_create_file("severities-coverage", + 0444, dmce, NULL, + &severities_coverage_fops); + if (fseverities_coverage == NULL) + goto err_out; + + return 0; + +err_out: + if (fseverities_coverage) + debugfs_remove(fseverities_coverage); + if (dmce) + debugfs_remove(dmce); + return -ENOMEM; +} +late_initcall(severities_debugfs_init); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c new file mode 100644 index 00000000000..01213048f62 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -0,0 +1,2060 @@ +/* + * Machine check handler. + * + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Rest from unknown author(s). + * 2004 Andi Kleen. Rewrote most of it. + * Copyright 2008 Intel Corporation + * Author: Andi Kleen + */ +#include <linux/thread_info.h> +#include <linux/capability.h> +#include <linux/miscdevice.h> +#include <linux/interrupt.h> +#include <linux/ratelimit.h> +#include <linux/kallsyms.h> +#include <linux/rcupdate.h> +#include <linux/kobject.h> +#include <linux/uaccess.h> +#include <linux/kdebug.h> +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <linux/string.h> +#include <linux/sysdev.h> +#include <linux/delay.h> +#include <linux/ctype.h> +#include <linux/sched.h> +#include <linux/sysfs.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/kmod.h> +#include <linux/poll.h> +#include <linux/nmi.h> +#include <linux/cpu.h> +#include <linux/smp.h> +#include <linux/fs.h> +#include <linux/mm.h> + +#include <asm/processor.h> +#include <asm/hw_irq.h> +#include <asm/apic.h> +#include <asm/idle.h> +#include <asm/ipi.h> +#include <asm/mce.h> +#include <asm/msr.h> + +#include "mce-internal.h" + +/* Handle unconfigured int18 (should never happen) */ +static void unexpected_machine_check(struct pt_regs *regs, long error_code) +{ + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *, long error_code) = + unexpected_machine_check; + +int mce_disabled __read_mostly; + +#ifdef CONFIG_X86_NEW_MCE + +#define MISC_MCELOG_MINOR 227 + +#define SPINUNIT 100 /* 100ns */ + +atomic_t mce_entry; + +DEFINE_PER_CPU(unsigned, mce_exception_count); + +/* + * Tolerant levels: + * 0: always panic on uncorrected errors, log corrected errors + * 1: panic or SIGBUS on uncorrected errors, log corrected errors + * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors + * 3: never panic or SIGBUS, log all errors (for testing only) + */ +static int tolerant __read_mostly = 1; +static int banks __read_mostly; +static u64 *bank __read_mostly; +static int rip_msr __read_mostly; +static int mce_bootlog __read_mostly = -1; +static int monarch_timeout __read_mostly = -1; +static int mce_panic_timeout __read_mostly; +static int mce_dont_log_ce __read_mostly; +int mce_cmci_disabled __read_mostly; +int mce_ignore_ce __read_mostly; +int mce_ser __read_mostly; + +/* User mode helper program triggered by machine check event */ +static unsigned long mce_need_notify; +static char mce_helper[128]; +static char *mce_helper_argv[2] = { mce_helper, NULL }; + +static unsigned long dont_init_banks; + +static DECLARE_WAIT_QUEUE_HEAD(mce_wait); +static DEFINE_PER_CPU(struct mce, mces_seen); +static int cpu_missing; + + +/* MCA banks polled by the period polling timer for corrected events */ +DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { + [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL +}; + +static inline int skip_bank_init(int i) +{ + return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); +} + +static DEFINE_PER_CPU(struct work_struct, mce_work); + +/* Do initial initialization of a struct mce */ +void mce_setup(struct mce *m) +{ + memset(m, 0, sizeof(struct mce)); + m->cpu = m->extcpu = smp_processor_id(); + rdtscll(m->tsc); + /* We hope get_seconds stays lockless */ + m->time = get_seconds(); + m->cpuvendor = boot_cpu_data.x86_vendor; + m->cpuid = cpuid_eax(1); +#ifdef CONFIG_SMP + m->socketid = cpu_data(m->extcpu).phys_proc_id; +#endif + m->apicid = cpu_data(m->extcpu).initial_apicid; + rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); +} + +DEFINE_PER_CPU(struct mce, injectm); +EXPORT_PER_CPU_SYMBOL_GPL(injectm); + +/* + * Lockless MCE logging infrastructure. + * This avoids deadlocks on printk locks without having to break locks. Also + * separate MCEs from kernel messages to avoid bogus bug reports. + */ + +static struct mce_log mcelog = { + .signature = MCE_LOG_SIGNATURE, + .len = MCE_LOG_LEN, + .recordlen = sizeof(struct mce), +}; + +void mce_log(struct mce *mce) +{ + unsigned next, entry; + + mce->finished = 0; + wmb(); + for (;;) { + entry = rcu_dereference(mcelog.next); + for (;;) { + /* + * When the buffer fills up discard new entries. + * Assume that the earlier errors are the more + * interesting ones: + */ + if (entry >= MCE_LOG_LEN) { + set_bit(MCE_OVERFLOW, + (unsigned long *)&mcelog.flags); + return; + } + /* Old left over entry. Skip: */ + if (mcelog.entry[entry].finished) { + entry++; + continue; + } + break; + } + smp_rmb(); + next = entry + 1; + if (cmpxchg(&mcelog.next, entry, next) == entry) + break; + } + memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); + wmb(); + mcelog.entry[entry].finished = 1; + wmb(); + + mce->finished = 1; + set_bit(0, &mce_need_notify); +} + +static void print_mce(struct mce *m) +{ + printk(KERN_EMERG + "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", + m->extcpu, m->mcgstatus, m->bank, m->status); + if (m->ip) { + printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", + !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", + m->cs, m->ip); + if (m->cs == __KERNEL_CS) + print_symbol("{%s}", m->ip); + printk(KERN_CONT "\n"); + } + printk(KERN_EMERG "TSC %llx ", m->tsc); + if (m->addr) + printk(KERN_CONT "ADDR %llx ", m->addr); + if (m->misc) + printk(KERN_CONT "MISC %llx ", m->misc); + printk(KERN_CONT "\n"); + printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", + m->cpuvendor, m->cpuid, m->time, m->socketid, + m->apicid); +} + +static void print_mce_head(void) +{ + printk(KERN_EMERG "\nHARDWARE ERROR\n"); +} + +static void print_mce_tail(void) +{ + printk(KERN_EMERG "This is not a software problem!\n" + "Run through mcelog --ascii to decode and contact your hardware vendor\n"); +} + +#define PANIC_TIMEOUT 5 /* 5 seconds */ + +static atomic_t mce_paniced; + +/* Panic in progress. Enable interrupts and wait for final IPI */ +static void wait_for_panic(void) +{ + long timeout = PANIC_TIMEOUT*USEC_PER_SEC; + preempt_disable(); + local_irq_enable(); + while (timeout-- > 0) + udelay(1); + if (panic_timeout == 0) + panic_timeout = mce_panic_timeout; + panic("Panicing machine check CPU died"); +} + +static void mce_panic(char *msg, struct mce *final, char *exp) +{ + int i; + + /* + * Make sure only one CPU runs in machine check panic + */ + if (atomic_add_return(1, &mce_paniced) > 1) + wait_for_panic(); + barrier(); + + bust_spinlocks(1); + console_verbose(); + print_mce_head(); + /* First print corrected ones that are still unlogged */ + for (i = 0; i < MCE_LOG_LEN; i++) { + struct mce *m = &mcelog.entry[i]; + if (!(m->status & MCI_STATUS_VAL)) + continue; + if (!(m->status & MCI_STATUS_UC)) + print_mce(m); + } + /* Now print uncorrected but with the final one last */ + for (i = 0; i < MCE_LOG_LEN; i++) { + struct mce *m = &mcelog.entry[i]; + if (!(m->status & MCI_STATUS_VAL)) + continue; + if (!(m->status & MCI_STATUS_UC)) + continue; + if (!final || memcmp(m, final, sizeof(struct mce))) + print_mce(m); + } + if (final) + print_mce(final); + if (cpu_missing) + printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); + print_mce_tail(); + if (exp) + printk(KERN_EMERG "Machine check: %s\n", exp); + if (panic_timeout == 0) + panic_timeout = mce_panic_timeout; + panic(msg); +} + +/* Support code for software error injection */ + +static int msr_to_offset(u32 msr) +{ + unsigned bank = __get_cpu_var(injectm.bank); + if (msr == rip_msr) + return offsetof(struct mce, ip); + if (msr == MSR_IA32_MC0_STATUS + bank*4) + return offsetof(struct mce, status); + if (msr == MSR_IA32_MC0_ADDR + bank*4) + return offsetof(struct mce, addr); + if (msr == MSR_IA32_MC0_MISC + bank*4) + return offsetof(struct mce, misc); + if (msr == MSR_IA32_MCG_STATUS) + return offsetof(struct mce, mcgstatus); + return -1; +} + +/* MSR access wrappers used for error injection */ +static u64 mce_rdmsrl(u32 msr) +{ + u64 v; + if (__get_cpu_var(injectm).finished) { + int offset = msr_to_offset(msr); + if (offset < 0) + return 0; + return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); + } + rdmsrl(msr, v); + return v; +} + +static void mce_wrmsrl(u32 msr, u64 v) +{ + if (__get_cpu_var(injectm).finished) { + int offset = msr_to_offset(msr); + if (offset >= 0) + *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; + return; + } + wrmsrl(msr, v); +} + +/* + * Simple lockless ring to communicate PFNs from the exception handler with the + * process context work function. This is vastly simplified because there's + * only a single reader and a single writer. + */ +#define MCE_RING_SIZE 16 /* we use one entry less */ + +struct mce_ring { + unsigned short start; + unsigned short end; + unsigned long ring[MCE_RING_SIZE]; +}; +static DEFINE_PER_CPU(struct mce_ring, mce_ring); + +/* Runs with CPU affinity in workqueue */ +static int mce_ring_empty(void) +{ + struct mce_ring *r = &__get_cpu_var(mce_ring); + + return r->start == r->end; +} + +static int mce_ring_get(unsigned long *pfn) +{ + struct mce_ring *r; + int ret = 0; + + *pfn = 0; + get_cpu(); + r = &__get_cpu_var(mce_ring); + if (r->start == r->end) + goto out; + *pfn = r->ring[r->start]; + r->start = (r->start + 1) % MCE_RING_SIZE; + ret = 1; +out: + put_cpu(); + return ret; +} + +/* Always runs in MCE context with preempt off */ +static int mce_ring_add(unsigned long pfn) +{ + struct mce_ring *r = &__get_cpu_var(mce_ring); + unsigned next; + + next = (r->end + 1) % MCE_RING_SIZE; + if (next == r->start) + return -1; + r->ring[r->end] = pfn; + wmb(); + r->end = next; + return 0; +} + +int mce_available(struct cpuinfo_x86 *c) +{ + if (mce_disabled) + return 0; + return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); +} + +static void mce_schedule_work(void) +{ + if (!mce_ring_empty()) { + struct work_struct *work = &__get_cpu_var(mce_work); + if (!work_pending(work)) + schedule_work(work); + } +} + +/* + * Get the address of the instruction at the time of the machine check + * error. + */ +static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) +{ + + if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) { + m->ip = regs->ip; + m->cs = regs->cs; + } else { + m->ip = 0; + m->cs = 0; + } + if (rip_msr) + m->ip = mce_rdmsrl(rip_msr); +} + +#ifdef CONFIG_X86_LOCAL_APIC +/* + * Called after interrupts have been reenabled again + * when a MCE happened during an interrupts off region + * in the kernel. + */ +asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) +{ + ack_APIC_irq(); + exit_idle(); + irq_enter(); + mce_notify_irq(); + mce_schedule_work(); + irq_exit(); +} +#endif + +static void mce_report_event(struct pt_regs *regs) +{ + if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { + mce_notify_irq(); + /* + * Triggering the work queue here is just an insurance + * policy in case the syscall exit notify handler + * doesn't run soon enough or ends up running on the + * wrong CPU (can happen when audit sleeps) + */ + mce_schedule_work(); + return; + } + +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Without APIC do not notify. The event will be picked + * up eventually. + */ + if (!cpu_has_apic) + return; + + /* + * When interrupts are disabled we cannot use + * kernel services safely. Trigger an self interrupt + * through the APIC to instead do the notification + * after interrupts are reenabled again. + */ + apic->send_IPI_self(MCE_SELF_VECTOR); + + /* + * Wait for idle afterwards again so that we don't leave the + * APIC in a non idle state because the normal APIC writes + * cannot exclude us. + */ + apic_wait_icr_idle(); +#endif +} + +DEFINE_PER_CPU(unsigned, mce_poll_count); + +/* + * Poll for corrected events or events that happened before reset. + * Those are just logged through /dev/mcelog. + * + * This is executed in standard interrupt context. + * + * Note: spec recommends to panic for fatal unsignalled + * errors here. However this would be quite problematic -- + * we would need to reimplement the Monarch handling and + * it would mess up the exclusion between exception handler + * and poll hander -- * so we skip this for now. + * These cases should not happen anyways, or only when the CPU + * is already totally * confused. In this case it's likely it will + * not fully execute the machine check handler either. + */ +void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) +{ + struct mce m; + int i; + + __get_cpu_var(mce_poll_count)++; + + mce_setup(&m); + + m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + for (i = 0; i < banks; i++) { + if (!bank[i] || !test_bit(i, *b)) + continue; + + m.misc = 0; + m.addr = 0; + m.bank = i; + m.tsc = 0; + + barrier(); + m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + if (!(m.status & MCI_STATUS_VAL)) + continue; + + /* + * Uncorrected or signalled events are handled by the exception + * handler when it is enabled, so don't process those here. + * + * TBD do the same check for MCI_STATUS_EN here? + */ + if (!(flags & MCP_UC) && + (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) + continue; + + if (m.status & MCI_STATUS_MISCV) + m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); + if (m.status & MCI_STATUS_ADDRV) + m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + + if (!(flags & MCP_TIMESTAMP)) + m.tsc = 0; + /* + * Don't get the IP here because it's unlikely to + * have anything to do with the actual error location. + */ + if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { + mce_log(&m); + add_taint(TAINT_MACHINE_CHECK); + } + + /* + * Clear state for this bank. + */ + mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } + + /* + * Don't clear MCG_STATUS here because it's only defined for + * exceptions. + */ + + sync_core(); +} +EXPORT_SYMBOL_GPL(machine_check_poll); + +/* + * Do a quick check if any of the events requires a panic. + * This decides if we keep the events around or clear them. + */ +static int mce_no_way_out(struct mce *m, char **msg) +{ + int i; + + for (i = 0; i < banks; i++) { + m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) + return 1; + } + return 0; +} + +/* + * Variable to establish order between CPUs while scanning. + * Each CPU spins initially until executing is equal its number. + */ +static atomic_t mce_executing; + +/* + * Defines order of CPUs on entry. First CPU becomes Monarch. + */ +static atomic_t mce_callin; + +/* + * Check if a timeout waiting for other CPUs happened. + */ +static int mce_timed_out(u64 *t) +{ + /* + * The others already did panic for some reason. + * Bail out like in a timeout. + * rmb() to tell the compiler that system_state + * might have been modified by someone else. + */ + rmb(); + if (atomic_read(&mce_paniced)) + wait_for_panic(); + if (!monarch_timeout) + goto out; + if ((s64)*t < SPINUNIT) { + /* CHECKME: Make panic default for 1 too? */ + if (tolerant < 1) + mce_panic("Timeout synchronizing machine check over CPUs", + NULL, NULL); + cpu_missing = 1; + return 1; + } + *t -= SPINUNIT; +out: + touch_nmi_watchdog(); + return 0; +} + +/* + * The Monarch's reign. The Monarch is the CPU who entered + * the machine check handler first. It waits for the others to + * raise the exception too and then grades them. When any + * error is fatal panic. Only then let the others continue. + * + * The other CPUs entering the MCE handler will be controlled by the + * Monarch. They are called Subjects. + * + * This way we prevent any potential data corruption in a unrecoverable case + * and also makes sure always all CPU's errors are examined. + * + * Also this detects the case of an machine check event coming from outer + * space (not detected by any CPUs) In this case some external agent wants + * us to shut down, so panic too. + * + * The other CPUs might still decide to panic if the handler happens + * in a unrecoverable place, but in this case the system is in a semi-stable + * state and won't corrupt anything by itself. It's ok to let the others + * continue for a bit first. + * + * All the spin loops have timeouts; when a timeout happens a CPU + * typically elects itself to be Monarch. + */ +static void mce_reign(void) +{ + int cpu; + struct mce *m = NULL; + int global_worst = 0; + char *msg = NULL; + char *nmsg = NULL; + + /* + * This CPU is the Monarch and the other CPUs have run + * through their handlers. + * Grade the severity of the errors of all the CPUs. + */ + for_each_possible_cpu(cpu) { + int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, + &nmsg); + if (severity > global_worst) { + msg = nmsg; + global_worst = severity; + m = &per_cpu(mces_seen, cpu); + } + } + + /* + * Cannot recover? Panic here then. + * This dumps all the mces in the log buffer and stops the + * other CPUs. + */ + if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) + mce_panic("Fatal Machine check", m, msg); + + /* + * For UC somewhere we let the CPU who detects it handle it. + * Also must let continue the others, otherwise the handling + * CPU could deadlock on a lock. + */ + + /* + * No machine check event found. Must be some external + * source or one CPU is hung. Panic. + */ + if (!m && tolerant < 3) + mce_panic("Machine check from unknown source", NULL, NULL); + + /* + * Now clear all the mces_seen so that they don't reappear on + * the next mce. + */ + for_each_possible_cpu(cpu) + memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); +} + +static atomic_t global_nwo; + +/* + * Start of Monarch synchronization. This waits until all CPUs have + * entered the exception handler and then determines if any of them + * saw a fatal event that requires panic. Then it executes them + * in the entry order. + * TBD double check parallel CPU hotunplug + */ +static int mce_start(int *no_way_out) +{ + int order; + int cpus = num_online_cpus(); + u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; + + if (!timeout) + return -1; + + atomic_add(*no_way_out, &global_nwo); + /* + * global_nwo should be updated before mce_callin + */ + smp_wmb(); + order = atomic_add_return(1, &mce_callin); + + /* + * Wait for everyone. + */ + while (atomic_read(&mce_callin) != cpus) { + if (mce_timed_out(&timeout)) { + atomic_set(&global_nwo, 0); + return -1; + } + ndelay(SPINUNIT); + } + + /* + * mce_callin should be read before global_nwo + */ + smp_rmb(); + + if (order == 1) { + /* + * Monarch: Starts executing now, the others wait. + */ + atomic_set(&mce_executing, 1); + } else { + /* + * Subject: Now start the scanning loop one by one in + * the original callin order. + * This way when there are any shared banks it will be + * only seen by one CPU before cleared, avoiding duplicates. + */ + while (atomic_read(&mce_executing) < order) { + if (mce_timed_out(&timeout)) { + atomic_set(&global_nwo, 0); + return -1; + } + ndelay(SPINUNIT); + } + } + + /* + * Cache the global no_way_out state. + */ + *no_way_out = atomic_read(&global_nwo); + + return order; +} + +/* + * Synchronize between CPUs after main scanning loop. + * This invokes the bulk of the Monarch processing. + */ +static int mce_end(int order) +{ + int ret = -1; + u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; + + if (!timeout) + goto reset; + if (order < 0) + goto reset; + + /* + * Allow others to run. + */ + atomic_inc(&mce_executing); + + if (order == 1) { + /* CHECKME: Can this race with a parallel hotplug? */ + int cpus = num_online_cpus(); + + /* + * Monarch: Wait for everyone to go through their scanning + * loops. + */ + while (atomic_read(&mce_executing) <= cpus) { + if (mce_timed_out(&timeout)) + goto reset; + ndelay(SPINUNIT); + } + + mce_reign(); + barrier(); + ret = 0; + } else { + /* + * Subject: Wait for Monarch to finish. + */ + while (atomic_read(&mce_executing) != 0) { + if (mce_timed_out(&timeout)) + goto reset; + ndelay(SPINUNIT); + } + + /* + * Don't reset anything. That's done by the Monarch. + */ + return 0; + } + + /* + * Reset all global state. + */ +reset: + atomic_set(&global_nwo, 0); + atomic_set(&mce_callin, 0); + barrier(); + + /* + * Let others run again. + */ + atomic_set(&mce_executing, 0); + return ret; +} + +/* + * Check if the address reported by the CPU is in a format we can parse. + * It would be possible to add code for most other cases, but all would + * be somewhat complicated (e.g. segment offset would require an instruction + * parser). So only support physical addresses upto page granuality for now. + */ +static int mce_usable_address(struct mce *m) +{ + if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) + return 0; + if ((m->misc & 0x3f) > PAGE_SHIFT) + return 0; + if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS) + return 0; + return 1; +} + +static void mce_clear_state(unsigned long *toclear) +{ + int i; + + for (i = 0; i < banks; i++) { + if (test_bit(i, toclear)) + mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } +} + +/* + * The actual machine check handler. This only handles real + * exceptions when something got corrupted coming in through int 18. + * + * This is executed in NMI context not subject to normal locking rules. This + * implies that most kernel services cannot be safely used. Don't even + * think about putting a printk in there! + * + * On Intel systems this is entered on all CPUs in parallel through + * MCE broadcast. However some CPUs might be broken beyond repair, + * so be always careful when synchronizing with others. + */ +void do_machine_check(struct pt_regs *regs, long error_code) +{ + struct mce m, *final; + int i; + int worst = 0; + int severity; + /* + * Establish sequential order between the CPUs entering the machine + * check handler. + */ + int order; + /* + * If no_way_out gets set, there is no safe way to recover from this + * MCE. If tolerant is cranked up, we'll try anyway. + */ + int no_way_out = 0; + /* + * If kill_it gets set, there might be a way to recover from this + * error. + */ + int kill_it = 0; + DECLARE_BITMAP(toclear, MAX_NR_BANKS); + char *msg = "Unknown"; + + atomic_inc(&mce_entry); + + __get_cpu_var(mce_exception_count)++; + + if (notify_die(DIE_NMI, "machine check", regs, error_code, + 18, SIGKILL) == NOTIFY_STOP) + goto out; + if (!banks) + goto out; + + mce_setup(&m); + + m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + no_way_out = mce_no_way_out(&m, &msg); + + final = &__get_cpu_var(mces_seen); + *final = m; + + barrier(); + + /* + * When no restart IP must always kill or panic. + */ + if (!(m.mcgstatus & MCG_STATUS_RIPV)) + kill_it = 1; + + /* + * Go through all the banks in exclusion of the other CPUs. + * This way we don't report duplicated events on shared banks + * because the first one to see it will clear it. + */ + order = mce_start(&no_way_out); + for (i = 0; i < banks; i++) { + __clear_bit(i, toclear); + if (!bank[i]) + continue; + + m.misc = 0; + m.addr = 0; + m.bank = i; + + m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + if ((m.status & MCI_STATUS_VAL) == 0) + continue; + + /* + * Non uncorrected or non signaled errors are handled by + * machine_check_poll. Leave them alone, unless this panics. + */ + if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && + !no_way_out) + continue; + + /* + * Set taint even when machine check was not enabled. + */ + add_taint(TAINT_MACHINE_CHECK); + + severity = mce_severity(&m, tolerant, NULL); + + /* + * When machine check was for corrected handler don't touch, + * unless we're panicing. + */ + if (severity == MCE_KEEP_SEVERITY && !no_way_out) + continue; + __set_bit(i, toclear); + if (severity == MCE_NO_SEVERITY) { + /* + * Machine check event was not enabled. Clear, but + * ignore. + */ + continue; + } + + /* + * Kill on action required. + */ + if (severity == MCE_AR_SEVERITY) + kill_it = 1; + + if (m.status & MCI_STATUS_MISCV) + m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); + if (m.status & MCI_STATUS_ADDRV) + m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + + /* + * Action optional error. Queue address for later processing. + * When the ring overflows we just ignore the AO error. + * RED-PEN add some logging mechanism when + * usable_address or mce_add_ring fails. + * RED-PEN don't ignore overflow for tolerant == 0 + */ + if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) + mce_ring_add(m.addr >> PAGE_SHIFT); + + mce_get_rip(&m, regs); + mce_log(&m); + + if (severity > worst) { + *final = m; + worst = severity; + } + } + + if (!no_way_out) + mce_clear_state(toclear); + + /* + * Do most of the synchronization with other CPUs. + * When there's any problem use only local no_way_out state. + */ + if (mce_end(order) < 0) + no_way_out = worst >= MCE_PANIC_SEVERITY; + + /* + * If we have decided that we just CAN'T continue, and the user + * has not set tolerant to an insane level, give up and die. + * + * This is mainly used in the case when the system doesn't + * support MCE broadcasting or it has been disabled. + */ + if (no_way_out && tolerant < 3) + mce_panic("Fatal machine check on current CPU", final, msg); + + /* + * If the error seems to be unrecoverable, something should be + * done. Try to kill as little as possible. If we can kill just + * one task, do that. If the user has set the tolerance very + * high, don't try to do anything at all. + */ + + if (kill_it && tolerant < 3) + force_sig(SIGBUS, current); + + /* notify userspace ASAP */ + set_thread_flag(TIF_MCE_NOTIFY); + + if (worst > 0) + mce_report_event(regs); + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); +out: + atomic_dec(&mce_entry); + sync_core(); +} +EXPORT_SYMBOL_GPL(do_machine_check); + +/* dummy to break dependency. actual code is in mm/memory-failure.c */ +void __attribute__((weak)) memory_failure(unsigned long pfn, int vector) +{ + printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn); +} + +/* + * Called after mce notification in process context. This code + * is allowed to sleep. Call the high level VM handler to process + * any corrupted pages. + * Assume that the work queue code only calls this one at a time + * per CPU. + * Note we don't disable preemption, so this code might run on the wrong + * CPU. In this case the event is picked up by the scheduled work queue. + * This is merely a fast path to expedite processing in some common + * cases. + */ +void mce_notify_process(void) +{ + unsigned long pfn; + mce_notify_irq(); + while (mce_ring_get(&pfn)) + memory_failure(pfn, MCE_VECTOR); +} + +static void mce_process_work(struct work_struct *dummy) +{ + mce_notify_process(); +} + +#ifdef CONFIG_X86_MCE_INTEL +/*** + * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog + * @cpu: The CPU on which the event occurred. + * @status: Event status information + * + * This function should be called by the thermal interrupt after the + * event has been processed and the decision was made to log the event + * further. + * + * The status parameter will be saved to the 'status' field of 'struct mce' + * and historically has been the register value of the + * MSR_IA32_THERMAL_STATUS (Intel) msr. + */ +void mce_log_therm_throt_event(__u64 status) +{ + struct mce m; + + mce_setup(&m); + m.bank = MCE_THERMAL_BANK; + m.status = status; + mce_log(&m); +} +#endif /* CONFIG_X86_MCE_INTEL */ + +/* + * Periodic polling timer for "silent" machine check errors. If the + * poller finds an MCE, poll 2x faster. When the poller finds no more + * errors, poll 2x slower (up to check_interval seconds). + */ +static int check_interval = 5 * 60; /* 5 minutes */ + +static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ +static DEFINE_PER_CPU(struct timer_list, mce_timer); + +static void mcheck_timer(unsigned long data) +{ + struct timer_list *t = &per_cpu(mce_timer, data); + int *n; + + WARN_ON(smp_processor_id() != data); + + if (mce_available(¤t_cpu_data)) { + machine_check_poll(MCP_TIMESTAMP, + &__get_cpu_var(mce_poll_banks)); + } + + /* + * Alert userspace if needed. If we logged an MCE, reduce the + * polling interval, otherwise increase the polling interval. + */ + n = &__get_cpu_var(next_interval); + if (mce_notify_irq()) + *n = max(*n/2, HZ/100); + else + *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); + + t->expires = jiffies + *n; + add_timer_on(t, smp_processor_id()); +} + +static void mce_do_trigger(struct work_struct *work) +{ + call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); +} + +static DECLARE_WORK(mce_trigger_work, mce_do_trigger); + +/* + * Notify the user(s) about new machine check events. + * Can be called from interrupt context, but not from machine check/NMI + * context. + */ +int mce_notify_irq(void) +{ + /* Not more than two messages every minute */ + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); + + clear_thread_flag(TIF_MCE_NOTIFY); + + if (test_and_clear_bit(0, &mce_need_notify)) { + wake_up_interruptible(&mce_wait); + + /* + * There is no risk of missing notifications because + * work_pending is always cleared before the function is + * executed. + */ + if (mce_helper[0] && !work_pending(&mce_trigger_work)) + schedule_work(&mce_trigger_work); + + if (__ratelimit(&ratelimit)) + printk(KERN_INFO "Machine check events logged\n"); + + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(mce_notify_irq); + +/* + * Initialize Machine Checks for a CPU. + */ +static int mce_cap_init(void) +{ + unsigned b; + u64 cap; + + rdmsrl(MSR_IA32_MCG_CAP, cap); + + b = cap & MCG_BANKCNT_MASK; + printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); + + if (b > MAX_NR_BANKS) { + printk(KERN_WARNING + "MCE: Using only %u machine check banks out of %u\n", + MAX_NR_BANKS, b); + b = MAX_NR_BANKS; + } + + /* Don't support asymmetric configurations today */ + WARN_ON(banks != 0 && b != banks); + banks = b; + if (!bank) { + bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); + if (!bank) + return -ENOMEM; + memset(bank, 0xff, banks * sizeof(u64)); + } + + /* Use accurate RIP reporting if available. */ + if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) + rip_msr = MSR_IA32_MCG_EIP; + + if (cap & MCG_SER_P) + mce_ser = 1; + + return 0; +} + +static void mce_init(void) +{ + mce_banks_t all_banks; + u64 cap; + int i; + + /* + * Log the machine checks left over from the previous reset. + */ + bitmap_fill(all_banks, MAX_NR_BANKS); + machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); + + set_in_cr4(X86_CR4_MCE); + + rdmsrl(MSR_IA32_MCG_CAP, cap); + if (cap & MCG_CTL_P) + wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); + + for (i = 0; i < banks; i++) { + if (skip_bank_init(i)) + continue; + wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } +} + +/* Add per CPU specific workarounds here */ +static int mce_cpu_quirks(struct cpuinfo_x86 *c) +{ + if (c->x86_vendor == X86_VENDOR_UNKNOWN) { + pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); + return -EOPNOTSUPP; + } + + /* This should be disabled by the BIOS, but isn't always */ + if (c->x86_vendor == X86_VENDOR_AMD) { + if (c->x86 == 15 && banks > 4) { + /* + * disable GART TBL walk error reporting, which + * trips off incorrectly with the IOMMU & 3ware + * & Cerberus: + */ + clear_bit(10, (unsigned long *)&bank[4]); + } + if (c->x86 <= 17 && mce_bootlog < 0) { + /* + * Lots of broken BIOS around that don't clear them + * by default and leave crap in there. Don't log: + */ + mce_bootlog = 0; + } + /* + * Various K7s with broken bank 0 around. Always disable + * by default. + */ + if (c->x86 == 6 && banks > 0) + bank[0] = 0; + } + + if (c->x86_vendor == X86_VENDOR_INTEL) { + /* + * SDM documents that on family 6 bank 0 should not be written + * because it aliases to another special BIOS controlled + * register. + * But it's not aliased anymore on model 0x1a+ + * Don't ignore bank 0 completely because there could be a + * valid event later, merely don't write CTL0. + */ + + if (c->x86 == 6 && c->x86_model < 0x1A) + __set_bit(0, &dont_init_banks); + + /* + * All newer Intel systems support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && + monarch_timeout < 0) + monarch_timeout = USEC_PER_SEC; + + /* + * There are also broken BIOSes on some Pentium M and + * earlier systems: + */ + if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) + mce_bootlog = 0; + } + if (monarch_timeout < 0) + monarch_timeout = 0; + if (mce_bootlog != 0) + mce_panic_timeout = 30; + + return 0; +} + +static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) +{ + if (c->x86 != 5) + return; + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + intel_p5_mcheck_init(c); + break; + case X86_VENDOR_CENTAUR: + winchip_mcheck_init(c); + break; + } +} + +static void mce_cpu_features(struct cpuinfo_x86 *c) +{ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_feature_init(c); + break; + case X86_VENDOR_AMD: + mce_amd_feature_init(c); + break; + default: + break; + } +} + +static void mce_init_timer(void) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + int *n = &__get_cpu_var(next_interval); + + if (mce_ignore_ce) + return; + + *n = check_interval * HZ; + if (!*n) + return; + setup_timer(t, mcheck_timer, smp_processor_id()); + t->expires = round_jiffies(jiffies + *n); + add_timer_on(t, smp_processor_id()); +} + +/* + * Called for each booted CPU to set up machine checks. + * Must be called with preempt off: + */ +void __cpuinit mcheck_init(struct cpuinfo_x86 *c) +{ + if (mce_disabled) + return; + + mce_ancient_init(c); + + if (!mce_available(c)) + return; + + if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) { + mce_disabled = 1; + return; + } + + machine_check_vector = do_machine_check; + + mce_init(); + mce_cpu_features(c); + mce_init_timer(); + INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); +} + +/* + * Character device to read and clear the MCE log. + */ + +static DEFINE_SPINLOCK(mce_state_lock); +static int open_count; /* #times opened */ +static int open_exclu; /* already open exclusive? */ + +static int mce_open(struct inode *inode, struct file *file) +{ + spin_lock(&mce_state_lock); + + if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&mce_state_lock); + + return -EBUSY; + } + + if (file->f_flags & O_EXCL) + open_exclu = 1; + open_count++; + + spin_unlock(&mce_state_lock); + + return nonseekable_open(inode, file); +} + +static int mce_release(struct inode *inode, struct file *file) +{ + spin_lock(&mce_state_lock); + + open_count--; + open_exclu = 0; + + spin_unlock(&mce_state_lock); + + return 0; +} + +static void collect_tscs(void *data) +{ + unsigned long *cpu_tsc = (unsigned long *)data; + + rdtscll(cpu_tsc[smp_processor_id()]); +} + +static DEFINE_MUTEX(mce_read_mutex); + +static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, + loff_t *off) +{ + char __user *buf = ubuf; + unsigned long *cpu_tsc; + unsigned prev, next; + int i, err; + + cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); + if (!cpu_tsc) + return -ENOMEM; + + mutex_lock(&mce_read_mutex); + next = rcu_dereference(mcelog.next); + + /* Only supports full reads right now */ + if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { + mutex_unlock(&mce_read_mutex); + kfree(cpu_tsc); + + return -EINVAL; + } + + err = 0; + prev = 0; + do { + for (i = prev; i < next; i++) { + unsigned long start = jiffies; + + while (!mcelog.entry[i].finished) { + if (time_after_eq(jiffies, start + 2)) { + memset(mcelog.entry + i, 0, + sizeof(struct mce)); + goto timeout; + } + cpu_relax(); + } + smp_rmb(); + err |= copy_to_user(buf, mcelog.entry + i, + sizeof(struct mce)); + buf += sizeof(struct mce); +timeout: + ; + } + + memset(mcelog.entry + prev, 0, + (next - prev) * sizeof(struct mce)); + prev = next; + next = cmpxchg(&mcelog.next, prev, 0); + } while (next != prev); + + synchronize_sched(); + + /* + * Collect entries that were still getting written before the + * synchronize. + */ + on_each_cpu(collect_tscs, cpu_tsc, 1); + + for (i = next; i < MCE_LOG_LEN; i++) { + if (mcelog.entry[i].finished && + mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { + err |= copy_to_user(buf, mcelog.entry+i, + sizeof(struct mce)); + smp_rmb(); + buf += sizeof(struct mce); + memset(&mcelog.entry[i], 0, sizeof(struct mce)); + } + } + mutex_unlock(&mce_read_mutex); + kfree(cpu_tsc); + + return err ? -EFAULT : buf - ubuf; +} + +static unsigned int mce_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &mce_wait, wait); + if (rcu_dereference(mcelog.next)) + return POLLIN | POLLRDNORM; + return 0; +} + +static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ + int __user *p = (int __user *)arg; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case MCE_GET_RECORD_LEN: + return put_user(sizeof(struct mce), p); + case MCE_GET_LOG_LEN: + return put_user(MCE_LOG_LEN, p); + case MCE_GETCLEAR_FLAGS: { + unsigned flags; + + do { + flags = mcelog.flags; + } while (cmpxchg(&mcelog.flags, flags, 0) != flags); + + return put_user(flags, p); + } + default: + return -ENOTTY; + } +} + +/* Modified in mce-inject.c, so not static or const */ +struct file_operations mce_chrdev_ops = { + .open = mce_open, + .release = mce_release, + .read = mce_read, + .poll = mce_poll, + .unlocked_ioctl = mce_ioctl, +}; +EXPORT_SYMBOL_GPL(mce_chrdev_ops); + +static struct miscdevice mce_log_device = { + MISC_MCELOG_MINOR, + "mcelog", + &mce_chrdev_ops, +}; + +/* + * mce=off Disables machine check + * mce=no_cmci Disables CMCI + * mce=dont_log_ce Clears corrected events silently, no log created for CEs. + * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. + * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) + * monarchtimeout is how long to wait for other CPUs on machine + * check, or 0 to not wait + * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. + * mce=nobootlog Don't log MCEs from before booting. + */ +static int __init mcheck_enable(char *str) +{ + if (*str == 0) + enable_p5_mce(); + if (*str == '=') + str++; + if (!strcmp(str, "off")) + mce_disabled = 1; + else if (!strcmp(str, "no_cmci")) + mce_cmci_disabled = 1; + else if (!strcmp(str, "dont_log_ce")) + mce_dont_log_ce = 1; + else if (!strcmp(str, "ignore_ce")) + mce_ignore_ce = 1; + else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) + mce_bootlog = (str[0] == 'b'); + else if (isdigit(str[0])) { + get_option(&str, &tolerant); + if (*str == ',') { + ++str; + get_option(&str, &monarch_timeout); + } + } else { + printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", + str); + return 0; + } + return 1; +} +__setup("mce", mcheck_enable); + +/* + * Sysfs support + */ + +/* + * Disable machine checks on suspend and shutdown. We can't really handle + * them later. + */ +static int mce_disable(void) +{ + int i; + + for (i = 0; i < banks; i++) { + if (!skip_bank_init(i)) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + } + return 0; +} + +static int mce_suspend(struct sys_device *dev, pm_message_t state) +{ + return mce_disable(); +} + +static int mce_shutdown(struct sys_device *dev) +{ + return mce_disable(); +} + +/* + * On resume clear all MCE state. Don't want to see leftovers from the BIOS. + * Only one CPU is active at this time, the others get re-added later using + * CPU hotplug: + */ +static int mce_resume(struct sys_device *dev) +{ + mce_init(); + mce_cpu_features(¤t_cpu_data); + + return 0; +} + +static void mce_cpu_restart(void *data) +{ + del_timer_sync(&__get_cpu_var(mce_timer)); + if (!mce_available(¤t_cpu_data)) + return; + mce_init(); + mce_init_timer(); +} + +/* Reinit MCEs after user configuration changes */ +static void mce_restart(void) +{ + on_each_cpu(mce_cpu_restart, NULL, 1); +} + +/* Toggle features for corrected errors */ +static void mce_disable_ce(void *all) +{ + if (!mce_available(¤t_cpu_data)) + return; + if (all) + del_timer_sync(&__get_cpu_var(mce_timer)); + cmci_clear(); +} + +static void mce_enable_ce(void *all) +{ + if (!mce_available(¤t_cpu_data)) + return; + cmci_reenable(); + cmci_recheck(); + if (all) + mce_init_timer(); +} + +static struct sysdev_class mce_sysclass = { + .suspend = mce_suspend, + .shutdown = mce_shutdown, + .resume = mce_resume, + .name = "machinecheck", +}; + +DEFINE_PER_CPU(struct sys_device, mce_dev); + +__cpuinitdata +void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); + +static struct sysdev_attribute *bank_attrs; + +static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, + char *buf) +{ + u64 b = bank[attr - bank_attrs]; + + return sprintf(buf, "%llx\n", b); +} + +static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, + const char *buf, size_t size) +{ + u64 new; + + if (strict_strtoull(buf, 0, &new) < 0) + return -EINVAL; + + bank[attr - bank_attrs] = new; + mce_restart(); + + return size; +} + +static ssize_t +show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) +{ + strcpy(buf, mce_helper); + strcat(buf, "\n"); + return strlen(mce_helper) + 1; +} + +static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, + const char *buf, size_t siz) +{ + char *p; + + strncpy(mce_helper, buf, sizeof(mce_helper)); + mce_helper[sizeof(mce_helper)-1] = 0; + p = strchr(mce_helper, '\n'); + + if (p) + *p = 0; + + return strlen(mce_helper) + !!p; +} + +static ssize_t set_ignore_ce(struct sys_device *s, + struct sysdev_attribute *attr, + const char *buf, size_t size) +{ + u64 new; + + if (strict_strtoull(buf, 0, &new) < 0) + return -EINVAL; + + if (mce_ignore_ce ^ !!new) { + if (new) { + /* disable ce features */ + on_each_cpu(mce_disable_ce, (void *)1, 1); + mce_ignore_ce = 1; + } else { + /* enable ce features */ + mce_ignore_ce = 0; + on_each_cpu(mce_enable_ce, (void *)1, 1); + } + } + return size; +} + +static ssize_t set_cmci_disabled(struct sys_device *s, + struct sysdev_attribute *attr, + const char *buf, size_t size) +{ + u64 new; + + if (strict_strtoull(buf, 0, &new) < 0) + return -EINVAL; + + if (mce_cmci_disabled ^ !!new) { + if (new) { + /* disable cmci */ + on_each_cpu(mce_disable_ce, NULL, 1); + mce_cmci_disabled = 1; + } else { + /* enable cmci */ + mce_cmci_disabled = 0; + on_each_cpu(mce_enable_ce, NULL, 1); + } + } + return size; +} + +static ssize_t store_int_with_restart(struct sys_device *s, + struct sysdev_attribute *attr, + const char *buf, size_t size) +{ + ssize_t ret = sysdev_store_int(s, attr, buf, size); + mce_restart(); + return ret; +} + +static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); +static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); +static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); +static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); + +static struct sysdev_ext_attribute attr_check_interval = { + _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, + store_int_with_restart), + &check_interval +}; + +static struct sysdev_ext_attribute attr_ignore_ce = { + _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce), + &mce_ignore_ce +}; + +static struct sysdev_ext_attribute attr_cmci_disabled = { + _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled), + &mce_cmci_disabled +}; + +static struct sysdev_attribute *mce_attrs[] = { + &attr_tolerant.attr, + &attr_check_interval.attr, + &attr_trigger, + &attr_monarch_timeout.attr, + &attr_dont_log_ce.attr, + &attr_ignore_ce.attr, + &attr_cmci_disabled.attr, + NULL +}; + +static cpumask_var_t mce_dev_initialized; + +/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ +static __cpuinit int mce_create_device(unsigned int cpu) +{ + int err; + int i, j; + + if (!mce_available(&boot_cpu_data)) + return -EIO; + + memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject)); + per_cpu(mce_dev, cpu).id = cpu; + per_cpu(mce_dev, cpu).cls = &mce_sysclass; + + err = sysdev_register(&per_cpu(mce_dev, cpu)); + if (err) + return err; + + for (i = 0; mce_attrs[i]; i++) { + err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + if (err) + goto error; + } + for (j = 0; j < banks; j++) { + err = sysdev_create_file(&per_cpu(mce_dev, cpu), + &bank_attrs[j]); + if (err) + goto error2; + } + cpumask_set_cpu(cpu, mce_dev_initialized); + + return 0; +error2: + while (--j >= 0) + sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]); +error: + while (--i >= 0) + sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + + sysdev_unregister(&per_cpu(mce_dev, cpu)); + + return err; +} + +static __cpuinit void mce_remove_device(unsigned int cpu) +{ + int i; + + if (!cpumask_test_cpu(cpu, mce_dev_initialized)) + return; + + for (i = 0; mce_attrs[i]; i++) + sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + + for (i = 0; i < banks; i++) + sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); + + sysdev_unregister(&per_cpu(mce_dev, cpu)); + cpumask_clear_cpu(cpu, mce_dev_initialized); +} + +/* Make sure there are no machine checks on offlined CPUs. */ +static void mce_disable_cpu(void *h) +{ + unsigned long action = *(unsigned long *)h; + int i; + + if (!mce_available(¤t_cpu_data)) + return; + if (!(action & CPU_TASKS_FROZEN)) + cmci_clear(); + for (i = 0; i < banks; i++) { + if (!skip_bank_init(i)) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + } +} + +static void mce_reenable_cpu(void *h) +{ + unsigned long action = *(unsigned long *)h; + int i; + + if (!mce_available(¤t_cpu_data)) + return; + + if (!(action & CPU_TASKS_FROZEN)) + cmci_reenable(); + for (i = 0; i < banks; i++) { + if (!skip_bank_init(i)) + wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); + } +} + +/* Get notified when a cpu comes on/off. Be hotplug friendly. */ +static int __cpuinit +mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct timer_list *t = &per_cpu(mce_timer, cpu); + + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + mce_create_device(cpu); + if (threshold_cpu_callback) + threshold_cpu_callback(action, cpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + if (threshold_cpu_callback) + threshold_cpu_callback(action, cpu); + mce_remove_device(cpu); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + del_timer_sync(t); + smp_call_function_single(cpu, mce_disable_cpu, &action, 1); + break; + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + t->expires = round_jiffies(jiffies + + __get_cpu_var(next_interval)); + add_timer_on(t, cpu); + smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); + break; + case CPU_POST_DEAD: + /* intentionally ignoring frozen here */ + cmci_rediscover(cpu); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block mce_cpu_notifier __cpuinitdata = { + .notifier_call = mce_cpu_callback, +}; + +static __init int mce_init_banks(void) +{ + int i; + + bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, + GFP_KERNEL); + if (!bank_attrs) + return -ENOMEM; + + for (i = 0; i < banks; i++) { + struct sysdev_attribute *a = &bank_attrs[i]; + + a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); + if (!a->attr.name) + goto nomem; + + a->attr.mode = 0644; + a->show = show_bank; + a->store = set_bank; + } + return 0; + +nomem: + while (--i >= 0) + kfree(bank_attrs[i].attr.name); + kfree(bank_attrs); + bank_attrs = NULL; + + return -ENOMEM; +} + +static __init int mce_init_device(void) +{ + int err; + int i = 0; + + if (!mce_available(&boot_cpu_data)) + return -EIO; + + zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); + + err = mce_init_banks(); + if (err) + return err; + + err = sysdev_class_register(&mce_sysclass); + if (err) + return err; + + for_each_online_cpu(i) { + err = mce_create_device(i); + if (err) + return err; + } + + register_hotcpu_notifier(&mce_cpu_notifier); + misc_register(&mce_log_device); + + return err; +} + +device_initcall(mce_init_device); + +#else /* CONFIG_X86_OLD_MCE: */ + +int nr_mce_banks; +EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ + +/* This has to be run for each processor */ +void mcheck_init(struct cpuinfo_x86 *c) +{ + if (mce_disabled) + return; + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + amd_mcheck_init(c); + break; + + case X86_VENDOR_INTEL: + if (c->x86 == 5) + intel_p5_mcheck_init(c); + if (c->x86 == 6) + intel_p6_mcheck_init(c); + if (c->x86 == 15) + intel_p4_mcheck_init(c); + break; + + case X86_VENDOR_CENTAUR: + if (c->x86 == 5) + winchip_mcheck_init(c); + break; + + default: + break; + } + printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); +} + +static int __init mcheck_enable(char *str) +{ + mce_p5_enabled = 1; + return 1; +} +__setup("mce", mcheck_enable); + +#endif /* CONFIG_X86_OLD_MCE */ + +/* + * Old style boot options parsing. Only for compatibility. + */ +static int __init mcheck_disable(char *str) +{ + mce_disabled = 1; + return 1; +} +__setup("nomce", mcheck_disable); diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h deleted file mode 100644 index ae9f628838f..00000000000 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ /dev/null @@ -1,14 +0,0 @@ -#include <linux/init.h> -#include <asm/mce.h> - -void amd_mcheck_init(struct cpuinfo_x86 *c); -void intel_p4_mcheck_init(struct cpuinfo_x86 *c); -void intel_p5_mcheck_init(struct cpuinfo_x86 *c); -void intel_p6_mcheck_init(struct cpuinfo_x86 *c); -void winchip_mcheck_init(struct cpuinfo_x86 *c); - -/* Call the installed machine check handler for this CPU setup. */ -extern void (*machine_check_vector)(struct pt_regs *, long error_code); - -extern int nr_mce_banks; - diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c deleted file mode 100644 index 3552119b091..00000000000 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ /dev/null @@ -1,76 +0,0 @@ -/* - * mce.c - x86 Machine Check Exception Reporting - * (c) 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>, Dave Jones <davej@redhat.com> - */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/smp.h> -#include <linux/thread_info.h> - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/mce.h> - -#include "mce.h" - -int mce_disabled; -int nr_mce_banks; - -EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ - -/* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) -{ - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id()); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; - -/* This has to be run for each processor */ -void mcheck_init(struct cpuinfo_x86 *c) -{ - if (mce_disabled == 1) - return; - - switch (c->x86_vendor) { - case X86_VENDOR_AMD: - amd_mcheck_init(c); - break; - - case X86_VENDOR_INTEL: - if (c->x86 == 5) - intel_p5_mcheck_init(c); - if (c->x86 == 6) - intel_p6_mcheck_init(c); - if (c->x86 == 15) - intel_p4_mcheck_init(c); - break; - - case X86_VENDOR_CENTAUR: - if (c->x86 == 5) - winchip_mcheck_init(c); - break; - - default: - break; - } -} - -static int __init mcheck_disable(char *str) -{ - mce_disabled = 1; - return 1; -} - -static int __init mcheck_enable(char *str) -{ - mce_disabled = -1; - return 1; -} - -__setup("nomce", mcheck_disable); -__setup("mce", mcheck_enable); diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c deleted file mode 100644 index 09dd1d414fc..00000000000 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ /dev/null @@ -1,1187 +0,0 @@ -/* - * Machine check handler. - * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. - * Rest from unknown author(s). - * 2004 Andi Kleen. Rewrote most of it. - * Copyright 2008 Intel Corporation - * Author: Andi Kleen - */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/smp_lock.h> -#include <linux/string.h> -#include <linux/rcupdate.h> -#include <linux/kallsyms.h> -#include <linux/sysdev.h> -#include <linux/miscdevice.h> -#include <linux/fs.h> -#include <linux/capability.h> -#include <linux/cpu.h> -#include <linux/percpu.h> -#include <linux/poll.h> -#include <linux/thread_info.h> -#include <linux/ctype.h> -#include <linux/kmod.h> -#include <linux/kdebug.h> -#include <linux/kobject.h> -#include <linux/sysfs.h> -#include <linux/ratelimit.h> -#include <asm/processor.h> -#include <asm/msr.h> -#include <asm/mce.h> -#include <asm/uaccess.h> -#include <asm/smp.h> -#include <asm/idle.h> - -#define MISC_MCELOG_MINOR 227 - -atomic_t mce_entry; - -static int mce_dont_init; - -/* - * Tolerant levels: - * 0: always panic on uncorrected errors, log corrected errors - * 1: panic or SIGBUS on uncorrected errors, log corrected errors - * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors - * 3: never panic or SIGBUS, log all errors (for testing only) - */ -static int tolerant = 1; -static int banks; -static u64 *bank; -static unsigned long notify_user; -static int rip_msr; -static int mce_bootlog = -1; -static atomic_t mce_events; - -static char trigger[128]; -static char *trigger_argv[2] = { trigger, NULL }; - -static DECLARE_WAIT_QUEUE_HEAD(mce_wait); - -/* MCA banks polled by the period polling timer for corrected events */ -DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { - [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL -}; - -/* Do initial initialization of a struct mce */ -void mce_setup(struct mce *m) -{ - memset(m, 0, sizeof(struct mce)); - m->cpu = smp_processor_id(); - rdtscll(m->tsc); -} - -/* - * Lockless MCE logging infrastructure. - * This avoids deadlocks on printk locks without having to break locks. Also - * separate MCEs from kernel messages to avoid bogus bug reports. - */ - -static struct mce_log mcelog = { - MCE_LOG_SIGNATURE, - MCE_LOG_LEN, -}; - -void mce_log(struct mce *mce) -{ - unsigned next, entry; - atomic_inc(&mce_events); - mce->finished = 0; - wmb(); - for (;;) { - entry = rcu_dereference(mcelog.next); - for (;;) { - /* When the buffer fills up discard new entries. Assume - that the earlier errors are the more interesting. */ - if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); - return; - } - /* Old left over entry. Skip. */ - if (mcelog.entry[entry].finished) { - entry++; - continue; - } - break; - } - smp_rmb(); - next = entry + 1; - if (cmpxchg(&mcelog.next, entry, next) == entry) - break; - } - memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); - wmb(); - mcelog.entry[entry].finished = 1; - wmb(); - - set_bit(0, ¬ify_user); -} - -static void print_mce(struct mce *m) -{ - printk(KERN_EMERG "\n" - KERN_EMERG "HARDWARE ERROR\n" - KERN_EMERG - "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", - m->cpu, m->mcgstatus, m->bank, m->status); - if (m->ip) { - printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", - !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", - m->cs, m->ip); - if (m->cs == __KERNEL_CS) - print_symbol("{%s}", m->ip); - printk("\n"); - } - printk(KERN_EMERG "TSC %llx ", m->tsc); - if (m->addr) - printk("ADDR %llx ", m->addr); - if (m->misc) - printk("MISC %llx ", m->misc); - printk("\n"); - printk(KERN_EMERG "This is not a software problem!\n"); - printk(KERN_EMERG "Run through mcelog --ascii to decode " - "and contact your hardware vendor\n"); -} - -static void mce_panic(char *msg, struct mce *backup, unsigned long start) -{ - int i; - - oops_begin(); - for (i = 0; i < MCE_LOG_LEN; i++) { - unsigned long tsc = mcelog.entry[i].tsc; - - if (time_before(tsc, start)) - continue; - print_mce(&mcelog.entry[i]); - if (backup && mcelog.entry[i].tsc == backup->tsc) - backup = NULL; - } - if (backup) - print_mce(backup); - panic(msg); -} - -int mce_available(struct cpuinfo_x86 *c) -{ - if (mce_dont_init) - return 0; - return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); -} - -static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) -{ - if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { - m->ip = regs->ip; - m->cs = regs->cs; - } else { - m->ip = 0; - m->cs = 0; - } - if (rip_msr) { - /* Assume the RIP in the MSR is exact. Is this true? */ - m->mcgstatus |= MCG_STATUS_EIPV; - rdmsrl(rip_msr, m->ip); - m->cs = 0; - } -} - -/* - * Poll for corrected events or events that happened before reset. - * Those are just logged through /dev/mcelog. - * - * This is executed in standard interrupt context. - */ -void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) -{ - struct mce m; - int i; - - mce_setup(&m); - - rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); - for (i = 0; i < banks; i++) { - if (!bank[i] || !test_bit(i, *b)) - continue; - - m.misc = 0; - m.addr = 0; - m.bank = i; - m.tsc = 0; - - barrier(); - rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); - if (!(m.status & MCI_STATUS_VAL)) - continue; - - /* - * Uncorrected events are handled by the exception handler - * when it is enabled. But when the exception is disabled log - * everything. - * - * TBD do the same check for MCI_STATUS_EN here? - */ - if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) - continue; - - if (m.status & MCI_STATUS_MISCV) - rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); - if (m.status & MCI_STATUS_ADDRV) - rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); - - if (!(flags & MCP_TIMESTAMP)) - m.tsc = 0; - /* - * Don't get the IP here because it's unlikely to - * have anything to do with the actual error location. - */ - if (!(flags & MCP_DONTLOG)) { - mce_log(&m); - add_taint(TAINT_MACHINE_CHECK); - } - - /* - * Clear state for this bank. - */ - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); - } - - /* - * Don't clear MCG_STATUS here because it's only defined for - * exceptions. - */ -} - -/* - * The actual machine check handler. This only handles real - * exceptions when something got corrupted coming in through int 18. - * - * This is executed in NMI context not subject to normal locking rules. This - * implies that most kernel services cannot be safely used. Don't even - * think about putting a printk in there! - */ -void do_machine_check(struct pt_regs * regs, long error_code) -{ - struct mce m, panicm; - u64 mcestart = 0; - int i; - int panicm_found = 0; - /* - * If no_way_out gets set, there is no safe way to recover from this - * MCE. If tolerant is cranked up, we'll try anyway. - */ - int no_way_out = 0; - /* - * If kill_it gets set, there might be a way to recover from this - * error. - */ - int kill_it = 0; - DECLARE_BITMAP(toclear, MAX_NR_BANKS); - - atomic_inc(&mce_entry); - - if (notify_die(DIE_NMI, "machine check", regs, error_code, - 18, SIGKILL) == NOTIFY_STOP) - goto out2; - if (!banks) - goto out2; - - mce_setup(&m); - - rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); - /* if the restart IP is not valid, we're done for */ - if (!(m.mcgstatus & MCG_STATUS_RIPV)) - no_way_out = 1; - - rdtscll(mcestart); - barrier(); - - for (i = 0; i < banks; i++) { - __clear_bit(i, toclear); - if (!bank[i]) - continue; - - m.misc = 0; - m.addr = 0; - m.bank = i; - - rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); - if ((m.status & MCI_STATUS_VAL) == 0) - continue; - - /* - * Non uncorrected errors are handled by machine_check_poll - * Leave them alone. - */ - if ((m.status & MCI_STATUS_UC) == 0) - continue; - - /* - * Set taint even when machine check was not enabled. - */ - add_taint(TAINT_MACHINE_CHECK); - - __set_bit(i, toclear); - - if (m.status & MCI_STATUS_EN) { - /* if PCC was set, there's no way out */ - no_way_out |= !!(m.status & MCI_STATUS_PCC); - /* - * If this error was uncorrectable and there was - * an overflow, we're in trouble. If no overflow, - * we might get away with just killing a task. - */ - if (m.status & MCI_STATUS_UC) { - if (tolerant < 1 || m.status & MCI_STATUS_OVER) - no_way_out = 1; - kill_it = 1; - } - } else { - /* - * Machine check event was not enabled. Clear, but - * ignore. - */ - continue; - } - - if (m.status & MCI_STATUS_MISCV) - rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); - if (m.status & MCI_STATUS_ADDRV) - rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); - - mce_get_rip(&m, regs); - mce_log(&m); - - /* Did this bank cause the exception? */ - /* Assume that the bank with uncorrectable errors did it, - and that there is only a single one. */ - if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) { - panicm = m; - panicm_found = 1; - } - } - - /* If we didn't find an uncorrectable error, pick - the last one (shouldn't happen, just being safe). */ - if (!panicm_found) - panicm = m; - - /* - * If we have decided that we just CAN'T continue, and the user - * has not set tolerant to an insane level, give up and die. - */ - if (no_way_out && tolerant < 3) - mce_panic("Machine check", &panicm, mcestart); - - /* - * If the error seems to be unrecoverable, something should be - * done. Try to kill as little as possible. If we can kill just - * one task, do that. If the user has set the tolerance very - * high, don't try to do anything at all. - */ - if (kill_it && tolerant < 3) { - int user_space = 0; - - /* - * If the EIPV bit is set, it means the saved IP is the - * instruction which caused the MCE. - */ - if (m.mcgstatus & MCG_STATUS_EIPV) - user_space = panicm.ip && (panicm.cs & 3); - - /* - * If we know that the error was in user space, send a - * SIGBUS. Otherwise, panic if tolerance is low. - * - * force_sig() takes an awful lot of locks and has a slight - * risk of deadlocking. - */ - if (user_space) { - force_sig(SIGBUS, current); - } else if (panic_on_oops || tolerant < 2) { - mce_panic("Uncorrected machine check", - &panicm, mcestart); - } - } - - /* notify userspace ASAP */ - set_thread_flag(TIF_MCE_NOTIFY); - - /* the last thing we do is clear state */ - for (i = 0; i < banks; i++) { - if (test_bit(i, toclear)) - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); - } - wrmsrl(MSR_IA32_MCG_STATUS, 0); - out2: - atomic_dec(&mce_entry); -} - -#ifdef CONFIG_X86_MCE_INTEL -/*** - * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog - * @cpu: The CPU on which the event occurred. - * @status: Event status information - * - * This function should be called by the thermal interrupt after the - * event has been processed and the decision was made to log the event - * further. - * - * The status parameter will be saved to the 'status' field of 'struct mce' - * and historically has been the register value of the - * MSR_IA32_THERMAL_STATUS (Intel) msr. - */ -void mce_log_therm_throt_event(__u64 status) -{ - struct mce m; - - mce_setup(&m); - m.bank = MCE_THERMAL_BANK; - m.status = status; - mce_log(&m); -} -#endif /* CONFIG_X86_MCE_INTEL */ - -/* - * Periodic polling timer for "silent" machine check errors. If the - * poller finds an MCE, poll 2x faster. When the poller finds no more - * errors, poll 2x slower (up to check_interval seconds). - */ - -static int check_interval = 5 * 60; /* 5 minutes */ -static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ -static void mcheck_timer(unsigned long); -static DEFINE_PER_CPU(struct timer_list, mce_timer); - -static void mcheck_timer(unsigned long data) -{ - struct timer_list *t = &per_cpu(mce_timer, data); - int *n; - - WARN_ON(smp_processor_id() != data); - - if (mce_available(¤t_cpu_data)) - machine_check_poll(MCP_TIMESTAMP, - &__get_cpu_var(mce_poll_banks)); - - /* - * Alert userspace if needed. If we logged an MCE, reduce the - * polling interval, otherwise increase the polling interval. - */ - n = &__get_cpu_var(next_interval); - if (mce_notify_user()) { - *n = max(*n/2, HZ/100); - } else { - *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); - } - - t->expires = jiffies + *n; - add_timer(t); -} - -static void mce_do_trigger(struct work_struct *work) -{ - call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); -} - -static DECLARE_WORK(mce_trigger_work, mce_do_trigger); - -/* - * Notify the user(s) about new machine check events. - * Can be called from interrupt context, but not from machine check/NMI - * context. - */ -int mce_notify_user(void) -{ - /* Not more than two messages every minute */ - static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - - clear_thread_flag(TIF_MCE_NOTIFY); - if (test_and_clear_bit(0, ¬ify_user)) { - wake_up_interruptible(&mce_wait); - - /* - * There is no risk of missing notifications because - * work_pending is always cleared before the function is - * executed. - */ - if (trigger[0] && !work_pending(&mce_trigger_work)) - schedule_work(&mce_trigger_work); - - if (__ratelimit(&ratelimit)) - printk(KERN_INFO "Machine check events logged\n"); - - return 1; - } - return 0; -} - -/* see if the idle task needs to notify userspace */ -static int -mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk) -{ - /* IDLE_END should be safe - interrupts are back on */ - if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) - mce_notify_user(); - - return NOTIFY_OK; -} - -static struct notifier_block mce_idle_notifier = { - .notifier_call = mce_idle_callback, -}; - -static __init int periodic_mcheck_init(void) -{ - idle_notifier_register(&mce_idle_notifier); - return 0; -} -__initcall(periodic_mcheck_init); - -/* - * Initialize Machine Checks for a CPU. - */ -static int mce_cap_init(void) -{ - u64 cap; - unsigned b; - - rdmsrl(MSR_IA32_MCG_CAP, cap); - b = cap & 0xff; - if (b > MAX_NR_BANKS) { - printk(KERN_WARNING - "MCE: Using only %u machine check banks out of %u\n", - MAX_NR_BANKS, b); - b = MAX_NR_BANKS; - } - - /* Don't support asymmetric configurations today */ - WARN_ON(banks != 0 && b != banks); - banks = b; - if (!bank) { - bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); - if (!bank) - return -ENOMEM; - memset(bank, 0xff, banks * sizeof(u64)); - } - - /* Use accurate RIP reporting if available. */ - if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) - rip_msr = MSR_IA32_MCG_EIP; - - return 0; -} - -static void mce_init(void *dummy) -{ - u64 cap; - int i; - mce_banks_t all_banks; - - /* - * Log the machine checks left over from the previous reset. - */ - bitmap_fill(all_banks, MAX_NR_BANKS); - machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); - - set_in_cr4(X86_CR4_MCE); - - rdmsrl(MSR_IA32_MCG_CAP, cap); - if (cap & MCG_CTL_P) - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - - for (i = 0; i < banks; i++) { - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); - } -} - -/* Add per CPU specific workarounds here */ -static void mce_cpu_quirks(struct cpuinfo_x86 *c) -{ - /* This should be disabled by the BIOS, but isn't always */ - if (c->x86_vendor == X86_VENDOR_AMD) { - if (c->x86 == 15 && banks > 4) - /* disable GART TBL walk error reporting, which trips off - incorrectly with the IOMMU & 3ware & Cerberus. */ - clear_bit(10, (unsigned long *)&bank[4]); - if(c->x86 <= 17 && mce_bootlog < 0) - /* Lots of broken BIOS around that don't clear them - by default and leave crap in there. Don't log. */ - mce_bootlog = 0; - } - -} - -static void mce_cpu_features(struct cpuinfo_x86 *c) -{ - switch (c->x86_vendor) { - case X86_VENDOR_INTEL: - mce_intel_feature_init(c); - break; - case X86_VENDOR_AMD: - mce_amd_feature_init(c); - break; - default: - break; - } -} - -static void mce_init_timer(void) -{ - struct timer_list *t = &__get_cpu_var(mce_timer); - int *n = &__get_cpu_var(next_interval); - - *n = check_interval * HZ; - if (!*n) - return; - setup_timer(t, mcheck_timer, smp_processor_id()); - t->expires = round_jiffies(jiffies + *n); - add_timer(t); -} - -/* - * Called for each booted CPU to set up machine checks. - * Must be called with preempt off. - */ -void __cpuinit mcheck_init(struct cpuinfo_x86 *c) -{ - if (!mce_available(c)) - return; - - if (mce_cap_init() < 0) { - mce_dont_init = 1; - return; - } - mce_cpu_quirks(c); - - mce_init(NULL); - mce_cpu_features(c); - mce_init_timer(); -} - -/* - * Character device to read and clear the MCE log. - */ - -static DEFINE_SPINLOCK(mce_state_lock); -static int open_count; /* #times opened */ -static int open_exclu; /* already open exclusive? */ - -static int mce_open(struct inode *inode, struct file *file) -{ - lock_kernel(); - spin_lock(&mce_state_lock); - - if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { - spin_unlock(&mce_state_lock); - unlock_kernel(); - return -EBUSY; - } - - if (file->f_flags & O_EXCL) - open_exclu = 1; - open_count++; - - spin_unlock(&mce_state_lock); - unlock_kernel(); - - return nonseekable_open(inode, file); -} - -static int mce_release(struct inode *inode, struct file *file) -{ - spin_lock(&mce_state_lock); - - open_count--; - open_exclu = 0; - - spin_unlock(&mce_state_lock); - - return 0; -} - -static void collect_tscs(void *data) -{ - unsigned long *cpu_tsc = (unsigned long *)data; - - rdtscll(cpu_tsc[smp_processor_id()]); -} - -static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, - loff_t *off) -{ - unsigned long *cpu_tsc; - static DEFINE_MUTEX(mce_read_mutex); - unsigned prev, next; - char __user *buf = ubuf; - int i, err; - - cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); - if (!cpu_tsc) - return -ENOMEM; - - mutex_lock(&mce_read_mutex); - next = rcu_dereference(mcelog.next); - - /* Only supports full reads right now */ - if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { - mutex_unlock(&mce_read_mutex); - kfree(cpu_tsc); - return -EINVAL; - } - - err = 0; - prev = 0; - do { - for (i = prev; i < next; i++) { - unsigned long start = jiffies; - - while (!mcelog.entry[i].finished) { - if (time_after_eq(jiffies, start + 2)) { - memset(mcelog.entry + i, 0, - sizeof(struct mce)); - goto timeout; - } - cpu_relax(); - } - smp_rmb(); - err |= copy_to_user(buf, mcelog.entry + i, - sizeof(struct mce)); - buf += sizeof(struct mce); -timeout: - ; - } - - memset(mcelog.entry + prev, 0, - (next - prev) * sizeof(struct mce)); - prev = next; - next = cmpxchg(&mcelog.next, prev, 0); - } while (next != prev); - - synchronize_sched(); - - /* - * Collect entries that were still getting written before the - * synchronize. - */ - on_each_cpu(collect_tscs, cpu_tsc, 1); - for (i = next; i < MCE_LOG_LEN; i++) { - if (mcelog.entry[i].finished && - mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { - err |= copy_to_user(buf, mcelog.entry+i, - sizeof(struct mce)); - smp_rmb(); - buf += sizeof(struct mce); - memset(&mcelog.entry[i], 0, sizeof(struct mce)); - } - } - mutex_unlock(&mce_read_mutex); - kfree(cpu_tsc); - return err ? -EFAULT : buf - ubuf; -} - -static unsigned int mce_poll(struct file *file, poll_table *wait) -{ - poll_wait(file, &mce_wait, wait); - if (rcu_dereference(mcelog.next)) - return POLLIN | POLLRDNORM; - return 0; -} - -static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) -{ - int __user *p = (int __user *)arg; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - switch (cmd) { - case MCE_GET_RECORD_LEN: - return put_user(sizeof(struct mce), p); - case MCE_GET_LOG_LEN: - return put_user(MCE_LOG_LEN, p); - case MCE_GETCLEAR_FLAGS: { - unsigned flags; - - do { - flags = mcelog.flags; - } while (cmpxchg(&mcelog.flags, flags, 0) != flags); - return put_user(flags, p); - } - default: - return -ENOTTY; - } -} - -static const struct file_operations mce_chrdev_ops = { - .open = mce_open, - .release = mce_release, - .read = mce_read, - .poll = mce_poll, - .unlocked_ioctl = mce_ioctl, -}; - -static struct miscdevice mce_log_device = { - MISC_MCELOG_MINOR, - "mcelog", - &mce_chrdev_ops, -}; - -/* - * Old style boot options parsing. Only for compatibility. - */ -static int __init mcheck_disable(char *str) -{ - mce_dont_init = 1; - return 1; -} - -/* mce=off disables machine check. - mce=TOLERANCELEVEL (number, see above) - mce=bootlog Log MCEs from before booting. Disabled by default on AMD. - mce=nobootlog Don't log MCEs from before booting. */ -static int __init mcheck_enable(char *str) -{ - if (!strcmp(str, "off")) - mce_dont_init = 1; - else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog")) - mce_bootlog = str[0] == 'b'; - else if (isdigit(str[0])) - get_option(&str, &tolerant); - else - printk("mce= argument %s ignored. Please use /sys", str); - return 1; -} - -__setup("nomce", mcheck_disable); -__setup("mce=", mcheck_enable); - -/* - * Sysfs support - */ - -/* - * Disable machine checks on suspend and shutdown. We can't really handle - * them later. - */ -static int mce_disable(void) -{ - int i; - - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); - return 0; -} - -static int mce_suspend(struct sys_device *dev, pm_message_t state) -{ - return mce_disable(); -} - -static int mce_shutdown(struct sys_device *dev) -{ - return mce_disable(); -} - -/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. - Only one CPU is active at this time, the others get readded later using - CPU hotplug. */ -static int mce_resume(struct sys_device *dev) -{ - mce_init(NULL); - mce_cpu_features(¤t_cpu_data); - return 0; -} - -static void mce_cpu_restart(void *data) -{ - del_timer_sync(&__get_cpu_var(mce_timer)); - if (mce_available(¤t_cpu_data)) - mce_init(NULL); - mce_init_timer(); -} - -/* Reinit MCEs after user configuration changes */ -static void mce_restart(void) -{ - on_each_cpu(mce_cpu_restart, NULL, 1); -} - -static struct sysdev_class mce_sysclass = { - .suspend = mce_suspend, - .shutdown = mce_shutdown, - .resume = mce_resume, - .name = "machinecheck", -}; - -DEFINE_PER_CPU(struct sys_device, device_mce); -void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata; - -/* Why are there no generic functions for this? */ -#define ACCESSOR(name, var, start) \ - static ssize_t show_ ## name(struct sys_device *s, \ - struct sysdev_attribute *attr, \ - char *buf) { \ - return sprintf(buf, "%lx\n", (unsigned long)var); \ - } \ - static ssize_t set_ ## name(struct sys_device *s, \ - struct sysdev_attribute *attr, \ - const char *buf, size_t siz) { \ - char *end; \ - unsigned long new = simple_strtoul(buf, &end, 0); \ - if (end == buf) return -EINVAL; \ - var = new; \ - start; \ - return end-buf; \ - } \ - static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); - -static struct sysdev_attribute *bank_attrs; - -static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, - char *buf) -{ - u64 b = bank[attr - bank_attrs]; - return sprintf(buf, "%llx\n", b); -} - -static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, - const char *buf, size_t siz) -{ - char *end; - u64 new = simple_strtoull(buf, &end, 0); - if (end == buf) - return -EINVAL; - bank[attr - bank_attrs] = new; - mce_restart(); - return end-buf; -} - -static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, - char *buf) -{ - strcpy(buf, trigger); - strcat(buf, "\n"); - return strlen(trigger) + 1; -} - -static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, - const char *buf,size_t siz) -{ - char *p; - int len; - strncpy(trigger, buf, sizeof(trigger)); - trigger[sizeof(trigger)-1] = 0; - len = strlen(trigger); - p = strchr(trigger, '\n'); - if (*p) *p = 0; - return len; -} - -static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); -static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); -ACCESSOR(check_interval,check_interval,mce_restart()) -static struct sysdev_attribute *mce_attributes[] = { - &attr_tolerant.attr, &attr_check_interval, &attr_trigger, - NULL -}; - -static cpumask_var_t mce_device_initialized; - -/* Per cpu sysdev init. All of the cpus still share the same ctl bank */ -static __cpuinit int mce_create_device(unsigned int cpu) -{ - int err; - int i; - - if (!mce_available(&boot_cpu_data)) - return -EIO; - - memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); - per_cpu(device_mce,cpu).id = cpu; - per_cpu(device_mce,cpu).cls = &mce_sysclass; - - err = sysdev_register(&per_cpu(device_mce,cpu)); - if (err) - return err; - - for (i = 0; mce_attributes[i]; i++) { - err = sysdev_create_file(&per_cpu(device_mce,cpu), - mce_attributes[i]); - if (err) - goto error; - } - for (i = 0; i < banks; i++) { - err = sysdev_create_file(&per_cpu(device_mce, cpu), - &bank_attrs[i]); - if (err) - goto error2; - } - cpumask_set_cpu(cpu, mce_device_initialized); - - return 0; -error2: - while (--i >= 0) { - sysdev_remove_file(&per_cpu(device_mce, cpu), - &bank_attrs[i]); - } -error: - while (--i >= 0) { - sysdev_remove_file(&per_cpu(device_mce,cpu), - mce_attributes[i]); - } - sysdev_unregister(&per_cpu(device_mce,cpu)); - - return err; -} - -static __cpuinit void mce_remove_device(unsigned int cpu) -{ - int i; - - if (!cpumask_test_cpu(cpu, mce_device_initialized)) - return; - - for (i = 0; mce_attributes[i]; i++) - sysdev_remove_file(&per_cpu(device_mce,cpu), - mce_attributes[i]); - for (i = 0; i < banks; i++) - sysdev_remove_file(&per_cpu(device_mce, cpu), - &bank_attrs[i]); - sysdev_unregister(&per_cpu(device_mce,cpu)); - cpumask_clear_cpu(cpu, mce_device_initialized); -} - -/* Make sure there are no machine checks on offlined CPUs. */ -static void mce_disable_cpu(void *h) -{ - int i; - unsigned long action = *(unsigned long *)h; - - if (!mce_available(¤t_cpu_data)) - return; - if (!(action & CPU_TASKS_FROZEN)) - cmci_clear(); - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); -} - -static void mce_reenable_cpu(void *h) -{ - int i; - unsigned long action = *(unsigned long *)h; - - if (!mce_available(¤t_cpu_data)) - return; - if (!(action & CPU_TASKS_FROZEN)) - cmci_reenable(); - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); -} - -/* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - struct timer_list *t = &per_cpu(mce_timer, cpu); - - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - mce_create_device(cpu); - if (threshold_cpu_callback) - threshold_cpu_callback(action, cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - if (threshold_cpu_callback) - threshold_cpu_callback(action, cpu); - mce_remove_device(cpu); - break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - del_timer_sync(t); - smp_call_function_single(cpu, mce_disable_cpu, &action, 1); - break; - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - t->expires = round_jiffies(jiffies + - __get_cpu_var(next_interval)); - add_timer_on(t, cpu); - smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); - break; - case CPU_POST_DEAD: - /* intentionally ignoring frozen here */ - cmci_rediscover(cpu); - break; - } - return NOTIFY_OK; -} - -static struct notifier_block mce_cpu_notifier __cpuinitdata = { - .notifier_call = mce_cpu_callback, -}; - -static __init int mce_init_banks(void) -{ - int i; - - bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, - GFP_KERNEL); - if (!bank_attrs) - return -ENOMEM; - - for (i = 0; i < banks; i++) { - struct sysdev_attribute *a = &bank_attrs[i]; - a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); - if (!a->attr.name) - goto nomem; - a->attr.mode = 0644; - a->show = show_bank; - a->store = set_bank; - } - return 0; - -nomem: - while (--i >= 0) - kfree(bank_attrs[i].attr.name); - kfree(bank_attrs); - bank_attrs = NULL; - return -ENOMEM; -} - -static __init int mce_init_device(void) -{ - int err; - int i = 0; - - if (!mce_available(&boot_cpu_data)) - return -EIO; - - zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); - - err = mce_init_banks(); - if (err) - return err; - - err = sysdev_class_register(&mce_sysclass); - if (err) - return err; - - for_each_online_cpu(i) { - err = mce_create_device(i); - if (err) - return err; - } - - register_hotcpu_notifier(&mce_cpu_notifier); - misc_register(&mce_log_device); - return err; -} - -device_initcall(mce_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 56dde9c4bc9..ddae21620bd 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -13,22 +13,22 @@ * * All MC4_MISCi registers are shared between multi-cores */ - -#include <linux/cpu.h> -#include <linux/errno.h> -#include <linux/init.h> #include <linux/interrupt.h> -#include <linux/kobject.h> #include <linux/notifier.h> -#include <linux/sched.h> -#include <linux/smp.h> +#include <linux/kobject.h> +#include <linux/percpu.h> #include <linux/sysdev.h> +#include <linux/errno.h> +#include <linux/sched.h> #include <linux/sysfs.h> +#include <linux/init.h> +#include <linux/cpu.h> +#include <linux/smp.h> + #include <asm/apic.h> +#include <asm/idle.h> #include <asm/mce.h> #include <asm/msr.h> -#include <asm/percpu.h> -#include <asm/idle.h> #define PFX "mce_threshold: " #define VERSION "version 1.1.1" @@ -48,26 +48,26 @@ #define MCG_XBLK_ADDR 0xC0000400 struct threshold_block { - unsigned int block; - unsigned int bank; - unsigned int cpu; - u32 address; - u16 interrupt_enable; - u16 threshold_limit; - struct kobject kobj; - struct list_head miscj; + unsigned int block; + unsigned int bank; + unsigned int cpu; + u32 address; + u16 interrupt_enable; + u16 threshold_limit; + struct kobject kobj; + struct list_head miscj; }; /* defaults used early on boot */ static struct threshold_block threshold_defaults = { - .interrupt_enable = 0, - .threshold_limit = THRESHOLD_MAX, + .interrupt_enable = 0, + .threshold_limit = THRESHOLD_MAX, }; struct threshold_bank { - struct kobject *kobj; - struct threshold_block *blocks; - cpumask_var_t cpus; + struct kobject *kobj; + struct threshold_block *blocks; + cpumask_var_t cpus; }; static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); @@ -86,9 +86,9 @@ static void amd_threshold_interrupt(void); */ struct thresh_restart { - struct threshold_block *b; - int reset; - u16 old_limit; + struct threshold_block *b; + int reset; + u16 old_limit; }; /* must be called with correct cpu affinity */ @@ -110,6 +110,7 @@ static void threshold_restart_bank(void *_tr) } else if (tr->old_limit) { /* change limit w/o reset */ int new_count = (mci_misc_hi & THRESHOLD_MAX) + (tr->old_limit - tr->b->threshold_limit); + mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | (new_count & THRESHOLD_MAX); } @@ -125,11 +126,11 @@ static void threshold_restart_bank(void *_tr) /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { - unsigned int bank, block; unsigned int cpu = smp_processor_id(); - u8 lvt_off; u32 low = 0, high = 0, address = 0; + unsigned int bank, block; struct thresh_restart tr; + u8 lvt_off; for (bank = 0; bank < NR_BANKS; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { @@ -140,8 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (!address) break; address += MCG_XBLK_ADDR; - } - else + } else ++address; if (rdmsr_safe(address, &low, &high)) @@ -193,9 +193,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) */ static void amd_threshold_interrupt(void) { + u32 low = 0, high = 0, address = 0; unsigned int bank, block; struct mce m; - u32 low = 0, high = 0, address = 0; mce_setup(&m); @@ -204,16 +204,16 @@ static void amd_threshold_interrupt(void) if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) continue; for (block = 0; block < NR_BLOCKS; ++block) { - if (block == 0) + if (block == 0) { address = MSR_IA32_MC0_MISC + bank * 4; - else if (block == 1) { + } else if (block == 1) { address = (low & MASK_BLKPTR_LO) >> 21; if (!address) break; address += MCG_XBLK_ADDR; - } - else + } else { ++address; + } if (rdmsr_safe(address, &low, &high)) break; @@ -229,8 +229,10 @@ static void amd_threshold_interrupt(void) (high & MASK_LOCKED_HI)) continue; - /* Log the machine check that caused the threshold - event. */ + /* + * Log the machine check that caused the threshold + * event. + */ machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks)); @@ -254,48 +256,52 @@ static void amd_threshold_interrupt(void) struct threshold_attr { struct attribute attr; - ssize_t(*show) (struct threshold_block *, char *); - ssize_t(*store) (struct threshold_block *, const char *, size_t count); + ssize_t (*show) (struct threshold_block *, char *); + ssize_t (*store) (struct threshold_block *, const char *, size_t count); }; -#define SHOW_FIELDS(name) \ -static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ -{ \ - return sprintf(buf, "%lx\n", (unsigned long) b->name); \ +#define SHOW_FIELDS(name) \ +static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ +{ \ + return sprintf(buf, "%lx\n", (unsigned long) b->name); \ } SHOW_FIELDS(interrupt_enable) SHOW_FIELDS(threshold_limit) -static ssize_t store_interrupt_enable(struct threshold_block *b, - const char *buf, size_t count) +static ssize_t +store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) { - char *end; struct thresh_restart tr; - unsigned long new = simple_strtoul(buf, &end, 0); - if (end == buf) + unsigned long new; + + if (strict_strtoul(buf, 0, &new) < 0) return -EINVAL; + b->interrupt_enable = !!new; - tr.b = b; - tr.reset = 0; - tr.old_limit = 0; + tr.b = b; + tr.reset = 0; + tr.old_limit = 0; + smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); - return end - buf; + return size; } -static ssize_t store_threshold_limit(struct threshold_block *b, - const char *buf, size_t count) +static ssize_t +store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) { - char *end; struct thresh_restart tr; - unsigned long new = simple_strtoul(buf, &end, 0); - if (end == buf) + unsigned long new; + + if (strict_strtoul(buf, 0, &new) < 0) return -EINVAL; + if (new > THRESHOLD_MAX) new = THRESHOLD_MAX; if (new < 1) new = 1; + tr.old_limit = b->threshold_limit; b->threshold_limit = new; tr.b = b; @@ -303,12 +309,12 @@ static ssize_t store_threshold_limit(struct threshold_block *b, smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); - return end - buf; + return size; } struct threshold_block_cross_cpu { - struct threshold_block *tb; - long retval; + struct threshold_block *tb; + long retval; }; static void local_error_count_handler(void *_tbcc) @@ -338,16 +344,13 @@ static ssize_t store_error_count(struct threshold_block *b, return 1; } -#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ +#define RW_ATTR(val) \ +static struct threshold_attr val = { \ + .attr = {.name = __stringify(val), .mode = 0644 }, \ + .show = show_## val, \ + .store = store_## val, \ }; -#define RW_ATTR(name) \ -static struct threshold_attr name = \ - THRESHOLD_ATTR(name, 0644, show_## name, store_## name) - RW_ATTR(interrupt_enable); RW_ATTR(threshold_limit); RW_ATTR(error_count); @@ -359,15 +362,17 @@ static struct attribute *default_attrs[] = { NULL }; -#define to_block(k) container_of(k, struct threshold_block, kobj) -#define to_attr(a) container_of(a, struct threshold_attr, attr) +#define to_block(k) container_of(k, struct threshold_block, kobj) +#define to_attr(a) container_of(a, struct threshold_attr, attr) static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { struct threshold_block *b = to_block(kobj); struct threshold_attr *a = to_attr(attr); ssize_t ret; + ret = a->show ? a->show(b, buf) : -EIO; + return ret; } @@ -377,18 +382,20 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, struct threshold_block *b = to_block(kobj); struct threshold_attr *a = to_attr(attr); ssize_t ret; + ret = a->store ? a->store(b, buf, count) : -EIO; + return ret; } static struct sysfs_ops threshold_ops = { - .show = show, - .store = store, + .show = show, + .store = store, }; static struct kobj_type threshold_ktype = { - .sysfs_ops = &threshold_ops, - .default_attrs = default_attrs, + .sysfs_ops = &threshold_ops, + .default_attrs = default_attrs, }; static __cpuinit int allocate_threshold_blocks(unsigned int cpu, @@ -396,9 +403,9 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, unsigned int block, u32 address) { - int err; - u32 low, high; struct threshold_block *b = NULL; + u32 low, high; + int err; if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) return 0; @@ -421,20 +428,21 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, if (!b) return -ENOMEM; - b->block = block; - b->bank = bank; - b->cpu = cpu; - b->address = address; - b->interrupt_enable = 0; - b->threshold_limit = THRESHOLD_MAX; + b->block = block; + b->bank = bank; + b->cpu = cpu; + b->address = address; + b->interrupt_enable = 0; + b->threshold_limit = THRESHOLD_MAX; INIT_LIST_HEAD(&b->miscj); - if (per_cpu(threshold_banks, cpu)[bank]->blocks) + if (per_cpu(threshold_banks, cpu)[bank]->blocks) { list_add(&b->miscj, &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); - else + } else { per_cpu(threshold_banks, cpu)[bank]->blocks = b; + } err = kobject_init_and_add(&b->kobj, &threshold_ktype, per_cpu(threshold_banks, cpu)[bank]->kobj, @@ -447,8 +455,9 @@ recurse: if (!address) return 0; address += MCG_XBLK_ADDR; - } else + } else { ++address; + } err = allocate_threshold_blocks(cpu, bank, ++block, address); if (err) @@ -500,13 +509,14 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (!b) goto out; - err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, + err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj, b->kobj, name); if (err) goto out; cpumask_copy(b->cpus, cpu_core_mask(cpu)); per_cpu(threshold_banks, cpu)[bank] = b; + goto out; } #endif @@ -522,7 +532,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } - b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj); + b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj); if (!b->kobj) goto out_free; @@ -542,7 +552,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (i == cpu) continue; - err = sysfs_create_link(&per_cpu(device_mce, i).kobj, + err = sysfs_create_link(&per_cpu(mce_dev, i).kobj, b->kobj, name); if (err) goto out; @@ -605,15 +615,13 @@ static void deallocate_threshold_block(unsigned int cpu, static void threshold_remove_bank(unsigned int cpu, int bank) { - int i = 0; struct threshold_bank *b; char name[32]; + int i = 0; b = per_cpu(threshold_banks, cpu)[bank]; - if (!b) return; - if (!b->blocks) goto free_out; @@ -622,8 +630,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank) #ifdef CONFIG_SMP /* sibling symlink */ if (shared_bank[bank] && b->blocks->cpu != cpu) { - sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); + sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; + return; } #endif @@ -633,7 +642,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) if (i == cpu) continue; - sysfs_remove_link(&per_cpu(device_mce, i).kobj, name); + sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name); per_cpu(threshold_banks, i)[bank] = NULL; } @@ -659,12 +668,9 @@ static void threshold_remove_device(unsigned int cpu) } /* get notified when a cpu comes on/off */ -static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action, - unsigned int cpu) +static void __cpuinit +amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu) { - if (cpu >= NR_CPUS) - return; - switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: @@ -686,11 +692,12 @@ static __init int threshold_init_device(void) /* to hit CPUs online before the notifier is up */ for_each_online_cpu(lcpu) { int err = threshold_create_device(lcpu); + if (err) return err; } threshold_cpu_callback = amd_64_threshold_cpu_callback; + return 0; } - device_initcall(threshold_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index cef3ee30744..e1acec0f7a3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -8,85 +8,10 @@ #include <linux/init.h> #include <linux/interrupt.h> #include <linux/percpu.h> -#include <asm/processor.h> #include <asm/apic.h> +#include <asm/processor.h> #include <asm/msr.h> #include <asm/mce.h> -#include <asm/hw_irq.h> -#include <asm/idle.h> -#include <asm/therm_throt.h> -#include <asm/apic.h> - -asmlinkage void smp_thermal_interrupt(void) -{ - __u64 msr_val; - - ack_APIC_irq(); - - exit_idle(); - irq_enter(); - - rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - if (therm_throt_process(msr_val & 1)) - mce_log_therm_throt_event(msr_val); - - inc_irq_stat(irq_thermal_count); - irq_exit(); -} - -static void intel_init_thermal(struct cpuinfo_x86 *c) -{ - u32 l, h; - int tm2 = 0; - unsigned int cpu = smp_processor_id(); - - if (!cpu_has(c, X86_FEATURE_ACPI)) - return; - - if (!cpu_has(c, X86_FEATURE_ACC)) - return; - - /* first check if TM1 is already enabled by the BIOS, in which - * case there might be some SMM goo which handles it, so we can't even - * put a handler since it might be delivered via SMI already. - */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - h = apic_read(APIC_LVTTHMR); - if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { - printk(KERN_DEBUG - "CPU%d: Thermal monitoring handled by SMI\n", cpu); - return; - } - - if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) - tm2 = 1; - - if (h & APIC_VECTOR_MASK) { - printk(KERN_DEBUG - "CPU%d: Thermal LVT vector (%#x) already " - "installed\n", cpu, (h & APIC_VECTOR_MASK)); - return; - } - - h = THERMAL_APIC_VECTOR; - h |= (APIC_DM_FIXED | APIC_LVT_MASKED); - apic_write(APIC_LVTTHMR, h); - - rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); - - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); - - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", - cpu, tm2 ? "TM2" : "TM1"); - - /* enable thermal throttle processing */ - atomic_set(&therm_throt_en, 1); - return; -} /* * Support for Intel Correct Machine Check Interrupts. This allows @@ -109,6 +34,9 @@ static int cmci_supported(int *banks) { u64 cap; + if (mce_cmci_disabled || mce_ignore_ce) + return 0; + /* * Vendor check is not strictly needed, but the initial * initialization is vendor keyed and this @@ -132,7 +60,7 @@ static int cmci_supported(int *banks) static void intel_threshold_interrupt(void) { machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); - mce_notify_user(); + mce_notify_irq(); } static void print_update(char *type, int *hdr, int num) @@ -248,7 +176,7 @@ void cmci_rediscover(int dying) return; cpumask_copy(old, ¤t->cpus_allowed); - for_each_online_cpu (cpu) { + for_each_online_cpu(cpu) { if (cpu == dying) continue; if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c index a74af128efc..f5f2d6f71fb 100644 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c @@ -6,25 +6,23 @@ * This file contains routines to check for non-fatal MCEs every 15s * */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/jiffies.h> -#include <linux/workqueue.h> #include <linux/interrupt.h> -#include <linux/smp.h> +#include <linux/workqueue.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> #include <linux/module.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/smp.h> #include <asm/processor.h> #include <asm/system.h> +#include <asm/mce.h> #include <asm/msr.h> -#include "mce.h" +static int firstbank; -static int firstbank; - -#define MCE_RATE 15*HZ /* timer rate is 15s */ +#define MCE_RATE (15*HZ) /* timer rate is 15s */ static void mce_checkregs(void *info) { @@ -34,23 +32,24 @@ static void mce_checkregs(void *info) for (i = firstbank; i < nr_mce_banks; i++) { rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - printk(KERN_INFO "MCE: The hardware reports a non " - "fatal, correctable incident occurred on " - "CPU %d.\n", + if (!(high & (1<<31))) + continue; + + printk(KERN_INFO "MCE: The hardware reports a non fatal, " + "correctable incident occurred on CPU %d.\n", smp_processor_id()); - printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); - - /* - * Scrub the error so we don't pick it up in MCE_RATE - * seconds time. - */ - wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - - /* Serialize */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } + + printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); + + /* + * Scrub the error so we don't pick it up in MCE_RATE + * seconds time: + */ + wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); + + /* Serialize: */ + wmb(); + add_taint(TAINT_MACHINE_CHECK); } } @@ -77,16 +76,17 @@ static int __init init_nonfatal_mce_checker(void) /* Some Athlons misbehave when we frob bank 0 */ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 6) - firstbank = 1; + boot_cpu_data.x86 == 6) + firstbank = 1; else - firstbank = 0; + firstbank = 0; /* * Check for non-fatal errors every MCE_RATE s */ schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); printk(KERN_INFO "Machine check exception polling timer started.\n"); + return 0; } module_init(init_nonfatal_mce_checker); diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index f53bdcbaf38..4482aea9aa2 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -1,21 +1,14 @@ /* * P4 specific Machine Check Exception Reporting */ - -#include <linux/init.h> -#include <linux/types.h> #include <linux/kernel.h> -#include <linux/interrupt.h> +#include <linux/types.h> +#include <linux/init.h> #include <linux/smp.h> #include <asm/processor.h> -#include <asm/system.h> +#include <asm/mce.h> #include <asm/msr.h> -#include <asm/apic.h> - -#include <asm/therm_throt.h> - -#include "mce.h" /* as supported by the P4/Xeon family */ struct intel_mce_extended_msrs { @@ -34,98 +27,8 @@ struct intel_mce_extended_msrs { static int mce_num_extended_msrs; - -#ifdef CONFIG_X86_MCE_P4THERMAL -static void unexpected_thermal_interrupt(struct pt_regs *regs) -{ - printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", - smp_processor_id()); - add_taint(TAINT_MACHINE_CHECK); -} - -/* P4/Xeon Thermal transition interrupt handler */ -static void intel_thermal_interrupt(struct pt_regs *regs) -{ - __u64 msr_val; - - ack_APIC_irq(); - - rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - therm_throt_process(msr_val & 0x1); -} - -/* Thermal interrupt handler for this CPU setup */ -static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; - -void smp_thermal_interrupt(struct pt_regs *regs) -{ - irq_enter(); - vendor_thermal_interrupt(regs); - __get_cpu_var(irq_stat).irq_thermal_count++; - irq_exit(); -} - -/* P4/Xeon Thermal regulation detect and init */ -static void intel_init_thermal(struct cpuinfo_x86 *c) -{ - u32 l, h; - unsigned int cpu = smp_processor_id(); - - /* Thermal monitoring */ - if (!cpu_has(c, X86_FEATURE_ACPI)) - return; /* -ENODEV */ - - /* Clock modulation */ - if (!cpu_has(c, X86_FEATURE_ACC)) - return; /* -ENODEV */ - - /* first check if its enabled already, in which case there might - * be some SMM goo which handles it, so we can't even put a handler - * since it might be delivered via SMI already -zwanem. - */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - h = apic_read(APIC_LVTTHMR); - if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { - printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", - cpu); - return; /* -EBUSY */ - } - - /* check whether a vector already exists, temporarily masked? */ - if (h & APIC_VECTOR_MASK) { - printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already " - "installed\n", - cpu, (h & APIC_VECTOR_MASK)); - return; /* -EBUSY */ - } - - /* The temperature transition interrupt handler setup */ - h = THERMAL_APIC_VECTOR; /* our delivery vector */ - h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ - apic_write(APIC_LVTTHMR, h); - - rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); - - /* ok we're good to go... */ - vendor_thermal_interrupt = intel_thermal_interrupt; - - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); - - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); - - /* enable thermal throttle processing */ - atomic_set(&therm_throt_en, 1); - return; -} -#endif /* CONFIG_X86_MCE_P4THERMAL */ - - /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) +static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) { u32 h; @@ -143,9 +46,9 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) static void intel_machine_check(struct pt_regs *regs, long error_code) { - int recover = 1; u32 alow, ahigh, high, low; u32 mcgstl, mcgsth; + int recover = 1; int i; rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); @@ -157,7 +60,9 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) if (mce_num_extended_msrs > 0) { struct intel_mce_extended_msrs dbg; + intel_get_extended_msrs(&dbg); + printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", @@ -171,6 +76,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) if (high & (1<<31)) { char misc[20]; char addr[24]; + misc[0] = addr[0] = '\0'; if (high & (1<<29)) recover |= 1; @@ -196,6 +102,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) panic("Unable to continue"); printk(KERN_EMERG "Attempting to continue.\n"); + /* * Do not clear the MSR_IA32_MCi_STATUS if the error is not * recoverable/continuable.This will allow BIOS to look at the MSRs @@ -217,7 +124,6 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); } - void intel_p4_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index c9f77ea69ed..5c0e6533d9b 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -2,52 +2,67 @@ * P5 specific Machine Check Exception Reporting * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h> #include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h> #include <linux/smp.h> #include <asm/processor.h> #include <asm/system.h> +#include <asm/mce.h> #include <asm/msr.h> -#include "mce.h" +/* By default disabled */ +int mce_p5_enabled __read_mostly; -/* Machine check handler for Pentium class Intel */ +/* Machine check handler for Pentium class Intel CPUs: */ static void pentium_machine_check(struct pt_regs *regs, long error_code) { u32 loaddr, hi, lotype; + rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); - printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype); - if (lotype&(1<<5)) - printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id()); + + printk(KERN_EMERG + "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", + smp_processor_id(), loaddr, lotype); + + if (lotype & (1<<5)) { + printk(KERN_EMERG + "CPU#%d: Possible thermal failure (CPU on fire ?).\n", + smp_processor_id()); + } + add_taint(TAINT_MACHINE_CHECK); } -/* Set up machine check reporting for processors with Intel style MCE */ +/* Set up machine check reporting for processors with Intel style MCE: */ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; - /*Check for MCE support */ - if (!cpu_has(c, X86_FEATURE_MCE)) + /* Default P5 to off as its often misconnected: */ + if (!mce_p5_enabled) return; - /* Default P5 to off as its often misconnected */ - if (mce_disabled != -1) + /* Check for MCE support: */ + if (!cpu_has(c, X86_FEATURE_MCE)) return; + machine_check_vector = pentium_machine_check; + /* Make sure the vector pointer is visible before we enable MCEs: */ wmb(); - /* Read registers before enabling */ + /* Read registers before enabling: */ rdmsr(MSR_IA32_P5_MC_ADDR, l, h); rdmsr(MSR_IA32_P5_MC_TYPE, l, h); - printk(KERN_INFO "Intel old style machine check architecture supported.\n"); + printk(KERN_INFO + "Intel old style machine check architecture supported.\n"); - /* Enable MCE */ + /* Enable MCE: */ set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id()); + printk(KERN_INFO + "Intel old style machine check reporting enabled on CPU#%d.\n", + smp_processor_id()); } diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c index 2ac52d7b434..01e4f817818 100644 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ b/arch/x86/kernel/cpu/mcheck/p6.c @@ -2,25 +2,23 @@ * P6 specific Machine Check Exception Reporting * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h> #include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h> #include <linux/smp.h> #include <asm/processor.h> #include <asm/system.h> +#include <asm/mce.h> #include <asm/msr.h> -#include "mce.h" - /* Machine Check Handler For PII/PIII */ static void intel_machine_check(struct pt_regs *regs, long error_code) { - int recover = 1; u32 alow, ahigh, high, low; u32 mcgstl, mcgsth; + int recover = 1; int i; rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); @@ -35,12 +33,16 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) if (high & (1<<31)) { char misc[20]; char addr[24]; - misc[0] = addr[0] = '\0'; + + misc[0] = '\0'; + addr[0] = '\0'; + if (high & (1<<29)) recover |= 1; if (high & (1<<25)) recover |= 2; high &= ~(1<<31); + if (high & (1<<27)) { rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); snprintf(misc, 20, "[%08x%08x]", ahigh, alow); @@ -49,6 +51,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); snprintf(addr, 24, " at %08x%08x", ahigh, alow); } + printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", smp_processor_id(), i, high, low, misc, addr); } @@ -63,16 +66,17 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) /* * Do not clear the MSR_IA32_MCi_STATUS if the error is not * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error. + * for errors if the OS could not log the error: */ for (i = 0; i < nr_mce_banks; i++) { unsigned int msr; + msr = MSR_IA32_MC0_STATUS+i*4; rdmsr(msr, low, high); if (high & (1<<31)) { - /* Clear it */ + /* Clear it: */ wrmsr(msr, 0UL, 0UL); - /* Serialize */ + /* Serialize: */ wmb(); add_taint(TAINT_MACHINE_CHECK); } @@ -81,7 +85,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); } -/* Set up machine check reporting for processors with Intel style MCE */ +/* Set up machine check reporting for processors with Intel style MCE: */ void intel_p6_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; @@ -97,6 +101,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c) /* Ok machine check is available */ machine_check_vector = intel_machine_check; + /* Make sure the vector pointer is visible before we enable MCEs: */ wmb(); printk(KERN_INFO "Intel machine check architecture supported.\n"); diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index d5ae2243f0b..5957a93e517 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -1,7 +1,7 @@ /* - * * Thermal throttle event support code (such as syslog messaging and rate * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). + * * This allows consistent reporting of CPU thermal throttle events. * * Maintains a counter in /sys that keeps track of the number of thermal @@ -13,43 +13,54 @@ * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. * Inspired by Ross Biro's and Al Borchers' counter code. */ - +#include <linux/interrupt.h> +#include <linux/notifier.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> #include <linux/percpu.h> #include <linux/sysdev.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/smp.h> #include <linux/cpu.h> -#include <asm/cpu.h> -#include <linux/notifier.h> -#include <linux/jiffies.h> -#include <asm/therm_throt.h> + +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/apic.h> +#include <asm/idle.h> +#include <asm/mce.h> +#include <asm/msr.h> /* How long to wait between reporting thermal events */ -#define CHECK_INTERVAL (300 * HZ) +#define CHECK_INTERVAL (300 * HZ) static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); -atomic_t therm_throt_en = ATOMIC_INIT(0); +static DEFINE_PER_CPU(bool, thermal_throttle_active); + +static atomic_t therm_throt_en = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS -#define define_therm_throt_sysdev_one_ro(_name) \ - static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) - -#define define_therm_throt_sysdev_show_func(name) \ -static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ - struct sysdev_attribute *attr, \ - char *buf) \ -{ \ - unsigned int cpu = dev->id; \ - ssize_t ret; \ - \ - preempt_disable(); /* CPU hotplug */ \ - if (cpu_online(cpu)) \ - ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_throttle_##name, cpu)); \ - else \ - ret = 0; \ - preempt_enable(); \ - \ - return ret; \ +#define define_therm_throt_sysdev_one_ro(_name) \ + static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) + +#define define_therm_throt_sysdev_show_func(name) \ +static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ + struct sysdev_attribute *attr, \ + char *buf) \ +{ \ + unsigned int cpu = dev->id; \ + ssize_t ret; \ + \ + preempt_disable(); /* CPU hotplug */ \ + if (cpu_online(cpu)) \ + ret = sprintf(buf, "%lu\n", \ + per_cpu(thermal_throttle_##name, cpu)); \ + else \ + ret = 0; \ + preempt_enable(); \ + \ + return ret; \ } define_therm_throt_sysdev_show_func(count); @@ -61,8 +72,8 @@ static struct attribute *thermal_throttle_attrs[] = { }; static struct attribute_group thermal_throttle_attr_group = { - .attrs = thermal_throttle_attrs, - .name = "thermal_throttle" + .attrs = thermal_throttle_attrs, + .name = "thermal_throttle" }; #endif /* CONFIG_SYSFS */ @@ -82,38 +93,45 @@ static struct attribute_group thermal_throttle_attr_group = { * 1 : Event should be logged further, and a message has been * printed to the syslog. */ -int therm_throt_process(int curr) +static int therm_throt_process(int curr) { unsigned int cpu = smp_processor_id(); __u64 tmp_jiffs = get_jiffies_64(); + bool was_throttled = __get_cpu_var(thermal_throttle_active); + bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr; - if (curr) + if (is_throttled) __get_cpu_var(thermal_throttle_count)++; - if (time_before64(tmp_jiffs, __get_cpu_var(next_check))) + if (!(was_throttled ^ is_throttled) && + time_before64(tmp_jiffs, __get_cpu_var(next_check))) return 0; __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; /* if we just entered the thermal event */ - if (curr) { + if (is_throttled) { printk(KERN_CRIT "CPU%d: Temperature above threshold, " - "cpu clock throttled (total events = %lu)\n", cpu, - __get_cpu_var(thermal_throttle_count)); + "cpu clock throttled (total events = %lu)\n", + cpu, __get_cpu_var(thermal_throttle_count)); add_taint(TAINT_MACHINE_CHECK); - } else { - printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu); + return 1; + } + if (was_throttled) { + printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); + return 1; } - return 1; + return 0; } #ifdef CONFIG_SYSFS -/* Add/Remove thermal_throttle interface for CPU device */ +/* Add/Remove thermal_throttle interface for CPU device: */ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) { - return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); + return sysfs_create_group(&sys_dev->kobj, + &thermal_throttle_attr_group); } static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) @@ -121,19 +139,21 @@ static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); } -/* Mutex protecting device creation against CPU hotplug */ +/* Mutex protecting device creation against CPU hotplug: */ static DEFINE_MUTEX(therm_cpu_lock); /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static __cpuinit int +thermal_throttle_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) { unsigned int cpu = (unsigned long)hcpu; struct sys_device *sys_dev; int err = 0; sys_dev = get_cpu_sysdev(cpu); + switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: @@ -183,6 +203,94 @@ static __init int thermal_throttle_init_device(void) return 0; } - device_initcall(thermal_throttle_init_device); + #endif /* CONFIG_SYSFS */ + +/* Thermal transition interrupt handler */ +static void intel_thermal_interrupt(void) +{ + __u64 msr_val; + + rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) + mce_log_therm_throt_event(msr_val); +} + +static void unexpected_thermal_interrupt(void) +{ + printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", + smp_processor_id()); + add_taint(TAINT_MACHINE_CHECK); +} + +static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; + +asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) +{ + exit_idle(); + irq_enter(); + inc_irq_stat(irq_thermal_count); + smp_thermal_vector(); + irq_exit(); + /* Ack only at the end to avoid potential reentry */ + ack_APIC_irq(); +} + +void intel_init_thermal(struct cpuinfo_x86 *c) +{ + unsigned int cpu = smp_processor_id(); + int tm2 = 0; + u32 l, h; + + /* Thermal monitoring depends on ACPI and clock modulation*/ + if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + return; + + /* + * First check if its enabled already, in which case there might + * be some SMM goo which handles it, so we can't even put a handler + * since it might be delivered via SMI already: + */ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + h = apic_read(APIC_LVTTHMR); + if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { + printk(KERN_DEBUG + "CPU%d: Thermal monitoring handled by SMI\n", cpu); + return; + } + + if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) + tm2 = 1; + + /* Check whether a vector already exists */ + if (h & APIC_VECTOR_MASK) { + printk(KERN_DEBUG + "CPU%d: Thermal LVT vector (%#x) already installed\n", + cpu, (h & APIC_VECTOR_MASK)); + return; + } + + /* We'll mask the thermal vector in the lapic till we're ready: */ + h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; + apic_write(APIC_LVTTHMR, h); + + rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); + + smp_thermal_vector = intel_thermal_interrupt; + + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); + + /* Unmask the thermal vector: */ + l = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + + printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", + cpu, tm2 ? "TM2" : "TM1"); + + /* enable thermal throttle processing */ + atomic_set(&therm_throt_en, 1); +} diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index 23ee9e730f7..d746df2909c 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -17,7 +17,7 @@ static void default_threshold_interrupt(void) void (*mce_threshold_vector)(void) = default_threshold_interrupt; -asmlinkage void mce_threshold_interrupt(void) +asmlinkage void smp_threshold_interrupt(void) { exit_idle(); irq_enter(); diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 2a043d89811..54060f56597 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -2,19 +2,17 @@ * IDT Winchip specific Machine Check Exception Reporting * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h> #include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h> #include <asm/processor.h> #include <asm/system.h> +#include <asm/mce.h> #include <asm/msr.h> -#include "mce.h" - -/* Machine check handler for WinChip C6 */ +/* Machine check handler for WinChip C6: */ static void winchip_machine_check(struct pt_regs *regs, long error_code) { printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); @@ -25,12 +23,18 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code) void winchip_mcheck_init(struct cpuinfo_x86 *c) { u32 lo, hi; + machine_check_vector = winchip_machine_check; + /* Make sure the vector pointer is visible before we enable MCEs: */ wmb(); + rdmsr(MSR_IDT_FCR1, lo, hi); lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */ lo &= ~(1<<4); /* Enable MCE */ wrmsr(MSR_IDT_FCR1, lo, hi); + set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n"); + + printk(KERN_INFO + "Winchip machine check reporting enabled on CPU#0.\n"); } diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index ce0fe4b5c04..1d584a18a50 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -808,7 +808,7 @@ int __init mtrr_cleanup(unsigned address_bits) if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) return 0; - rdmsr(MTRRdefType_MSR, def, dummy); + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; @@ -1003,7 +1003,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) */ if (!is_cpu(INTEL) || disable_mtrr_trim) return 0; - rdmsr(MTRRdefType_MSR, def, dummy); + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index d21d4fb161f..0543f69f0b2 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -20,9 +20,9 @@ struct fixed_range_block { }; static struct fixed_range_block fixed_range_blocks[] = { - { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */ - { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */ - { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ + { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ + { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ + { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ {} }; @@ -194,12 +194,12 @@ get_fixed_ranges(mtrr_type * frs) k8_check_syscfg_dram_mod_en(); - rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); + rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]); for (i = 0; i < 2; i++) - rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]); + rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]); for (i = 0; i < 8; i++) - rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); + rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]); } void mtrr_save_fixed_ranges(void *info) @@ -310,7 +310,7 @@ void __init get_mtrr_state(void) vrs = mtrr_state.var_ranges; - rdmsr(MTRRcap_MSR, lo, dummy); + rdmsr(MSR_MTRRcap, lo, dummy); mtrr_state.have_fixed = (lo >> 8) & 1; for (i = 0; i < num_var_ranges; i++) @@ -318,7 +318,7 @@ void __init get_mtrr_state(void) if (mtrr_state.have_fixed) get_fixed_ranges(mtrr_state.fixed_ranges); - rdmsr(MTRRdefType_MSR, lo, dummy); + rdmsr(MSR_MTRRdefType, lo, dummy); mtrr_state.def_type = (lo & 0xff); mtrr_state.enabled = (lo & 0xc00) >> 10; @@ -583,10 +583,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock) __flush_tlb(); /* Save MTRR state */ - rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); + rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi); + mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); } static void post_set(void) __releases(set_atomicity_lock) @@ -595,7 +595,7 @@ static void post_set(void) __releases(set_atomicity_lock) __flush_tlb(); /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); + mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); /* Enable caches */ write_cr0(read_cr0() & 0xbfffffff); @@ -707,7 +707,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i static int generic_have_wrcomb(void) { unsigned long config, dummy; - rdmsr(MTRRcap_MSR, config, dummy); + rdmsr(MSR_MTRRcap, config, dummy); return (config & (1 << 10)); } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 03cda01f57c..8fc248b5aea 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -104,7 +104,7 @@ static void __init set_num_var_ranges(void) unsigned long config = 0, dummy; if (use_intel()) { - rdmsr(MTRRcap_MSR, config, dummy); + rdmsr(MSR_MTRRcap, config, dummy); } else if (is_cpu(AMD)) config = 2; else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 77f67f7b347..7538b767f20 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -5,21 +5,6 @@ #include <linux/types.h> #include <linux/stddef.h> -#define MTRRcap_MSR 0x0fe -#define MTRRdefType_MSR 0x2ff - -#define MTRRfix64K_00000_MSR 0x250 -#define MTRRfix16K_80000_MSR 0x258 -#define MTRRfix16K_A0000_MSR 0x259 -#define MTRRfix4K_C0000_MSR 0x268 -#define MTRRfix4K_C8000_MSR 0x269 -#define MTRRfix4K_D0000_MSR 0x26a -#define MTRRfix4K_D8000_MSR 0x26b -#define MTRRfix4K_E0000_MSR 0x26c -#define MTRRfix4K_E8000_MSR 0x26d -#define MTRRfix4K_F0000_MSR 0x26e -#define MTRRfix4K_F8000_MSR 0x26f - #define MTRR_CHANGE_MASK_FIXED 0x01 #define MTRR_CHANGE_MASK_VARIABLE 0x02 #define MTRR_CHANGE_MASK_DEFTYPE 0x04 diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c index 7f7e2753685..1f5fb1588d1 100644 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ b/arch/x86/kernel/cpu/mtrr/state.c @@ -35,7 +35,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) if (use_intel()) /* Save MTRR state */ - rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); + rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); else /* Cyrix ARRs - everything else were excluded at the top */ ctxt->ccr3 = getCx86(CX86_CCR3); @@ -46,7 +46,7 @@ void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) { if (use_intel()) /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL, + mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, ctxt->deftype_hi); else if (is_cpu(CYRIX)) /* Cyrix ARRs - everything else were excluded at the top */ @@ -64,7 +64,7 @@ void set_mtrr_done(struct set_mtrr_context *ctxt) /* Restore MTRRdefType */ if (use_intel()) /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); + mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); else /* Cyrix ARRs - everything else was excluded at the top */ setCx86(CX86_CCR3, ctxt->ccr3); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c new file mode 100644 index 00000000000..900332b800f --- /dev/null +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -0,0 +1,1964 @@ +/* + * Performance counter x86 architecture code + * + * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * + * For licencing details see kernel-base/COPYING + */ + +#include <linux/perf_counter.h> +#include <linux/capability.h> +#include <linux/notifier.h> +#include <linux/hardirq.h> +#include <linux/kprobes.h> +#include <linux/module.h> +#include <linux/kdebug.h> +#include <linux/sched.h> +#include <linux/uaccess.h> +#include <linux/highmem.h> + +#include <asm/apic.h> +#include <asm/stacktrace.h> +#include <asm/nmi.h> + +static u64 perf_counter_mask __read_mostly; + +struct cpu_hw_counters { + struct perf_counter *counters[X86_PMC_IDX_MAX]; + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long interrupts; + int enabled; +}; + +/* + * struct x86_pmu - generic x86 pmu + */ +struct x86_pmu { + const char *name; + int version; + int (*handle_irq)(struct pt_regs *); + void (*disable_all)(void); + void (*enable_all)(void); + void (*enable)(struct hw_perf_counter *, int); + void (*disable)(struct hw_perf_counter *, int); + unsigned eventsel; + unsigned perfctr; + u64 (*event_map)(int); + u64 (*raw_event)(u64); + int max_events; + int num_counters; + int num_counters_fixed; + int counter_bits; + u64 counter_mask; + int apic; + u64 max_period; + u64 intel_ctrl; +}; + +static struct x86_pmu x86_pmu __read_mostly; + +static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { + .enabled = 1, +}; + +/* + * Not sure about some of these + */ +static const u64 p6_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, +}; + +static u64 p6_pmu_event_map(int event) +{ + return p6_perfmon_event_map[event]; +} + +/* + * Counter setting that is specified not to count anything. + * We use this to effectively disable a counter. + * + * L2_RQSTS with 0 MESI unit mask. + */ +#define P6_NOP_COUNTER 0x0000002EULL + +static u64 p6_pmu_raw_event(u64 event) +{ +#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL +#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL +#define P6_EVNTSEL_INV_MASK 0x00800000ULL +#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL + +#define P6_EVNTSEL_MASK \ + (P6_EVNTSEL_EVENT_MASK | \ + P6_EVNTSEL_UNIT_MASK | \ + P6_EVNTSEL_EDGE_MASK | \ + P6_EVNTSEL_INV_MASK | \ + P6_EVNTSEL_COUNTER_MASK) + + return event & P6_EVNTSEL_MASK; +} + + +/* + * Intel PerfMon v3. Used on Core2 and later. + */ +static const u64 intel_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, +}; + +static u64 intel_pmu_event_map(int event) +{ + return intel_perfmon_event_map[event]; +} + +/* + * Generalized hw caching related event table, filled + * in on a per model basis. A value of 0 means + * 'not supported', -1 means 'event makes no sense on + * this CPU', any other value means the raw event + * ID. + */ + +#define C(x) PERF_COUNT_HW_CACHE_##x + +static u64 __read_mostly hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + +static const u64 nehalem_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ + [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ + [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ + [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static const u64 core2_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static const u64 atom_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static u64 intel_pmu_raw_event(u64 event) +{ +#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL +#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL +#define CORE_EVNTSEL_INV_MASK 0x00800000ULL +#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL + +#define CORE_EVNTSEL_MASK \ + (CORE_EVNTSEL_EVENT_MASK | \ + CORE_EVNTSEL_UNIT_MASK | \ + CORE_EVNTSEL_EDGE_MASK | \ + CORE_EVNTSEL_INV_MASK | \ + CORE_EVNTSEL_COUNTER_MASK) + + return event & CORE_EVNTSEL_MASK; +} + +static const u64 amd_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ + [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ + [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ + [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ + [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ + [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +/* + * AMD Performance Monitor K7 and later. + */ +static const u64 amd_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, +}; + +static u64 amd_pmu_event_map(int event) +{ + return amd_perfmon_event_map[event]; +} + +static u64 amd_pmu_raw_event(u64 event) +{ +#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL +#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL +#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL +#define K7_EVNTSEL_INV_MASK 0x000800000ULL +#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL + +#define K7_EVNTSEL_MASK \ + (K7_EVNTSEL_EVENT_MASK | \ + K7_EVNTSEL_UNIT_MASK | \ + K7_EVNTSEL_EDGE_MASK | \ + K7_EVNTSEL_INV_MASK | \ + K7_EVNTSEL_COUNTER_MASK) + + return event & K7_EVNTSEL_MASK; +} + +/* + * Propagate counter elapsed time into the generic counter. + * Can only be executed on the CPU where the counter is active. + * Returns the delta events processed. + */ +static u64 +x86_perf_counter_update(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) +{ + int shift = 64 - x86_pmu.counter_bits; + u64 prev_raw_count, new_raw_count; + s64 delta; + + /* + * Careful: an NMI might modify the previous counter value. + * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic counter atomically: + */ +again: + prev_raw_count = atomic64_read(&hwc->prev_count); + rdmsrl(hwc->counter_base + idx, new_raw_count); + + if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (counter-)time and add that to the generic counter. + * + * Careful, not all hw sign-extends above the physical width + * of the count. + */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + atomic64_add(delta, &counter->count); + atomic64_sub(delta, &hwc->period_left); + + return new_raw_count; +} + +static atomic_t active_counters; +static DEFINE_MUTEX(pmc_reserve_mutex); + +static bool reserve_pmc_hardware(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + int i; + + if (nmi_watchdog == NMI_LOCAL_APIC) + disable_lapic_nmi_watchdog(); + + for (i = 0; i < x86_pmu.num_counters; i++) { + if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) + goto perfctr_fail; + } + + for (i = 0; i < x86_pmu.num_counters; i++) { + if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) + goto eventsel_fail; + } +#endif + + return true; + +#ifdef CONFIG_X86_LOCAL_APIC +eventsel_fail: + for (i--; i >= 0; i--) + release_evntsel_nmi(x86_pmu.eventsel + i); + + i = x86_pmu.num_counters; + +perfctr_fail: + for (i--; i >= 0; i--) + release_perfctr_nmi(x86_pmu.perfctr + i); + + if (nmi_watchdog == NMI_LOCAL_APIC) + enable_lapic_nmi_watchdog(); + + return false; +#endif +} + +static void release_pmc_hardware(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + int i; + + for (i = 0; i < x86_pmu.num_counters; i++) { + release_perfctr_nmi(x86_pmu.perfctr + i); + release_evntsel_nmi(x86_pmu.eventsel + i); + } + + if (nmi_watchdog == NMI_LOCAL_APIC) + enable_lapic_nmi_watchdog(); +#endif +} + +static void hw_perf_counter_destroy(struct perf_counter *counter) +{ + if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { + release_pmc_hardware(); + mutex_unlock(&pmc_reserve_mutex); + } +} + +static inline int x86_pmu_initialized(void) +{ + return x86_pmu.handle_irq != NULL; +} + +static inline int +set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) +{ + unsigned int cache_type, cache_op, cache_result; + u64 config, val; + + config = attr->config; + + cache_type = (config >> 0) & 0xff; + if (cache_type >= PERF_COUNT_HW_CACHE_MAX) + return -EINVAL; + + cache_op = (config >> 8) & 0xff; + if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) + return -EINVAL; + + cache_result = (config >> 16) & 0xff; + if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + val = hw_cache_event_ids[cache_type][cache_op][cache_result]; + + if (val == 0) + return -ENOENT; + + if (val == -1) + return -EINVAL; + + hwc->config |= val; + + return 0; +} + +/* + * Setup the hardware configuration for a given attr_type + */ +static int __hw_perf_counter_init(struct perf_counter *counter) +{ + struct perf_counter_attr *attr = &counter->attr; + struct hw_perf_counter *hwc = &counter->hw; + u64 config; + int err; + + if (!x86_pmu_initialized()) + return -ENODEV; + + err = 0; + if (!atomic_inc_not_zero(&active_counters)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware()) + err = -EBUSY; + else + atomic_inc(&active_counters); + mutex_unlock(&pmc_reserve_mutex); + } + if (err) + return err; + + /* + * Generate PMC IRQs: + * (keep 'enabled' bit clear for now) + */ + hwc->config = ARCH_PERFMON_EVENTSEL_INT; + + /* + * Count user and OS events unless requested not to. + */ + if (!attr->exclude_user) + hwc->config |= ARCH_PERFMON_EVENTSEL_USR; + if (!attr->exclude_kernel) + hwc->config |= ARCH_PERFMON_EVENTSEL_OS; + + if (!hwc->sample_period) { + hwc->sample_period = x86_pmu.max_period; + hwc->last_period = hwc->sample_period; + atomic64_set(&hwc->period_left, hwc->sample_period); + } else { + /* + * If we have a PMU initialized but no APIC + * interrupts, we cannot sample hardware + * counters (user-space has to fall back and + * sample via a hrtimer based software counter): + */ + if (!x86_pmu.apic) + return -EOPNOTSUPP; + } + + counter->destroy = hw_perf_counter_destroy; + + /* + * Raw event type provide the config in the event structure + */ + if (attr->type == PERF_TYPE_RAW) { + hwc->config |= x86_pmu.raw_event(attr->config); + return 0; + } + + if (attr->type == PERF_TYPE_HW_CACHE) + return set_ext_hw_attr(hwc, attr); + + if (attr->config >= x86_pmu.max_events) + return -EINVAL; + + /* + * The generic map: + */ + config = x86_pmu.event_map(attr->config); + + if (config == 0) + return -ENOENT; + + if (config == -1LL) + return -EINVAL; + + hwc->config |= config; + + return 0; +} + +static void p6_pmu_disable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + u64 val; + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + barrier(); + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static void intel_pmu_disable_all(void) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); +} + +static void amd_pmu_disable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + int idx; + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + /* + * ensure we write the disable before we start disabling the + * counters proper, so that amd_pmu_enable_counter() does the + * right thing. + */ + barrier(); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + u64 val; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); + if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) + continue; + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } +} + +void hw_perf_disable(void) +{ + if (!x86_pmu_initialized()) + return; + return x86_pmu.disable_all(); +} + +static void p6_pmu_enable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + unsigned long val; + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static void intel_pmu_enable_all(void) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); +} + +static void amd_pmu_enable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + int idx; + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct perf_counter *counter = cpuc->counters[idx]; + u64 val; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + + val = counter->hw.config; + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } +} + +void hw_perf_enable(void) +{ + if (!x86_pmu_initialized()) + return; + x86_pmu.enable_all(); +} + +static inline u64 intel_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void intel_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ + (void)checking_wrmsrl(hwc->config_base + idx, + hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); +} + +static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ + (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); +} + +static inline void +intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, mask; + + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + (void)checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static inline void +p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + u64 val = P6_NOP_COUNTER; + + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + (void)checking_wrmsrl(hwc->config_base + idx, val); +} + +static inline void +intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_disable_fixed(hwc, idx); + return; + } + + x86_pmu_disable_counter(hwc, idx); +} + +static inline void +amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ + x86_pmu_disable_counter(hwc, idx); +} + +static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); + +/* + * Set the next IRQ period, based on the hwc->period_left value. + * To be called with the counter disabled in hw: + */ +static int +x86_perf_counter_set_period(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) +{ + s64 left = atomic64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int err, ret = 0; + + /* + * If we are way outside a reasoable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + atomic64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + + if (unlikely(left <= 0)) { + left += period; + atomic64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + /* + * Quirk: certain CPUs dont like it if just 1 event is left: + */ + if (unlikely(left < 2)) + left = 2; + + if (left > x86_pmu.max_period) + left = x86_pmu.max_period; + + per_cpu(prev_left[idx], smp_processor_id()) = left; + + /* + * The hw counter starts counting from this counter offset, + * mark it to be able to extra future deltas: + */ + atomic64_set(&hwc->prev_count, (u64)-left); + + err = checking_wrmsrl(hwc->counter_base + idx, + (u64)(-left) & x86_pmu.counter_mask); + + perf_counter_update_userpage(counter); + + return ret; +} + +static inline void +intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, bits, mask; + int err; + + /* + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: + */ + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + bits |= 0x1; + bits <<= (idx * 4); + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + ctrl_val |= bits; + err = checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + u64 val; + + val = hwc->config; + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + (void)checking_wrmsrl(hwc->config_base + idx, val); +} + + +static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_enable_fixed(hwc, idx); + return; + } + + x86_pmu_enable_counter(hwc, idx); +} + +static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + if (cpuc->enabled) + x86_pmu_enable_counter(hwc, idx); +} + +static int +fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) +{ + unsigned int event; + + if (!x86_pmu.num_counters_fixed) + return -1; + + event = hwc->config & ARCH_PERFMON_EVENT_MASK; + + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) + return X86_PMC_IDX_FIXED_INSTRUCTIONS; + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) + return X86_PMC_IDX_FIXED_CPU_CYCLES; + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) + return X86_PMC_IDX_FIXED_BUS_CYCLES; + + return -1; +} + +/* + * Find a PMC slot for the freshly enabled / scheduled in counter: + */ +static int x86_pmu_enable(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct hw_perf_counter *hwc = &counter->hw; + int idx; + + idx = fixed_mode_idx(counter, hwc); + if (idx >= 0) { + /* + * Try to get the fixed counter, if that is already taken + * then try to get a generic counter: + */ + if (test_and_set_bit(idx, cpuc->used_mask)) + goto try_generic; + + hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + /* + * We set it so that counter_base + idx in wrmsr/rdmsr maps to + * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: + */ + hwc->counter_base = + MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; + hwc->idx = idx; + } else { + idx = hwc->idx; + /* Try to get the previous generic counter again */ + if (test_and_set_bit(idx, cpuc->used_mask)) { +try_generic: + idx = find_first_zero_bit(cpuc->used_mask, + x86_pmu.num_counters); + if (idx == x86_pmu.num_counters) + return -EAGAIN; + + set_bit(idx, cpuc->used_mask); + hwc->idx = idx; + } + hwc->config_base = x86_pmu.eventsel; + hwc->counter_base = x86_pmu.perfctr; + } + + perf_counters_lapic_init(); + + x86_pmu.disable(hwc, idx); + + cpuc->counters[idx] = counter; + set_bit(idx, cpuc->active_mask); + + x86_perf_counter_set_period(counter, hwc, idx); + x86_pmu.enable(hwc, idx); + + perf_counter_update_userpage(counter); + + return 0; +} + +static void x86_pmu_unthrottle(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct hw_perf_counter *hwc = &counter->hw; + + if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || + cpuc->counters[hwc->idx] != counter)) + return; + + x86_pmu.enable(hwc, hwc->idx); +} + +void perf_counter_print_debug(void) +{ + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; + struct cpu_hw_counters *cpuc; + unsigned long flags; + int cpu, idx; + + if (!x86_pmu.num_counters) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_counters, cpu); + + if (x86_pmu.version >= 2) { + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); + + pr_info("\n"); + pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); + pr_info("CPU#%d: status: %016llx\n", cpu, status); + pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); + pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); + } + pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); + rdmsrl(x86_pmu.perfctr + idx, pmc_count); + + prev_left = per_cpu(prev_left[idx], cpu); + + pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", + cpu, idx, pmc_ctrl); + pr_info("CPU#%d: gen-PMC%d count: %016llx\n", + cpu, idx, pmc_count); + pr_info("CPU#%d: gen-PMC%d left: %016llx\n", + cpu, idx, prev_left); + } + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); + + pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", + cpu, idx, pmc_count); + } + local_irq_restore(flags); +} + +static void x86_pmu_disable(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct hw_perf_counter *hwc = &counter->hw; + int idx = hwc->idx; + + /* + * Must be done before we disable, otherwise the nmi handler + * could reenable again: + */ + clear_bit(idx, cpuc->active_mask); + x86_pmu.disable(hwc, idx); + + /* + * Make sure the cleared pointer becomes visible before we + * (potentially) free the counter: + */ + barrier(); + + /* + * Drain the remaining delta count out of a counter + * that we are disabling: + */ + x86_perf_counter_update(counter, hwc, idx); + cpuc->counters[idx] = NULL; + clear_bit(idx, cpuc->used_mask); + + perf_counter_update_userpage(counter); +} + +/* + * Save and restart an expired counter. Called by NMI contexts, + * so it has to be careful about preempting normal counter ops: + */ +static int intel_pmu_save_and_restart(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + int idx = hwc->idx; + int ret; + + x86_perf_counter_update(counter, hwc, idx); + ret = x86_perf_counter_set_period(counter, hwc, idx); + + if (counter->state == PERF_COUNTER_STATE_ACTIVE) + intel_pmu_enable_counter(hwc, idx); + + return ret; +} + +static void intel_pmu_reset(void) +{ + unsigned long flags; + int idx; + + if (!x86_pmu.num_counters) + return; + + local_irq_save(flags); + + printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); + checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); + } + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + } + + local_irq_restore(flags); +} + +static int p6_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_counters *cpuc; + struct perf_counter *counter; + struct hw_perf_counter *hwc; + int idx, handled = 0; + u64 val; + + data.regs = regs; + data.addr = 0; + + cpuc = &__get_cpu_var(cpu_hw_counters); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + counter = cpuc->counters[idx]; + hwc = &counter->hw; + + val = x86_perf_counter_update(counter, hwc, idx); + if (val & (1ULL << (x86_pmu.counter_bits - 1))) + continue; + + /* + * counter overflow + */ + handled = 1; + data.period = counter->hw.last_period; + + if (!x86_perf_counter_set_period(counter, hwc, idx)) + continue; + + if (perf_counter_overflow(counter, 1, &data)) + p6_pmu_disable_counter(hwc, idx); + } + + if (handled) + inc_irq_stat(apic_perf_irqs); + + return handled; +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static int intel_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_counters *cpuc; + int bit, loops; + u64 ack, status; + + data.regs = regs; + data.addr = 0; + + cpuc = &__get_cpu_var(cpu_hw_counters); + + perf_disable(); + status = intel_pmu_get_status(); + if (!status) { + perf_enable(); + return 0; + } + + loops = 0; +again: + if (++loops > 100) { + WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); + perf_counter_print_debug(); + intel_pmu_reset(); + perf_enable(); + return 1; + } + + inc_irq_stat(apic_perf_irqs); + ack = status; + for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_counter *counter = cpuc->counters[bit]; + + clear_bit(bit, (unsigned long *) &status); + if (!test_bit(bit, cpuc->active_mask)) + continue; + + if (!intel_pmu_save_and_restart(counter)) + continue; + + data.period = counter->hw.last_period; + + if (perf_counter_overflow(counter, 1, &data)) + intel_pmu_disable_counter(&counter->hw, bit); + } + + intel_pmu_ack_status(ack); + + /* + * Repeat if there is more work to be done: + */ + status = intel_pmu_get_status(); + if (status) + goto again; + + perf_enable(); + + return 1; +} + +static int amd_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_counters *cpuc; + struct perf_counter *counter; + struct hw_perf_counter *hwc; + int idx, handled = 0; + u64 val; + + data.regs = regs; + data.addr = 0; + + cpuc = &__get_cpu_var(cpu_hw_counters); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + counter = cpuc->counters[idx]; + hwc = &counter->hw; + + val = x86_perf_counter_update(counter, hwc, idx); + if (val & (1ULL << (x86_pmu.counter_bits - 1))) + continue; + + /* + * counter overflow + */ + handled = 1; + data.period = counter->hw.last_period; + + if (!x86_perf_counter_set_period(counter, hwc, idx)) + continue; + + if (perf_counter_overflow(counter, 1, &data)) + amd_pmu_disable_counter(hwc, idx); + } + + if (handled) + inc_irq_stat(apic_perf_irqs); + + return handled; +} + +void smp_perf_pending_interrupt(struct pt_regs *regs) +{ + irq_enter(); + ack_APIC_irq(); + inc_irq_stat(apic_pending_irqs); + perf_counter_do_pending(); + irq_exit(); +} + +void set_perf_counter_pending(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + apic->send_IPI_self(LOCAL_PENDING_VECTOR); +#endif +} + +void perf_counters_lapic_init(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + if (!x86_pmu.apic || !x86_pmu_initialized()) + return; + + /* + * Always use NMI for PMU + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif +} + +static int __kprobes +perf_counter_nmi_handler(struct notifier_block *self, + unsigned long cmd, void *__args) +{ + struct die_args *args = __args; + struct pt_regs *regs; + + if (!atomic_read(&active_counters)) + return NOTIFY_DONE; + + switch (cmd) { + case DIE_NMI: + case DIE_NMI_IPI: + break; + + default: + return NOTIFY_DONE; + } + + regs = args->regs; + +#ifdef CONFIG_X86_LOCAL_APIC + apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif + /* + * Can't rely on the handled return value to say it was our NMI, two + * counters could trigger 'simultaneously' raising two back-to-back NMIs. + * + * If the first NMI handles both, the latter will be empty and daze + * the CPU. + */ + x86_pmu.handle_irq(regs); + + return NOTIFY_STOP; +} + +static __read_mostly struct notifier_block perf_counter_nmi_notifier = { + .notifier_call = perf_counter_nmi_handler, + .next = NULL, + .priority = 1 +}; + +static struct x86_pmu p6_pmu = { + .name = "p6", + .handle_irq = p6_pmu_handle_irq, + .disable_all = p6_pmu_disable_all, + .enable_all = p6_pmu_enable_all, + .enable = p6_pmu_enable_counter, + .disable = p6_pmu_disable_counter, + .eventsel = MSR_P6_EVNTSEL0, + .perfctr = MSR_P6_PERFCTR0, + .event_map = p6_pmu_event_map, + .raw_event = p6_pmu_raw_event, + .max_events = ARRAY_SIZE(p6_perfmon_event_map), + .apic = 1, + .max_period = (1ULL << 31) - 1, + .version = 0, + .num_counters = 2, + /* + * Counters have 40 bits implemented. However they are designed such + * that bits [32-39] are sign extensions of bit 31. As such the + * effective width of a counter for P6-like PMU is 32 bits only. + * + * See IA-32 Intel Architecture Software developer manual Vol 3B + */ + .counter_bits = 32, + .counter_mask = (1ULL << 32) - 1, +}; + +static struct x86_pmu intel_pmu = { + .name = "Intel", + .handle_irq = intel_pmu_handle_irq, + .disable_all = intel_pmu_disable_all, + .enable_all = intel_pmu_enable_all, + .enable = intel_pmu_enable_counter, + .disable = intel_pmu_disable_counter, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .raw_event = intel_pmu_raw_event, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic counter period: + */ + .max_period = (1ULL << 31) - 1, +}; + +static struct x86_pmu amd_pmu = { + .name = "AMD", + .handle_irq = amd_pmu_handle_irq, + .disable_all = amd_pmu_disable_all, + .enable_all = amd_pmu_enable_all, + .enable = amd_pmu_enable_counter, + .disable = amd_pmu_disable_counter, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, + .event_map = amd_pmu_event_map, + .raw_event = amd_pmu_raw_event, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_counters = 4, + .counter_bits = 48, + .counter_mask = (1ULL << 48) - 1, + .apic = 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, +}; + +static int p6_pmu_init(void) +{ + switch (boot_cpu_data.x86_model) { + case 1: + case 3: /* Pentium Pro */ + case 5: + case 6: /* Pentium II */ + case 7: + case 8: + case 11: /* Pentium III */ + break; + case 9: + case 13: + /* Pentium M */ + break; + default: + pr_cont("unsupported p6 CPU model %d ", + boot_cpu_data.x86_model); + return -ENODEV; + } + + x86_pmu = p6_pmu; + + if (!cpu_has_apic) { + pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); + pr_info("no hardware sampling interrupt available.\n"); + x86_pmu.apic = 0; + } + + return 0; +} + +static int intel_pmu_init(void) +{ + union cpuid10_edx edx; + union cpuid10_eax eax; + unsigned int unused; + unsigned int ebx; + int version; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + /* check for P6 processor family */ + if (boot_cpu_data.x86 == 6) { + return p6_pmu_init(); + } else { + return -ENODEV; + } + } + + /* + * Check whether the Architectural PerfMon supports + * Branch Misses Retired Event or not. + */ + cpuid(10, &eax.full, &ebx, &unused, &edx.full); + if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) + return -ENODEV; + + version = eax.split.version_id; + if (version < 2) + return -ENODEV; + + x86_pmu = intel_pmu; + x86_pmu.version = version; + x86_pmu.num_counters = eax.split.num_counters; + x86_pmu.counter_bits = eax.split.bit_width; + x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; + + /* + * Quirk: v2 perfmon does not report fixed-purpose counters, so + * assume at least 3 counters: + */ + x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); + + /* + * Install the hw-cache-events table: + */ + switch (boot_cpu_data.x86_model) { + case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ + case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ + case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ + case 29: /* six-core 45 nm xeon "Dunnington" */ + memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("Core2 events, "); + break; + default: + case 26: + memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("Nehalem/Corei7 events, "); + break; + case 28: + memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("Atom events, "); + break; + } + return 0; +} + +static int amd_pmu_init(void) +{ + /* Performance-monitoring supported from K7 and later: */ + if (boot_cpu_data.x86 < 6) + return -ENODEV; + + x86_pmu = amd_pmu; + + /* Events are common for all AMDs */ + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + return 0; +} + +void __init init_hw_perf_counters(void) +{ + int err; + + pr_info("Performance Counters: "); + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + err = intel_pmu_init(); + break; + case X86_VENDOR_AMD: + err = amd_pmu_init(); + break; + default: + return; + } + if (err != 0) { + pr_cont("no PMU driver, software counters only.\n"); + return; + } + + pr_cont("%s PMU driver.\n", x86_pmu.name); + + if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { + WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", + x86_pmu.num_counters, X86_PMC_MAX_GENERIC); + x86_pmu.num_counters = X86_PMC_MAX_GENERIC; + } + perf_counter_mask = (1 << x86_pmu.num_counters) - 1; + perf_max_counters = x86_pmu.num_counters; + + if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { + WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", + x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; + } + + perf_counter_mask |= + ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; + x86_pmu.intel_ctrl = perf_counter_mask; + + perf_counters_lapic_init(); + register_die_notifier(&perf_counter_nmi_notifier); + + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.counter_bits); + pr_info("... generic counters: %d\n", x86_pmu.num_counters); + pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); + pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed); + pr_info("... counter mask: %016Lx\n", perf_counter_mask); +} + +static inline void x86_pmu_read(struct perf_counter *counter) +{ + x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); +} + +static const struct pmu pmu = { + .enable = x86_pmu_enable, + .disable = x86_pmu_disable, + .read = x86_pmu_read, + .unthrottle = x86_pmu_unthrottle, +}; + +const struct pmu *hw_perf_counter_init(struct perf_counter *counter) +{ + int err; + + err = __hw_perf_counter_init(counter); + if (err) + return ERR_PTR(err); + + return &pmu; +} + +/* + * callchain support + */ + +static inline +void callchain_store(struct perf_callchain_entry *entry, u64 ip) +{ + if (entry->nr < PERF_MAX_STACK_DEPTH) + entry->ip[entry->nr++] = ip; +} + +static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); +static DEFINE_PER_CPU(int, in_nmi_frame); + + +static void +backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + /* Ignore warnings */ +} + +static void backtrace_warning(void *data, char *msg) +{ + /* Ignore warnings */ +} + +static int backtrace_stack(void *data, char *name) +{ + per_cpu(in_nmi_frame, smp_processor_id()) = + x86_is_stack_id(NMI_STACK, name); + + return 0; +} + +static void backtrace_address(void *data, unsigned long addr, int reliable) +{ + struct perf_callchain_entry *entry = data; + + if (per_cpu(in_nmi_frame, smp_processor_id())) + return; + + if (reliable) + callchain_store(entry, addr); +} + +static const struct stacktrace_ops backtrace_ops = { + .warning = backtrace_warning, + .warning_symbol = backtrace_warning_symbol, + .stack = backtrace_stack, + .address = backtrace_address, +}; + +#include "../dumpstack.h" + +static void +perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + callchain_store(entry, PERF_CONTEXT_KERNEL); + callchain_store(entry, regs->ip); + + dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); +} + +/* + * best effort, GUP based copy_from_user() that assumes IRQ or NMI context + */ +static unsigned long +copy_from_user_nmi(void *to, const void __user *from, unsigned long n) +{ + unsigned long offset, addr = (unsigned long)from; + int type = in_nmi() ? KM_NMI : KM_IRQ0; + unsigned long size, len = 0; + struct page *page; + void *map; + int ret; + + do { + ret = __get_user_pages_fast(addr, 1, 0, &page); + if (!ret) + break; + + offset = addr & (PAGE_SIZE - 1); + size = min(PAGE_SIZE - offset, n - len); + + map = kmap_atomic(page, type); + memcpy(to, map+offset, size); + kunmap_atomic(map, type); + put_page(page); + + len += size; + to += size; + addr += size; + + } while (len < n); + + return len; +} + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ + unsigned long bytes; + + bytes = copy_from_user_nmi(frame, fp, sizeof(*frame)); + + return bytes == sizeof(*frame); +} + +static void +perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + struct stack_frame frame; + const void __user *fp; + + if (!user_mode(regs)) + regs = task_pt_regs(current); + + fp = (void __user *)regs->bp; + + callchain_store(entry, PERF_CONTEXT_USER); + callchain_store(entry, regs->ip); + + while (entry->nr < PERF_MAX_STACK_DEPTH) { + frame.next_frame = NULL; + frame.return_address = 0; + + if (!copy_stack_frame(fp, &frame)) + break; + + if ((unsigned long)fp < regs->sp) + break; + + callchain_store(entry, frame.return_address); + fp = frame.next_frame; + } +} + +static void +perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + int is_user; + + if (!regs) + return; + + is_user = user_mode(regs); + + if (!current || current->pid == 0) + return; + + if (is_user && current->state != TASK_RUNNING) + return; + + if (!is_user) + perf_callchain_kernel(regs, entry); + + if (current->mm) + perf_callchain_user(regs, entry); +} + +struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + struct perf_callchain_entry *entry; + + if (in_nmi()) + entry = &__get_cpu_var(nmi_entry); + else + entry = &__get_cpu_var(irq_entry); + + entry->nr = 0; + + perf_do_callchain(regs, entry); + + return entry; +} diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index f6c70a164e3..e60ed740d2b 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -19,8 +19,8 @@ #include <linux/nmi.h> #include <linux/kprobes.h> -#include <asm/genapic.h> -#include <asm/intel_arch_perfmon.h> +#include <asm/apic.h> +#include <asm/perf_counter.h> struct nmi_watchdog_ctlblk { unsigned int cccr_msr; @@ -716,11 +716,15 @@ static void probe_nmi_watchdog(void) wd_ops = &k7_wd_ops; break; case X86_VENDOR_INTEL: - /* - * Work around Core Duo (Yonah) errata AE49 where perfctr1 - * doesn't have a working enable bit. + /* Work around where perfctr1 doesn't have a working enable + * bit as described in the following errata: + * AE49 Core Duo and Intel Core Solo 65 nm + * AN49 Intel Pentium Dual-Core + * AF49 Dual-Core Intel Xeon Processor LV */ - if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) { + if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) || + ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 && + boot_cpu_data.x86_mask == 4))) { intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0; intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0; } @@ -799,8 +803,3 @@ int __kprobes lapic_wd_event(unsigned nmi_hz) wd_ops->rearm(wd, nmi_hz); return 1; } - -int lapic_watchdog_ok(void) -{ - return wd_ops != NULL; -} diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 2ac1f0c2beb..b07af886124 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -182,6 +182,11 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier = .notifier_call = cpuid_class_cpu_callback, }; +static char *cpuid_nodename(struct device *dev) +{ + return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); +} + static int __init cpuid_init(void) { int i, err = 0; @@ -198,6 +203,7 @@ static int __init cpuid_init(void) err = PTR_ERR(cpuid_class); goto out_chrdev; } + cpuid_class->nodename = cpuid_nodename; for_each_online_cpu(i) { err = cpuid_device_create(i); if (err != 0) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index ff958248e61..5e409dc298a 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -27,6 +27,7 @@ #include <asm/cpu.h> #include <asm/reboot.h> #include <asm/virtext.h> +#include <asm/iommu.h> #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) @@ -103,5 +104,10 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #ifdef CONFIG_HPET_TIMER hpet_disable(); #endif + +#ifdef CONFIG_X86_64 + pci_iommu_shutdown(); +#endif + crash_save_cpu(regs, safe_smp_processor_id()); } diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 87b67e3a765..48bfe138603 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -19,45 +19,61 @@ * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009 */ - -#include <asm/ds.h> - -#include <linux/errno.h> +#include <linux/kernel.h> #include <linux/string.h> -#include <linux/slab.h> +#include <linux/errno.h> #include <linux/sched.h> +#include <linux/slab.h> #include <linux/mm.h> -#include <linux/kernel.h> +#include <linux/trace_clock.h> + +#include <asm/ds.h> +#include "ds_selftest.h" /* - * The configuration for a particular DS hardware implementation. + * The configuration for a particular DS hardware implementation: */ struct ds_configuration { - /* the name of the configuration */ - const char *name; - /* the size of one pointer-typed field in the DS structure and - in the BTS and PEBS buffers in bytes; - this covers the first 8 DS fields related to buffer management. */ - unsigned char sizeof_field; - /* the size of a BTS/PEBS record in bytes */ - unsigned char sizeof_rec[2]; - /* a series of bit-masks to control various features indexed - * by enum ds_feature */ - unsigned long ctl[dsf_ctl_max]; + /* The name of the configuration: */ + const char *name; + + /* The size of pointer-typed fields in DS, BTS, and PEBS: */ + unsigned char sizeof_ptr_field; + + /* The size of a BTS/PEBS record in bytes: */ + unsigned char sizeof_rec[2]; + + /* The number of pebs counter reset values in the DS structure. */ + unsigned char nr_counter_reset; + + /* Control bit-masks indexed by enum ds_feature: */ + unsigned long ctl[dsf_ctl_max]; }; -static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); +static struct ds_configuration ds_cfg __read_mostly; + + +/* Maximal size of a DS configuration: */ +#define MAX_SIZEOF_DS 0x80 -#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) +/* Maximal size of a BTS record: */ +#define MAX_SIZEOF_BTS (3 * 8) -#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */ -#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */ -#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */ +/* BTS and PEBS buffer alignment: */ +#define DS_ALIGNMENT (1 << 3) -#define BTS_CONTROL \ - (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\ - ds_cfg.ctl[dsf_bts_overflow]) +/* Number of buffer pointers in DS: */ +#define NUM_DS_PTR_FIELDS 8 +/* Size of a pebs reset value in DS: */ +#define PEBS_RESET_FIELD_SIZE 8 + +/* Mask of control bits in the DS MSR register: */ +#define BTS_CONTROL \ + ( ds_cfg.ctl[dsf_bts] | \ + ds_cfg.ctl[dsf_bts_kernel] | \ + ds_cfg.ctl[dsf_bts_user] | \ + ds_cfg.ctl[dsf_bts_overflow] ) /* * A BTS or PEBS tracer. @@ -66,29 +82,36 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); * to identify tracers. */ struct ds_tracer { - /* the DS context (partially) owned by this tracer */ - struct ds_context *context; - /* the buffer provided on ds_request() and its size in bytes */ - void *buffer; - size_t size; + /* The DS context (partially) owned by this tracer. */ + struct ds_context *context; + /* The buffer provided on ds_request() and its size in bytes. */ + void *buffer; + size_t size; }; struct bts_tracer { - /* the common DS part */ - struct ds_tracer ds; - /* the trace including the DS configuration */ - struct bts_trace trace; - /* buffer overflow notification function */ - bts_ovfl_callback_t ovfl; + /* The common DS part: */ + struct ds_tracer ds; + + /* The trace including the DS configuration: */ + struct bts_trace trace; + + /* Buffer overflow notification function: */ + bts_ovfl_callback_t ovfl; + + /* Active flags affecting trace collection. */ + unsigned int flags; }; struct pebs_tracer { - /* the common DS part */ - struct ds_tracer ds; - /* the trace including the DS configuration */ - struct pebs_trace trace; - /* buffer overflow notification function */ - pebs_ovfl_callback_t ovfl; + /* The common DS part: */ + struct ds_tracer ds; + + /* The trace including the DS configuration: */ + struct pebs_trace trace; + + /* Buffer overflow notification function: */ + pebs_ovfl_callback_t ovfl; }; /* @@ -97,6 +120,7 @@ struct pebs_tracer { * * The DS configuration consists of the following fields; different * architetures vary in the size of those fields. + * * - double-word aligned base linear address of the BTS buffer * - write pointer into the BTS buffer * - end linear address of the BTS buffer (one byte beyond the end of @@ -135,21 +159,22 @@ enum ds_field { }; enum ds_qualifier { - ds_bts = 0, + ds_bts = 0, ds_pebs }; -static inline unsigned long ds_get(const unsigned char *base, - enum ds_qualifier qual, enum ds_field field) +static inline unsigned long +ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field) { - base += (ds_cfg.sizeof_field * (field + (4 * qual))); + base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); return *(unsigned long *)base; } -static inline void ds_set(unsigned char *base, enum ds_qualifier qual, - enum ds_field field, unsigned long value) +static inline void +ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field, + unsigned long value) { - base += (ds_cfg.sizeof_field * (field + (4 * qual))); + base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); (*(unsigned long *)base) = value; } @@ -159,7 +184,6 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual, */ static DEFINE_SPINLOCK(ds_lock); - /* * We either support (system-wide) per-cpu or per-thread allocation. * We distinguish the two based on the task_struct pointer, where a @@ -178,12 +202,28 @@ static DEFINE_SPINLOCK(ds_lock); */ static atomic_t tracers = ATOMIC_INIT(0); -static inline void get_tracer(struct task_struct *task) +static inline int get_tracer(struct task_struct *task) { - if (task) + int error; + + spin_lock_irq(&ds_lock); + + if (task) { + error = -EPERM; + if (atomic_read(&tracers) < 0) + goto out; atomic_inc(&tracers); - else + } else { + error = -EPERM; + if (atomic_read(&tracers) > 0) + goto out; atomic_dec(&tracers); + } + + error = 0; +out: + spin_unlock_irq(&ds_lock); + return error; } static inline void put_tracer(struct task_struct *task) @@ -194,14 +234,6 @@ static inline void put_tracer(struct task_struct *task) atomic_inc(&tracers); } -static inline int check_tracer(struct task_struct *task) -{ - return task ? - (atomic_read(&tracers) >= 0) : - (atomic_read(&tracers) <= 0); -} - - /* * The DS context is either attached to a thread or to a cpu: * - in the former case, the thread_struct contains a pointer to the @@ -213,61 +245,58 @@ static inline int check_tracer(struct task_struct *task) * deallocated when the last user puts the context. */ struct ds_context { - /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ - unsigned char ds[MAX_SIZEOF_DS]; - /* the owner of the BTS and PEBS configuration, respectively */ - struct bts_tracer *bts_master; - struct pebs_tracer *pebs_master; - /* use count */ - unsigned long count; - /* a pointer to the context location inside the thread_struct - * or the per_cpu context array */ - struct ds_context **this; - /* a pointer to the task owning this context, or NULL, if the - * context is owned by a cpu */ - struct task_struct *task; -}; + /* The DS configuration; goes into MSR_IA32_DS_AREA: */ + unsigned char ds[MAX_SIZEOF_DS]; + + /* The owner of the BTS and PEBS configuration, respectively: */ + struct bts_tracer *bts_master; + struct pebs_tracer *pebs_master; -static DEFINE_PER_CPU(struct ds_context *, system_context_array); + /* Use count: */ + unsigned long count; -#define system_context per_cpu(system_context_array, smp_processor_id()) + /* Pointer to the context pointer field: */ + struct ds_context **this; + + /* The traced task; NULL for cpu tracing: */ + struct task_struct *task; + + /* The traced cpu; only valid if task is NULL: */ + int cpu; +}; +static DEFINE_PER_CPU(struct ds_context *, cpu_context); -static inline struct ds_context *ds_get_context(struct task_struct *task) + +static struct ds_context *ds_get_context(struct task_struct *task, int cpu) { struct ds_context **p_context = - (task ? &task->thread.ds_ctx : &system_context); + (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu)); struct ds_context *context = NULL; struct ds_context *new_context = NULL; - unsigned long irq; /* Chances are small that we already have a context. */ new_context = kzalloc(sizeof(*new_context), GFP_KERNEL); if (!new_context) return NULL; - spin_lock_irqsave(&ds_lock, irq); + spin_lock_irq(&ds_lock); context = *p_context; - if (!context) { + if (likely(!context)) { context = new_context; context->this = p_context; context->task = task; + context->cpu = cpu; context->count = 0; - if (task) - set_tsk_thread_flag(task, TIF_DS_AREA_MSR); - - if (!task || (task == current)) - wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds); - *p_context = context; } context->count++; - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); if (context != new_context) kfree(new_context); @@ -275,8 +304,9 @@ static inline struct ds_context *ds_get_context(struct task_struct *task) return context; } -static inline void ds_put_context(struct ds_context *context) +static void ds_put_context(struct ds_context *context) { + struct task_struct *task; unsigned long irq; if (!context) @@ -291,17 +321,55 @@ static inline void ds_put_context(struct ds_context *context) *(context->this) = NULL; - if (context->task) - clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); + task = context->task; + + if (task) + clear_tsk_thread_flag(task, TIF_DS_AREA_MSR); - if (!context->task || (context->task == current)) - wrmsrl(MSR_IA32_DS_AREA, 0); + /* + * We leave the (now dangling) pointer to the DS configuration in + * the DS_AREA msr. This is as good or as bad as replacing it with + * NULL - the hardware would crash if we enabled tracing. + * + * This saves us some problems with having to write an msr on a + * different cpu while preventing others from doing the same for the + * next context for that same cpu. + */ spin_unlock_irqrestore(&ds_lock, irq); + /* The context might still be in use for context switching. */ + if (task && (task != current)) + wait_task_context_switch(task); + kfree(context); } +static void ds_install_ds_area(struct ds_context *context) +{ + unsigned long ds; + + ds = (unsigned long)context->ds; + + /* + * There is a race between the bts master and the pebs master. + * + * The thread/cpu access is synchronized via get/put_cpu() for + * task tracing and via wrmsr_on_cpu for cpu tracing. + * + * If bts and pebs are collected for the same task or same cpu, + * the same confiuration is written twice. + */ + if (context->task) { + get_cpu(); + if (context->task == current) + wrmsrl(MSR_IA32_DS_AREA, ds); + set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); + put_cpu(); + } else + wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA, + (u32)((u64)ds), (u32)((u64)ds >> 32)); +} /* * Call the tracer's callback on a buffer overflow. @@ -332,9 +400,9 @@ static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) * The remainder of any partially written record is zeroed out. * * context: the DS context - * qual: the buffer type - * record: the data to write - * size: the size of the data + * qual: the buffer type + * record: the data to write + * size: the size of the data */ static int ds_write(struct ds_context *context, enum ds_qualifier qual, const void *record, size_t size) @@ -349,14 +417,14 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, unsigned long write_size, adj_write_size; /* - * write as much as possible without producing an + * Write as much as possible without producing an * overflow interrupt. * - * interrupt_threshold must either be + * Interrupt_threshold must either be * - bigger than absolute_maximum or * - point to a record between buffer_base and absolute_maximum * - * index points to a valid record. + * Index points to a valid record. */ base = ds_get(context->ds, qual, ds_buffer_base); index = ds_get(context->ds, qual, ds_index); @@ -365,8 +433,10 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, write_end = min(end, int_th); - /* if we are already beyond the interrupt threshold, - * we fill the entire buffer */ + /* + * If we are already beyond the interrupt threshold, + * we fill the entire buffer. + */ if (write_end <= index) write_end = end; @@ -383,7 +453,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; adj_write_size *= ds_cfg.sizeof_rec[qual]; - /* zero out trailing bytes */ + /* Zero out trailing bytes. */ memset((char *)index + write_size, 0, adj_write_size - write_size); index += adj_write_size; @@ -410,7 +480,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, * Later architectures use 64bit pointers throughout, whereas earlier * architectures use 32bit pointers in 32bit mode. * - * We compute the base address for the first 8 fields based on: + * We compute the base address for the fields based on: * - the field size stored in the DS configuration * - the relative field position * @@ -431,23 +501,23 @@ enum bts_field { bts_to, bts_flags, - bts_qual = bts_from, - bts_jiffies = bts_to, - bts_pid = bts_flags, + bts_qual = bts_from, + bts_clock = bts_to, + bts_pid = bts_flags, - bts_qual_mask = (bts_qual_max - 1), - bts_escape = ((unsigned long)-1 & ~bts_qual_mask) + bts_qual_mask = (bts_qual_max - 1), + bts_escape = ((unsigned long)-1 & ~bts_qual_mask) }; static inline unsigned long bts_get(const char *base, enum bts_field field) { - base += (ds_cfg.sizeof_field * field); + base += (ds_cfg.sizeof_ptr_field * field); return *(unsigned long *)base; } static inline void bts_set(char *base, enum bts_field field, unsigned long val) { - base += (ds_cfg.sizeof_field * field);; + base += (ds_cfg.sizeof_ptr_field * field);; (*(unsigned long *)base) = val; } @@ -463,8 +533,8 @@ static inline void bts_set(char *base, enum bts_field field, unsigned long val) * * return: bytes read/written on success; -Eerrno, otherwise */ -static int bts_read(struct bts_tracer *tracer, const void *at, - struct bts_struct *out) +static int +bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out) { if (!tracer) return -EINVAL; @@ -478,8 +548,8 @@ static int bts_read(struct bts_tracer *tracer, const void *at, memset(out, 0, sizeof(*out)); if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) { out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask); - out->variant.timestamp.jiffies = bts_get(at, bts_jiffies); - out->variant.timestamp.pid = bts_get(at, bts_pid); + out->variant.event.clock = bts_get(at, bts_clock); + out->variant.event.pid = bts_get(at, bts_pid); } else { out->qualifier = bts_branch; out->variant.lbr.from = bts_get(at, bts_from); @@ -516,8 +586,8 @@ static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in) case bts_task_arrives: case bts_task_departs: bts_set(raw, bts_qual, (bts_escape | in->qualifier)); - bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies); - bts_set(raw, bts_pid, in->variant.timestamp.pid); + bts_set(raw, bts_clock, in->variant.event.clock); + bts_set(raw, bts_pid, in->variant.event.pid); break; default: return -EINVAL; @@ -555,7 +625,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, unsigned int flags) { unsigned long buffer, adj; - /* adjust the buffer address and size to meet alignment + /* + * Adjust the buffer address and size to meet alignment * constraints: * - buffer is double-word aligned * - size is multiple of record size @@ -577,9 +648,11 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, trace->begin = (void *)buffer; trace->top = trace->begin; trace->end = (void *)(buffer + size); - /* The value for 'no threshold' is -1, which will set the + /* + * The value for 'no threshold' is -1, which will set the * threshold outside of the buffer, just like we want it. */ + ith *= ds_cfg.sizeof_rec[qual]; trace->ith = (void *)(buffer + size - ith); trace->flags = flags; @@ -588,18 +661,27 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, enum ds_qualifier qual, struct task_struct *task, - void *base, size_t size, size_t th, unsigned int flags) + int cpu, void *base, size_t size, size_t th) { struct ds_context *context; int error; + size_t req_size; + + error = -EOPNOTSUPP; + if (!ds_cfg.sizeof_rec[qual]) + goto out; error = -EINVAL; if (!base) goto out; - /* we require some space to do alignment adjustments below */ + req_size = ds_cfg.sizeof_rec[qual]; + /* We might need space for alignment adjustments. */ + if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT)) + req_size += DS_ALIGNMENT; + error = -EINVAL; - if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) + if (size < req_size) goto out; if (th != (size_t)-1) { @@ -614,182 +696,318 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, tracer->size = size; error = -ENOMEM; - context = ds_get_context(task); + context = ds_get_context(task, cpu); if (!context) goto out; tracer->context = context; - ds_init_ds_trace(trace, qual, base, size, th, flags); + /* + * Defer any tracer-specific initialization work for the context until + * context ownership has been clarified. + */ error = 0; out: return error; } -struct bts_tracer *ds_request_bts(struct task_struct *task, - void *base, size_t size, - bts_ovfl_callback_t ovfl, size_t th, - unsigned int flags) +static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu, + void *base, size_t size, + bts_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { struct bts_tracer *tracer; - unsigned long irq; int error; + /* Buffer overflow notification is not yet implemented. */ error = -EOPNOTSUPP; - if (!ds_cfg.ctl[dsf_bts]) + if (ovfl) goto out; - /* buffer overflow notification is not yet implemented */ - error = -EOPNOTSUPP; - if (ovfl) + error = get_tracer(task); + if (error < 0) goto out; error = -ENOMEM; tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); if (!tracer) - goto out; + goto out_put_tracer; tracer->ovfl = ovfl; + /* Do some more error checking and acquire a tracing context. */ error = ds_request(&tracer->ds, &tracer->trace.ds, - ds_bts, task, base, size, th, flags); + ds_bts, task, cpu, base, size, th); if (error < 0) goto out_tracer; - - spin_lock_irqsave(&ds_lock, irq); - - error = -EPERM; - if (!check_tracer(task)) - goto out_unlock; - get_tracer(task); + /* Claim the bts part of the tracing context we acquired above. */ + spin_lock_irq(&ds_lock); error = -EPERM; if (tracer->ds.context->bts_master) - goto out_put_tracer; + goto out_unlock; tracer->ds.context->bts_master = tracer; - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); + /* + * Now that we own the bts part of the context, let's complete the + * initialization for that part. + */ + ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags); + ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + ds_install_ds_area(tracer->ds.context); tracer->trace.read = bts_read; tracer->trace.write = bts_write; - ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + /* Start tracing. */ ds_resume_bts(tracer); return tracer; - out_put_tracer: - put_tracer(task); out_unlock: - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); + out_put_tracer: + put_tracer(task); out: return ERR_PTR(error); } -struct pebs_tracer *ds_request_pebs(struct task_struct *task, - void *base, size_t size, - pebs_ovfl_callback_t ovfl, size_t th, - unsigned int flags) +struct bts_tracer *ds_request_bts_task(struct task_struct *task, + void *base, size_t size, + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags) +{ + return ds_request_bts(task, 0, base, size, ovfl, th, flags); +} + +struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size, + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags) +{ + return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags); +} + +static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { struct pebs_tracer *tracer; - unsigned long irq; int error; - /* buffer overflow notification is not yet implemented */ + /* Buffer overflow notification is not yet implemented. */ error = -EOPNOTSUPP; if (ovfl) goto out; + error = get_tracer(task); + if (error < 0) + goto out; + error = -ENOMEM; tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); if (!tracer) - goto out; + goto out_put_tracer; tracer->ovfl = ovfl; + /* Do some more error checking and acquire a tracing context. */ error = ds_request(&tracer->ds, &tracer->trace.ds, - ds_pebs, task, base, size, th, flags); + ds_pebs, task, cpu, base, size, th); if (error < 0) goto out_tracer; - spin_lock_irqsave(&ds_lock, irq); - - error = -EPERM; - if (!check_tracer(task)) - goto out_unlock; - get_tracer(task); + /* Claim the pebs part of the tracing context we acquired above. */ + spin_lock_irq(&ds_lock); error = -EPERM; if (tracer->ds.context->pebs_master) - goto out_put_tracer; + goto out_unlock; tracer->ds.context->pebs_master = tracer; - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); + /* + * Now that we own the pebs part of the context, let's complete the + * initialization for that part. + */ + ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags); ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); + ds_install_ds_area(tracer->ds.context); + + /* Start tracing. */ ds_resume_pebs(tracer); return tracer; - out_put_tracer: - put_tracer(task); out_unlock: - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); + out_put_tracer: + put_tracer(task); out: return ERR_PTR(error); } -void ds_release_bts(struct bts_tracer *tracer) +struct pebs_tracer *ds_request_pebs_task(struct task_struct *task, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th, unsigned int flags) { - if (!tracer) - return; + return ds_request_pebs(task, 0, base, size, ovfl, th, flags); +} - ds_suspend_bts(tracer); +struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th, unsigned int flags) +{ + return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags); +} + +static void ds_free_bts(struct bts_tracer *tracer) +{ + struct task_struct *task; + + task = tracer->ds.context->task; WARN_ON_ONCE(tracer->ds.context->bts_master != tracer); tracer->ds.context->bts_master = NULL; - put_tracer(tracer->ds.context->task); + /* Make sure tracing stopped and the tracer is not in use. */ + if (task && (task != current)) + wait_task_context_switch(task); + ds_put_context(tracer->ds.context); + put_tracer(task); kfree(tracer); } +void ds_release_bts(struct bts_tracer *tracer) +{ + might_sleep(); + + if (!tracer) + return; + + ds_suspend_bts(tracer); + ds_free_bts(tracer); +} + +int ds_release_bts_noirq(struct bts_tracer *tracer) +{ + struct task_struct *task; + unsigned long irq; + int error; + + if (!tracer) + return 0; + + task = tracer->ds.context->task; + + local_irq_save(irq); + + error = -EPERM; + if (!task && + (tracer->ds.context->cpu != smp_processor_id())) + goto out; + + error = -EPERM; + if (task && (task != current)) + goto out; + + ds_suspend_bts_noirq(tracer); + ds_free_bts(tracer); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + +static void update_task_debugctlmsr(struct task_struct *task, + unsigned long debugctlmsr) +{ + task->thread.debugctlmsr = debugctlmsr; + + get_cpu(); + if (task == current) + update_debugctlmsr(debugctlmsr); + put_cpu(); +} + void ds_suspend_bts(struct bts_tracer *tracer) { struct task_struct *task; + unsigned long debugctlmsr; + int cpu; if (!tracer) return; + tracer->flags = 0; + task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; - if (!task || (task == current)) - update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL); + WARN_ON(!task && irqs_disabled()); - if (task) { - task->thread.debugctlmsr &= ~BTS_CONTROL; + debugctlmsr = (task ? + task->thread.debugctlmsr : + get_debugctlmsr_on_cpu(cpu)); + debugctlmsr &= ~BTS_CONTROL; - if (!task->thread.debugctlmsr) - clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); - } + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr_on_cpu(cpu, debugctlmsr); } -void ds_resume_bts(struct bts_tracer *tracer) +int ds_suspend_bts_noirq(struct bts_tracer *tracer) { struct task_struct *task; - unsigned long control; + unsigned long debugctlmsr, irq; + int cpu, error = 0; if (!tracer) - return; + return 0; + + tracer->flags = 0; task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; + + local_irq_save(irq); + + error = -EPERM; + if (!task && (cpu != smp_processor_id())) + goto out; + + debugctlmsr = (task ? + task->thread.debugctlmsr : + get_debugctlmsr()); + debugctlmsr &= ~BTS_CONTROL; + + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr(debugctlmsr); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + +static unsigned long ds_bts_control(struct bts_tracer *tracer) +{ + unsigned long control; control = ds_cfg.ctl[dsf_bts]; if (!(tracer->trace.ds.flags & BTS_KERNEL)) @@ -797,41 +1015,149 @@ void ds_resume_bts(struct bts_tracer *tracer) if (!(tracer->trace.ds.flags & BTS_USER)) control |= ds_cfg.ctl[dsf_bts_user]; - if (task) { - task->thread.debugctlmsr |= control; - set_tsk_thread_flag(task, TIF_DEBUGCTLMSR); - } - - if (!task || (task == current)) - update_debugctlmsr(get_debugctlmsr() | control); + return control; } -void ds_release_pebs(struct pebs_tracer *tracer) +void ds_resume_bts(struct bts_tracer *tracer) { + struct task_struct *task; + unsigned long debugctlmsr; + int cpu; + if (!tracer) return; - ds_suspend_pebs(tracer); + tracer->flags = tracer->trace.ds.flags; + + task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; + + WARN_ON(!task && irqs_disabled()); + + debugctlmsr = (task ? + task->thread.debugctlmsr : + get_debugctlmsr_on_cpu(cpu)); + debugctlmsr |= ds_bts_control(tracer); + + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr_on_cpu(cpu, debugctlmsr); +} + +int ds_resume_bts_noirq(struct bts_tracer *tracer) +{ + struct task_struct *task; + unsigned long debugctlmsr, irq; + int cpu, error = 0; + + if (!tracer) + return 0; + + tracer->flags = tracer->trace.ds.flags; + + task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; + + local_irq_save(irq); + + error = -EPERM; + if (!task && (cpu != smp_processor_id())) + goto out; + + debugctlmsr = (task ? + task->thread.debugctlmsr : + get_debugctlmsr()); + debugctlmsr |= ds_bts_control(tracer); + + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr(debugctlmsr); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + +static void ds_free_pebs(struct pebs_tracer *tracer) +{ + struct task_struct *task; + + task = tracer->ds.context->task; WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer); tracer->ds.context->pebs_master = NULL; - put_tracer(tracer->ds.context->task); ds_put_context(tracer->ds.context); + put_tracer(task); kfree(tracer); } +void ds_release_pebs(struct pebs_tracer *tracer) +{ + might_sleep(); + + if (!tracer) + return; + + ds_suspend_pebs(tracer); + ds_free_pebs(tracer); +} + +int ds_release_pebs_noirq(struct pebs_tracer *tracer) +{ + struct task_struct *task; + unsigned long irq; + int error; + + if (!tracer) + return 0; + + task = tracer->ds.context->task; + + local_irq_save(irq); + + error = -EPERM; + if (!task && + (tracer->ds.context->cpu != smp_processor_id())) + goto out; + + error = -EPERM; + if (task && (task != current)) + goto out; + + ds_suspend_pebs_noirq(tracer); + ds_free_pebs(tracer); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + void ds_suspend_pebs(struct pebs_tracer *tracer) { } +int ds_suspend_pebs_noirq(struct pebs_tracer *tracer) +{ + return 0; +} + void ds_resume_pebs(struct pebs_tracer *tracer) { } +int ds_resume_pebs_noirq(struct pebs_tracer *tracer) +{ + return 0; +} + const struct bts_trace *ds_read_bts(struct bts_tracer *tracer) { if (!tracer) @@ -847,8 +1173,12 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer) return NULL; ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); - tracer->trace.reset_value = - *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); + + tracer->trace.counters = ds_cfg.nr_counter_reset; + memcpy(tracer->trace.counter_reset, + tracer->ds.context->ds + + (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field), + ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE); return &tracer->trace; } @@ -873,18 +1203,24 @@ int ds_reset_pebs(struct pebs_tracer *tracer) tracer->trace.ds.top = tracer->trace.ds.begin; - ds_set(tracer->ds.context->ds, ds_bts, ds_index, + ds_set(tracer->ds.context->ds, ds_pebs, ds_index, (unsigned long)tracer->trace.ds.top); return 0; } -int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) +int ds_set_pebs_reset(struct pebs_tracer *tracer, + unsigned int counter, u64 value) { if (!tracer) return -EINVAL; - *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value; + if (ds_cfg.nr_counter_reset < counter) + return -EINVAL; + + *(u64 *)(tracer->ds.context->ds + + (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) + + (counter * PEBS_RESET_FIELD_SIZE)) = value; return 0; } @@ -894,73 +1230,117 @@ static const struct ds_configuration ds_cfg_netburst = { .ctl[dsf_bts] = (1 << 2) | (1 << 3), .ctl[dsf_bts_kernel] = (1 << 5), .ctl[dsf_bts_user] = (1 << 6), - - .sizeof_field = sizeof(long), - .sizeof_rec[ds_bts] = sizeof(long) * 3, -#ifdef __i386__ - .sizeof_rec[ds_pebs] = sizeof(long) * 10, -#else - .sizeof_rec[ds_pebs] = sizeof(long) * 18, -#endif + .nr_counter_reset = 1, }; static const struct ds_configuration ds_cfg_pentium_m = { .name = "Pentium M", .ctl[dsf_bts] = (1 << 6) | (1 << 7), - - .sizeof_field = sizeof(long), - .sizeof_rec[ds_bts] = sizeof(long) * 3, -#ifdef __i386__ - .sizeof_rec[ds_pebs] = sizeof(long) * 10, -#else - .sizeof_rec[ds_pebs] = sizeof(long) * 18, -#endif + .nr_counter_reset = 1, }; static const struct ds_configuration ds_cfg_core2_atom = { .name = "Core 2/Atom", .ctl[dsf_bts] = (1 << 6) | (1 << 7), .ctl[dsf_bts_kernel] = (1 << 9), .ctl[dsf_bts_user] = (1 << 10), - - .sizeof_field = 8, - .sizeof_rec[ds_bts] = 8 * 3, - .sizeof_rec[ds_pebs] = 8 * 18, + .nr_counter_reset = 1, +}; +static const struct ds_configuration ds_cfg_core_i7 = { + .name = "Core i7", + .ctl[dsf_bts] = (1 << 6) | (1 << 7), + .ctl[dsf_bts_kernel] = (1 << 9), + .ctl[dsf_bts_user] = (1 << 10), + .nr_counter_reset = 4, }; static void -ds_configure(const struct ds_configuration *cfg) +ds_configure(const struct ds_configuration *cfg, + struct cpuinfo_x86 *cpu) { + unsigned long nr_pebs_fields = 0; + + printk(KERN_INFO "[ds] using %s configuration\n", cfg->name); + +#ifdef __i386__ + nr_pebs_fields = 10; +#else + nr_pebs_fields = 18; +#endif + + /* + * Starting with version 2, architectural performance + * monitoring supports a format specifier. + */ + if ((cpuid_eax(0xa) & 0xff) > 1) { + unsigned long perf_capabilities, format; + + rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities); + + format = (perf_capabilities >> 8) & 0xf; + + switch (format) { + case 0: + nr_pebs_fields = 18; + break; + case 1: + nr_pebs_fields = 22; + break; + default: + printk(KERN_INFO + "[ds] unknown PEBS format: %lu\n", format); + nr_pebs_fields = 0; + break; + } + } + memset(&ds_cfg, 0, sizeof(ds_cfg)); ds_cfg = *cfg; - printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name); + ds_cfg.sizeof_ptr_field = + (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4); + + ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3; + ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields; - if (!cpu_has_bts) { - ds_cfg.ctl[dsf_bts] = 0; + if (!cpu_has(cpu, X86_FEATURE_BTS)) { + ds_cfg.sizeof_rec[ds_bts] = 0; printk(KERN_INFO "[ds] bts not available\n"); } - if (!cpu_has_pebs) + if (!cpu_has(cpu, X86_FEATURE_PEBS)) { + ds_cfg.sizeof_rec[ds_pebs] = 0; printk(KERN_INFO "[ds] pebs not available\n"); + } + + printk(KERN_INFO "[ds] sizes: address: %u bit, ", + 8 * ds_cfg.sizeof_ptr_field); + printk("bts/pebs record: %u/%u bytes\n", + ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]); - WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field)); + WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset); } void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) { + /* Only configure the first cpu. Others are identical. */ + if (ds_cfg.name) + return; + switch (c->x86) { case 0x6: switch (c->x86_model) { case 0x9: case 0xd: /* Pentium M */ - ds_configure(&ds_cfg_pentium_m); + ds_configure(&ds_cfg_pentium_m, c); break; case 0xf: case 0x17: /* Core2 */ case 0x1c: /* Atom */ - ds_configure(&ds_cfg_core2_atom); + ds_configure(&ds_cfg_core2_atom, c); + break; + case 0x1a: /* Core i7 */ + ds_configure(&ds_cfg_core_i7, c); break; - case 0x1a: /* i7 */ default: - /* sorry, don't know about them */ + /* Sorry, don't know about them. */ break; } break; @@ -969,64 +1349,89 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) case 0x0: case 0x1: case 0x2: /* Netburst */ - ds_configure(&ds_cfg_netburst); + ds_configure(&ds_cfg_netburst, c); break; default: - /* sorry, don't know about them */ + /* Sorry, don't know about them. */ break; } break; default: - /* sorry, don't know about them */ + /* Sorry, don't know about them. */ break; } } +static inline void ds_take_timestamp(struct ds_context *context, + enum bts_qualifier qualifier, + struct task_struct *task) +{ + struct bts_tracer *tracer = context->bts_master; + struct bts_struct ts; + + /* Prevent compilers from reading the tracer pointer twice. */ + barrier(); + + if (!tracer || !(tracer->flags & BTS_TIMESTAMPS)) + return; + + memset(&ts, 0, sizeof(ts)); + ts.qualifier = qualifier; + ts.variant.event.clock = trace_clock_global(); + ts.variant.event.pid = task->pid; + + bts_write(tracer, &ts); +} + /* * Change the DS configuration from tracing prev to tracing next. */ void ds_switch_to(struct task_struct *prev, struct task_struct *next) { - struct ds_context *prev_ctx = prev->thread.ds_ctx; - struct ds_context *next_ctx = next->thread.ds_ctx; + struct ds_context *prev_ctx = prev->thread.ds_ctx; + struct ds_context *next_ctx = next->thread.ds_ctx; + unsigned long debugctlmsr = next->thread.debugctlmsr; + + /* Make sure all data is read before we start. */ + barrier(); if (prev_ctx) { update_debugctlmsr(0); - if (prev_ctx->bts_master && - (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { - struct bts_struct ts = { - .qualifier = bts_task_departs, - .variant.timestamp.jiffies = jiffies_64, - .variant.timestamp.pid = prev->pid - }; - bts_write(prev_ctx->bts_master, &ts); - } + ds_take_timestamp(prev_ctx, bts_task_departs, prev); } if (next_ctx) { - if (next_ctx->bts_master && - (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { - struct bts_struct ts = { - .qualifier = bts_task_arrives, - .variant.timestamp.jiffies = jiffies_64, - .variant.timestamp.pid = next->pid - }; - bts_write(next_ctx->bts_master, &ts); - } + ds_take_timestamp(next_ctx, bts_task_arrives, next); wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds); } - update_debugctlmsr(next->thread.debugctlmsr); + update_debugctlmsr(debugctlmsr); } -void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) +static __init int ds_selftest(void) { - clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR); - tsk->thread.ds_ctx = NULL; -} + if (ds_cfg.sizeof_rec[ds_bts]) { + int error; -void ds_exit_thread(struct task_struct *tsk) -{ + error = ds_selftest_bts(); + if (error) { + WARN(1, "[ds] selftest failed. disabling bts.\n"); + ds_cfg.sizeof_rec[ds_bts] = 0; + } + } + + if (ds_cfg.sizeof_rec[ds_pebs]) { + int error; + + error = ds_selftest_pebs(); + if (error) { + WARN(1, "[ds] selftest failed. disabling pebs.\n"); + ds_cfg.sizeof_rec[ds_pebs] = 0; + } + } + + return 0; } +device_initcall(ds_selftest); diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c new file mode 100644 index 00000000000..6bc7c199ab9 --- /dev/null +++ b/arch/x86/kernel/ds_selftest.c @@ -0,0 +1,408 @@ +/* + * Debug Store support - selftest + * + * + * Copyright (C) 2009 Intel Corporation. + * Markus Metzger <markus.t.metzger@intel.com>, 2009 + */ + +#include "ds_selftest.h" + +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/smp.h> +#include <linux/cpu.h> + +#include <asm/ds.h> + + +#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */ +#define SMALL_BUFFER_SIZE 24 /* A single bts entry. */ + +struct ds_selftest_bts_conf { + struct bts_tracer *tracer; + int error; + int (*suspend)(struct bts_tracer *); + int (*resume)(struct bts_tracer *); +}; + +static int ds_selftest_bts_consistency(const struct bts_trace *trace) +{ + int error = 0; + + if (!trace) { + printk(KERN_CONT "failed to access trace..."); + /* Bail out. Other tests are pointless. */ + return -1; + } + + if (!trace->read) { + printk(KERN_CONT "bts read not available..."); + error = -1; + } + + /* Do some sanity checks on the trace configuration. */ + if (!trace->ds.n) { + printk(KERN_CONT "empty bts buffer..."); + error = -1; + } + if (!trace->ds.size) { + printk(KERN_CONT "bad bts trace setup..."); + error = -1; + } + if (trace->ds.end != + (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) { + printk(KERN_CONT "bad bts buffer setup..."); + error = -1; + } + /* + * We allow top in [begin; end], since its not clear when the + * overflow adjustment happens: after the increment or before the + * write. + */ + if ((trace->ds.top < trace->ds.begin) || + (trace->ds.end < trace->ds.top)) { + printk(KERN_CONT "bts top out of bounds..."); + error = -1; + } + + return error; +} + +static int ds_selftest_bts_read(struct bts_tracer *tracer, + const struct bts_trace *trace, + const void *from, const void *to) +{ + const unsigned char *at; + + /* + * Check a few things which do not belong to this test. + * They should be covered by other tests. + */ + if (!trace) + return -1; + + if (!trace->read) + return -1; + + if (to < from) + return -1; + + if (from < trace->ds.begin) + return -1; + + if (trace->ds.end < to) + return -1; + + if (!trace->ds.size) + return -1; + + /* Now to the test itself. */ + for (at = from; (void *)at < to; at += trace->ds.size) { + struct bts_struct bts; + unsigned long index; + int error; + + if (((void *)at - trace->ds.begin) % trace->ds.size) { + printk(KERN_CONT + "read from non-integer index..."); + return -1; + } + index = ((void *)at - trace->ds.begin) / trace->ds.size; + + memset(&bts, 0, sizeof(bts)); + error = trace->read(tracer, at, &bts); + if (error < 0) { + printk(KERN_CONT + "error reading bts trace at [%lu] (0x%p)...", + index, at); + return error; + } + + switch (bts.qualifier) { + case BTS_BRANCH: + break; + default: + printk(KERN_CONT + "unexpected bts entry %llu at [%lu] (0x%p)...", + bts.qualifier, index, at); + return -1; + } + } + + return 0; +} + +static void ds_selftest_bts_cpu(void *arg) +{ + struct ds_selftest_bts_conf *conf = arg; + const struct bts_trace *trace; + void *top; + + if (IS_ERR(conf->tracer)) { + conf->error = PTR_ERR(conf->tracer); + conf->tracer = NULL; + + printk(KERN_CONT + "initialization failed (err: %d)...", conf->error); + return; + } + + /* We should meanwhile have enough trace. */ + conf->error = conf->suspend(conf->tracer); + if (conf->error < 0) + return; + + /* Let's see if we can access the trace. */ + trace = ds_read_bts(conf->tracer); + + conf->error = ds_selftest_bts_consistency(trace); + if (conf->error < 0) + return; + + /* If everything went well, we should have a few trace entries. */ + if (trace->ds.top == trace->ds.begin) { + /* + * It is possible but highly unlikely that we got a + * buffer overflow and end up at exactly the same + * position we started from. + * Let's issue a warning, but continue. + */ + printk(KERN_CONT "no trace/overflow..."); + } + + /* Let's try to read the trace we collected. */ + conf->error = + ds_selftest_bts_read(conf->tracer, trace, + trace->ds.begin, trace->ds.top); + if (conf->error < 0) + return; + + /* + * Let's read the trace again. + * Since we suspended tracing, we should get the same result. + */ + top = trace->ds.top; + + trace = ds_read_bts(conf->tracer); + conf->error = ds_selftest_bts_consistency(trace); + if (conf->error < 0) + return; + + if (top != trace->ds.top) { + printk(KERN_CONT "suspend not working..."); + conf->error = -1; + return; + } + + /* Let's collect some more trace - see if resume is working. */ + conf->error = conf->resume(conf->tracer); + if (conf->error < 0) + return; + + conf->error = conf->suspend(conf->tracer); + if (conf->error < 0) + return; + + trace = ds_read_bts(conf->tracer); + + conf->error = ds_selftest_bts_consistency(trace); + if (conf->error < 0) + return; + + if (trace->ds.top == top) { + /* + * It is possible but highly unlikely that we got a + * buffer overflow and end up at exactly the same + * position we started from. + * Let's issue a warning and check the full trace. + */ + printk(KERN_CONT + "no resume progress/overflow..."); + + conf->error = + ds_selftest_bts_read(conf->tracer, trace, + trace->ds.begin, trace->ds.end); + } else if (trace->ds.top < top) { + /* + * We had a buffer overflow - the entire buffer should + * contain trace records. + */ + conf->error = + ds_selftest_bts_read(conf->tracer, trace, + trace->ds.begin, trace->ds.end); + } else { + /* + * It is quite likely that the buffer did not overflow. + * Let's just check the delta trace. + */ + conf->error = + ds_selftest_bts_read(conf->tracer, trace, top, + trace->ds.top); + } + if (conf->error < 0) + return; + + conf->error = 0; +} + +static int ds_suspend_bts_wrap(struct bts_tracer *tracer) +{ + ds_suspend_bts(tracer); + return 0; +} + +static int ds_resume_bts_wrap(struct bts_tracer *tracer) +{ + ds_resume_bts(tracer); + return 0; +} + +static void ds_release_bts_noirq_wrap(void *tracer) +{ + (void)ds_release_bts_noirq(tracer); +} + +static int ds_selftest_bts_bad_release_noirq(int cpu, + struct bts_tracer *tracer) +{ + int error = -EPERM; + + /* Try to release the tracer on the wrong cpu. */ + get_cpu(); + if (cpu != smp_processor_id()) { + error = ds_release_bts_noirq(tracer); + if (error != -EPERM) + printk(KERN_CONT "release on wrong cpu..."); + } + put_cpu(); + + return error ? 0 : -1; +} + +static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer) +{ + struct bts_tracer *tracer; + int error; + + /* Try to request cpu tracing while task tracing is active. */ + tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL, + (size_t)-1, BTS_KERNEL); + error = PTR_ERR(tracer); + if (!IS_ERR(tracer)) { + ds_release_bts(tracer); + error = 0; + } + + if (error != -EPERM) + printk(KERN_CONT "cpu/task tracing overlap..."); + + return error ? 0 : -1; +} + +static int ds_selftest_bts_bad_request_task(void *buffer) +{ + struct bts_tracer *tracer; + int error; + + /* Try to request cpu tracing while task tracing is active. */ + tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL, + (size_t)-1, BTS_KERNEL); + error = PTR_ERR(tracer); + if (!IS_ERR(tracer)) { + error = 0; + ds_release_bts(tracer); + } + + if (error != -EPERM) + printk(KERN_CONT "task/cpu tracing overlap..."); + + return error ? 0 : -1; +} + +int ds_selftest_bts(void) +{ + struct ds_selftest_bts_conf conf; + unsigned char buffer[BUFFER_SIZE], *small_buffer; + unsigned long irq; + int cpu; + + printk(KERN_INFO "[ds] bts selftest..."); + conf.error = 0; + + small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8; + + get_online_cpus(); + for_each_online_cpu(cpu) { + conf.suspend = ds_suspend_bts_wrap; + conf.resume = ds_resume_bts_wrap; + conf.tracer = + ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + ds_selftest_bts_cpu(&conf); + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_task(buffer); + ds_release_bts(conf.tracer); + if (conf.error < 0) + goto out; + + conf.suspend = ds_suspend_bts_noirq; + conf.resume = ds_resume_bts_noirq; + conf.tracer = + ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1); + if (conf.error >= 0) { + conf.error = + ds_selftest_bts_bad_release_noirq(cpu, + conf.tracer); + /* We must not release the tracer twice. */ + if (conf.error < 0) + conf.tracer = NULL; + } + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_task(buffer); + smp_call_function_single(cpu, ds_release_bts_noirq_wrap, + conf.tracer, 1); + if (conf.error < 0) + goto out; + } + + conf.suspend = ds_suspend_bts_wrap; + conf.resume = ds_resume_bts_wrap; + conf.tracer = + ds_request_bts_task(current, buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + ds_selftest_bts_cpu(&conf); + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_cpu(0, buffer); + ds_release_bts(conf.tracer); + if (conf.error < 0) + goto out; + + conf.suspend = ds_suspend_bts_noirq; + conf.resume = ds_resume_bts_noirq; + conf.tracer = + ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + local_irq_save(irq); + ds_selftest_bts_cpu(&conf); + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_cpu(0, buffer); + ds_release_bts_noirq(conf.tracer); + local_irq_restore(irq); + if (conf.error < 0) + goto out; + + conf.error = 0; + out: + put_online_cpus(); + printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed")); + + return conf.error; +} + +int ds_selftest_pebs(void) +{ + return 0; +} diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h new file mode 100644 index 00000000000..2ba8745c666 --- /dev/null +++ b/arch/x86/kernel/ds_selftest.h @@ -0,0 +1,15 @@ +/* + * Debug Store support - selftest + * + * + * Copyright (C) 2009 Intel Corporation. + * Markus Metzger <markus.t.metzger@intel.com>, 2009 + */ + +#ifdef CONFIG_X86_DS_SELFTEST +extern int ds_selftest_bts(void); +extern int ds_selftest_pebs(void); +#else +static inline int ds_selftest_bts(void) { return 0; } +static inline int ds_selftest_pebs(void) { return 0; } +#endif diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 95ea5fa7d44..c8405718a4c 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -22,6 +22,7 @@ #include "dumpstack.h" int panic_on_unrecovered_nmi; +int panic_on_io_nmi; unsigned int code_bytes = 64; int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static int die_counter; diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index da87590b869..81086c227ab 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -29,7 +29,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *sp, unsigned long bp, char *log_lvl); extern unsigned int code_bytes; -extern int kstack_depth_to_print; /* The form of the top of the frame on the stack */ struct stack_frame { diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index d593cd1f58d..bca5fba91c9 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -19,6 +19,12 @@ #include "dumpstack.h" +/* Just a stub for now */ +int x86_is_stack_id(int id, char *name) +{ + return 0; +} + void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index d35db5993fd..54b0a327676 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -19,10 +19,8 @@ #include "dumpstack.h" -static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, char **idp) -{ - static char ids[][8] = { + +static char x86_stack_ids[][8] = { [DEBUG_STACK - 1] = "#DB", [NMI_STACK - 1] = "NMI", [DOUBLEFAULT_STACK - 1] = "#DF", @@ -33,6 +31,15 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" #endif }; + +int x86_is_stack_id(int id, char *name) +{ + return x86_stack_ids[id - 1] == name; +} + +static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, + unsigned *usedp, char **idp) +{ unsigned k; /* @@ -61,7 +68,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, if (*usedp & (1U << k)) break; *usedp |= 1U << k; - *idp = ids[k]; + *idp = x86_stack_ids[k]; return (unsigned long *)end; } /* @@ -81,12 +88,13 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, do { ++j; end -= EXCEPTION_STKSZ; - ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); + x86_stack_ids[j][4] = '1' + + (j - N_EXCEPTION_STACKS); } while (stack < end - EXCEPTION_STKSZ); if (*usedp & (1U << j)) break; *usedp |= 1U << j; - *idp = ids[j]; + *idp = x86_stack_ids[j]; return (unsigned long *)end; } #endif diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 00628130292..5cb5725b2ba 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -617,7 +617,7 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, */ __init void e820_setup_gap(void) { - unsigned long gapstart, gapsize, round; + unsigned long gapstart, gapsize; int found; gapstart = 0x10000000; @@ -627,22 +627,16 @@ __init void e820_setup_gap(void) #ifdef CONFIG_X86_64 if (!found) { gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " - "address range\n" - KERN_ERR "PCI: Unassigned devices with 32bit resource " - "registers may break!\n"); + printk(KERN_ERR + "PCI: Warning: Cannot find a gap in the 32bit address range\n" + "PCI: Unassigned devices with 32bit resource registers may break!\n"); } #endif /* - * See how much we want to round up: start off with - * rounding to the next 1MB area. + * e820_reserve_resources_late protect stolen RAM already */ - round = 0x100000; - while ((gapsize >> 4) > round) - round += round; - /* Fun with two's complement */ - pci_mem_start = (gapstart + round) & -round; + pci_mem_start = gapstart; printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", @@ -1371,6 +1365,25 @@ void __init e820_reserve_resources(void) } } +/* How much should we pad RAM ending depending on where it is? */ +static unsigned long ram_alignment(resource_size_t pos) +{ + unsigned long mb = pos >> 20; + + /* To 64kB in the first megabyte */ + if (!mb) + return 64*1024; + + /* To 1MB in the first 16MB */ + if (mb < 16) + return 1024*1024; + + /* To 32MB for anything above that */ + return 32*1024*1024; +} + +#define MAX_RESOURCE_SIZE ((resource_size_t)-1) + void __init e820_reserve_resources_late(void) { int i; @@ -1382,6 +1395,26 @@ void __init e820_reserve_resources_late(void) insert_resource_expand_to_fit(&iomem_resource, res); res++; } + + /* + * Try to bump up RAM regions to reasonable boundaries to + * avoid stolen RAM: + */ + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *entry = &e820.map[i]; + u64 start, end; + + if (entry->type != E820_RAM) + continue; + start = entry->addr + entry->size; + end = round_up(start, ram_alignment(start)) - 1; + if (end > MAX_RESOURCE_SIZE) + end = MAX_RESOURCE_SIZE; + if (start >= end) + continue; + reserve_region_with_split(&iomem_resource, start, end, + "RAM buffer"); + } } char *__init default_machine_specific_memory_setup(void) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 76b8cd953de..ebdb85cf268 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -97,6 +97,7 @@ static void __init nvidia_bugs(int num, int slot, int func) } #if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) +#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) static u32 __init ati_ixp4x0_rev(int num, int slot, int func) { u32 d; @@ -114,6 +115,7 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func) d &= 0xff; return d; } +#endif static void __init ati_bugs(int num, int slot, int func) { diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 1736acc4d7a..fe26ba3e345 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -240,10 +240,35 @@ static void __init do_add_efi_memmap(void) unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; int e820_type; - if (md->attribute & EFI_MEMORY_WB) - e820_type = E820_RAM; - else + switch (md->type) { + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: + case EFI_BOOT_SERVICES_CODE: + case EFI_BOOT_SERVICES_DATA: + case EFI_CONVENTIONAL_MEMORY: + if (md->attribute & EFI_MEMORY_WB) + e820_type = E820_RAM; + else + e820_type = E820_RESERVED; + break; + case EFI_ACPI_RECLAIM_MEMORY: + e820_type = E820_ACPI; + break; + case EFI_ACPI_MEMORY_NVS: + e820_type = E820_NVS; + break; + case EFI_UNUSABLE_MEMORY: + e820_type = E820_UNUSABLE; + break; + default: + /* + * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE + * EFI_RUNTIME_SERVICES_DATA EFI_MEMORY_MAPPED_IO + * EFI_MEMORY_MAPPED_IO_PORT_SPACE EFI_PAL_CODE + */ e820_type = E820_RESERVED; + break; + } e820_add_region(start, size, e820_type); } sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); @@ -329,7 +354,7 @@ void __init efi_init(void) */ c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); if (c16) { - for (i = 0; i < sizeof(vendor) && *c16; ++i) + for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i) vendor[i] = *c16++; vendor[i] = '\0'; } else @@ -487,7 +512,7 @@ void __init efi_enter_virtual_mode(void) && end_pfn <= max_pfn_mapped)) va = __va(md->phys_addr); else - va = efi_ioremap(md->phys_addr, size); + va = efi_ioremap(md->phys_addr, size, md->type); md->virt_addr = (u64) (unsigned long) va; diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index 22c3b7828c5..ac0621a7ac3 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c @@ -98,10 +98,14 @@ void __init efi_call_phys_epilog(void) early_runtime_code_mapping_set_exec(0); } -void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) +void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, + u32 type) { unsigned long last_map_pfn; + if (type == EFI_MEMORY_MAPPED_IO) + return ioremap(phys_addr, size); + last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) return NULL; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c929add475c..c097e7d607c 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -48,7 +48,6 @@ #include <asm/segment.h> #include <asm/smp.h> #include <asm/page_types.h> -#include <asm/desc.h> #include <asm/percpu.h> #include <asm/dwarf2.h> #include <asm/processor-flags.h> @@ -84,7 +83,7 @@ #define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else #define preempt_stop(clobbers) -#define resume_kernel restore_nocheck +#define resume_kernel restore_all #endif .macro TRACE_IRQS_IRET @@ -372,7 +371,7 @@ END(ret_from_exception) ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? - jnz restore_nocheck + jnz restore_all need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl @@ -540,6 +539,8 @@ syscall_exit: jne syscall_exit_work restore_all: + TRACE_IRQS_IRET +restore_all_notrace: movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS # Warning: PT_OLDSS(%esp) contains the wrong/random values if we # are returning to the kernel. @@ -551,8 +552,6 @@ restore_all: CFI_REMEMBER_STATE je ldt_ss # returning to user-space with LDT SS restore_nocheck: - TRACE_IRQS_IRET -restore_nocheck_notrace: RESTORE_REGS 4 # skip orig_eax/error_code CFI_ADJUST_CFA_OFFSET -4 irq_return: @@ -588,22 +587,34 @@ ldt_ss: jne restore_nocheck #endif - /* If returning to userspace with 16bit stack, - * try to fix the higher word of ESP, as the CPU - * won't restore it. - * This is an "official" bug of all the x86-compatible - * CPUs, which we can try to work around to make - * dosemu and wine happy. */ - movl PT_OLDESP(%esp), %eax - movl %esp, %edx - call patch_espfix_desc +/* + * Setup and switch to ESPFIX stack + * + * We're returning to userspace with a 16 bit stack. The CPU will not + * restore the high word of ESP for us on executing iret... This is an + * "official" bug of all the x86-compatible CPUs, which we can work + * around to make dosemu and wine happy. We do this by preloading the + * high word of ESP with the high word of the userspace ESP while + * compensating for the offset by changing to the ESPFIX segment with + * a base address that matches for the difference. + */ + mov %esp, %edx /* load kernel esp */ + mov PT_OLDESP(%esp), %eax /* load userspace esp */ + mov %dx, %ax /* eax: new kernel esp */ + sub %eax, %edx /* offset (low word is 0) */ + PER_CPU(gdt_page, %ebx) + shr $16, %edx + mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ + mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ pushl $__ESPFIX_SS CFI_ADJUST_CFA_OFFSET 4 - pushl %eax + push %eax /* new kernel esp */ CFI_ADJUST_CFA_OFFSET 4 + /* Disable interrupts, but do not irqtrace this section: we + * will soon execute iret and the tracer was already set to + * the irqstate after the iret */ DISABLE_INTERRUPTS(CLBR_EAX) - TRACE_IRQS_OFF - lss (%esp), %esp + lss (%esp), %esp /* switch to espfix segment */ CFI_ADJUST_CFA_OFFSET -8 jmp restore_nocheck CFI_ENDPROC @@ -716,15 +727,24 @@ PTREGSCALL(vm86) PTREGSCALL(vm86old) .macro FIXUP_ESPFIX_STACK - /* since we are on a wrong stack, we cant make it a C code :( */ +/* + * Switch back for ESPFIX stack to the normal zerobased stack + * + * We can't call C functions using the ESPFIX stack. This code reads + * the high word of the segment base from the GDT and swiches to the + * normal stack and adjusts ESP with the matching offset. + */ + /* fixup the stack */ PER_CPU(gdt_page, %ebx) - GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) - addl %esp, %eax + mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ + mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */ + shl $16, %eax + addl %esp, %eax /* the adjusted stack pointer */ pushl $__KERNEL_DS CFI_ADJUST_CFA_OFFSET 4 pushl %eax CFI_ADJUST_CFA_OFFSET 4 - lss (%esp), %esp + lss (%esp), %esp /* switch to the normal stack segment */ CFI_ADJUST_CFA_OFFSET -8 .endm .macro UNWIND_ESPFIX_STACK @@ -1154,6 +1174,7 @@ ENTRY(ftrace_graph_caller) pushl %edx movl 0xc(%esp), %edx lea 0x4(%ebp), %eax + movl (%ebp), %ecx subl $MCOUNT_INSN_SIZE, %edx call prepare_ftrace_return popl %edx @@ -1168,6 +1189,7 @@ return_to_handler: pushl %eax pushl %ecx pushl %edx + movl %ebp, %eax call ftrace_return_to_handler movl %eax, 0xc(%esp) popl %edx @@ -1329,7 +1351,7 @@ nmi_stack_correct: xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_nmi - jmp restore_nocheck_notrace + jmp restore_all_notrace CFI_ENDPROC nmi_stack_fixup: diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 38946c6e843..c251be74510 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -135,6 +135,7 @@ ENTRY(ftrace_graph_caller) leaq 8(%rbp), %rdi movq 0x38(%rsp), %rsi + movq (%rbp), %rdx subq $MCOUNT_INSN_SIZE, %rsi call prepare_ftrace_return @@ -147,27 +148,15 @@ END(ftrace_graph_caller) GLOBAL(return_to_handler) subq $80, %rsp + /* Save the return values */ movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) - movq %r10, 56(%rsp) - movq %r11, 64(%rsp) + movq %rdx, 8(%rsp) + movq %rbp, %rdi call ftrace_return_to_handler movq %rax, 72(%rsp) - movq 64(%rsp), %r11 - movq 56(%rsp), %r10 - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx + movq 8(%rsp), %rdx movq (%rsp), %rax addq $72, %rsp retq @@ -976,6 +965,8 @@ END(\sym) #ifdef CONFIG_SMP apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt +apicinterrupt REBOOT_VECTOR \ + reboot_interrupt smp_reboot_interrupt #endif #ifdef CONFIG_X86_UV @@ -1007,10 +998,15 @@ apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \ #endif apicinterrupt THRESHOLD_APIC_VECTOR \ - threshold_interrupt mce_threshold_interrupt + threshold_interrupt smp_threshold_interrupt apicinterrupt THERMAL_APIC_VECTOR \ thermal_interrupt smp_thermal_interrupt +#ifdef CONFIG_X86_MCE +apicinterrupt MCE_SELF_VECTOR \ + mce_self_interrupt smp_mce_self_interrupt +#endif + #ifdef CONFIG_SMP apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ call_function_single_interrupt smp_call_function_single_interrupt @@ -1025,6 +1021,11 @@ apicinterrupt ERROR_APIC_VECTOR \ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt +#ifdef CONFIG_PERF_COUNTERS +apicinterrupt LOCAL_PENDING_VECTOR \ + perf_pending_interrupt smp_perf_pending_interrupt +#endif + /* * Exception entry points. */ @@ -1379,10 +1380,15 @@ END(xen_failsafe_callback) paranoidzeroentry_ist debug do_debug DEBUG_STACK paranoidzeroentry_ist int3 do_int3 DEBUG_STACK paranoiderrorentry stack_segment do_stack_segment +#ifdef CONFIG_XEN +zeroentry xen_debug do_debug +zeroentry xen_int3 do_int3 +errorentry xen_stack_segment do_stack_segment +#endif errorentry general_protection do_general_protection errorentry page_fault do_page_fault #ifdef CONFIG_X86_MCE -paranoidzeroentry machine_check do_machine_check +paranoidzeroentry machine_check *machine_check_vector(%rip) #endif /* diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index b79c5533c42..d94e1ea3b9f 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -408,7 +408,8 @@ int ftrace_disable_ftrace_graph_caller(void) * Hook the return address and push it in the stack of return addrs * in current thread info. */ -void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, + unsigned long frame_pointer) { unsigned long old; int faulted; @@ -453,7 +454,8 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - if (ftrace_push_return_trace(old, self_addr, &trace.depth) == -EBUSY) { + if (ftrace_push_return_trace(old, self_addr, &trace.depth, + frame_pointer) == -EBUSY) { *parent = old; return; } diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 30683883e0c..cc827ac9e8d 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -13,7 +13,6 @@ #include <asm/segment.h> #include <asm/page_types.h> #include <asm/pgtable_types.h> -#include <asm/desc.h> #include <asm/cache.h> #include <asm/thread_info.h> #include <asm/asm-offsets.h> @@ -262,9 +261,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20); * which will be freed later */ -#ifndef CONFIG_HOTPLUG_CPU -.section .init.text,"ax",@progbits -#endif +__CPUINIT #ifdef CONFIG_SMP ENTRY(startup_32_smp) @@ -603,18 +600,11 @@ ignore_int: #endif iret -.section .cpuinit.data,"wa" + __REFDATA .align 4 ENTRY(initial_code) .long i386_start_kernel -.section .text -/* - * Real beginning of normal "text" segment - */ -ENTRY(stext) -ENTRY(_stext) - /* * BSS section */ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 54b29bb24e7..fa54f78e2a0 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -12,7 +12,6 @@ #include <linux/linkage.h> #include <linux/threads.h> #include <linux/init.h> -#include <asm/desc.h> #include <asm/segment.h> #include <asm/pgtable.h> #include <asm/page.h> diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 81408b93f88..dedc2bddf7a 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -510,7 +510,8 @@ static int hpet_setup_irq(struct hpet_dev *dev) { if (request_irq(dev->irq, hpet_interrupt_handler, - IRQF_DISABLED|IRQF_NOBALANCING, dev->name, dev)) + IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, + dev->name, dev)) return -1; disable_irq(dev->irq); diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index c2e0bb0890d..5cf36c053ac 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -7,6 +7,7 @@ #include <linux/spinlock.h> #include <linux/jiffies.h> #include <linux/module.h> +#include <linux/timex.h> #include <linux/delay.h> #include <linux/init.h> #include <linux/io.h> diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index df3bf269bea..270ff83efc1 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -12,7 +12,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); /* * Initial thread structure. diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index c3fe010d74c..b0cdde6932f 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -12,6 +12,8 @@ #include <asm/io_apic.h> #include <asm/irq.h> #include <asm/idle.h> +#include <asm/mce.h> +#include <asm/hw_irq.h> atomic_t irq_err_count; @@ -24,9 +26,9 @@ void (*generic_interrupt_extension)(void) = NULL; */ void ack_bad_irq(unsigned int irq) { - printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); + if (printk_ratelimit()) + pr_err("unexpected IRQ trap at vector %02x\n", irq); -#ifdef CONFIG_X86_LOCAL_APIC /* * Currently unexpected vectors happen only on SMP and APIC. * We _must_ ack these because every local APIC has only N @@ -36,9 +38,7 @@ void ack_bad_irq(unsigned int irq) * completely. * But only ack when the APIC is enabled -AK */ - if (cpu_has_apic) - ack_APIC_irq(); -#endif + ack_APIC_irq(); } #define irq_stats(x) (&per_cpu(irq_stat, x)) @@ -63,6 +63,14 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); seq_printf(p, " Spurious interrupts\n"); + seq_printf(p, "%*s: ", prec, "CNT"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); + seq_printf(p, " Performance counter interrupts\n"); + seq_printf(p, "%*s: ", prec, "PND"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); + seq_printf(p, " Performance pending work\n"); #endif if (generic_interrupt_extension) { seq_printf(p, "%*s: ", prec, "PLT"); @@ -89,13 +97,23 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); seq_printf(p, " Thermal event interrupts\n"); -# ifdef CONFIG_X86_64 +# ifdef CONFIG_X86_MCE_THRESHOLD seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_printf(p, " Threshold APIC interrupts\n"); # endif #endif +#ifdef CONFIG_X86_NEW_MCE + seq_printf(p, "%*s: ", prec, "MCE"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); + seq_printf(p, " Machine check exceptions\n"); + seq_printf(p, "%*s: ", prec, "MCP"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); + seq_printf(p, " Machine check polls\n"); +#endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); @@ -166,6 +184,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #ifdef CONFIG_X86_LOCAL_APIC sum += irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; + sum += irq_stats(cpu)->apic_perf_irqs; + sum += irq_stats(cpu)->apic_pending_irqs; #endif if (generic_interrupt_extension) sum += irq_stats(cpu)->generic_irqs; @@ -176,9 +196,13 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #endif #ifdef CONFIG_X86_MCE sum += irq_stats(cpu)->irq_thermal_count; -# ifdef CONFIG_X86_64 +# ifdef CONFIG_X86_MCE_THRESHOLD sum += irq_stats(cpu)->irq_threshold_count; +# endif #endif +#ifdef CONFIG_X86_NEW_MCE + sum += per_cpu(mce_exception_count, cpu); + sum += per_cpu(mce_poll_count, cpu); #endif return sum; } @@ -213,14 +237,11 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) irq = __get_cpu_var(vector_irq)[vector]; if (!handle_irq(irq, regs)) { -#ifdef CONFIG_X86_64 - if (!disable_apic) - ack_APIC_irq(); -#endif + ack_APIC_irq(); if (printk_ratelimit()) - printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n", - __func__, smp_processor_id(), vector, irq); + pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", + __func__, smp_processor_id(), vector, irq); } irq_exit(); diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit.c index 368b0a8836f..92b7703d3d5 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit.c @@ -1,20 +1,25 @@ +#include <linux/linkage.h> #include <linux/errno.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/ioport.h> #include <linux/interrupt.h> +#include <linux/timex.h> #include <linux/slab.h> #include <linux/random.h> +#include <linux/kprobes.h> #include <linux/init.h> #include <linux/kernel_stat.h> #include <linux/sysdev.h> #include <linux/bitops.h> +#include <linux/acpi.h> #include <linux/io.h> #include <linux/delay.h> #include <asm/atomic.h> #include <asm/system.h> #include <asm/timer.h> +#include <asm/hw_irq.h> #include <asm/pgtable.h> #include <asm/desc.h> #include <asm/apic.h> @@ -22,7 +27,23 @@ #include <asm/i8259.h> #include <asm/traps.h> +/* + * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: + * (these are usually mapped to vectors 0x30-0x3f) + */ + +/* + * The IO-APIC gives us many more interrupt sources. Most of these + * are unused but an SMP system is supposed to have enough memory ... + * sometimes (mostly wrt. hw bugs) we get corrupted vectors all + * across the spectrum, so we really want to be prepared to get all + * of these. Plus, more powerful systems might have more than 64 + * IO-APIC registers. + * + * (these are usually mapped into the 0x30-0xff vector range) + */ +#ifdef CONFIG_X86_32 /* * Note that on a 486, we don't want to do a SIGFPE on an irq13 * as the irq is unreliable, and exception 16 works correctly @@ -52,30 +73,7 @@ static struct irqaction fpu_irq = { .handler = math_error_irq, .name = "fpu", }; - -void __init init_ISA_irqs(void) -{ - int i; - -#ifdef CONFIG_X86_LOCAL_APIC - init_bsp_APIC(); #endif - init_8259A(0); - - /* - * 16 old-style INTA-cycle interrupts: - */ - for (i = 0; i < NR_IRQS_LEGACY; i++) { - struct irq_desc *desc = irq_to_desc(i); - - desc->status = IRQ_DISABLED; - desc->action = NULL; - desc->depth = 1; - - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } -} /* * IRQ2 is cascade interrupt to second interrupt controller @@ -118,29 +116,37 @@ int vector_used_by_percpu_irq(unsigned int vector) return 0; } -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -void __init native_init_IRQ(void) +static void __init init_ISA_irqs(void) { int i; - /* Execute any quirks before the call gates are initialised: */ - x86_quirk_pre_intr_init(); +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) + init_bsp_APIC(); +#endif + init_8259A(0); /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) + * 16 old-style INTA-cycle interrupts: */ - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); + for (i = 0; i < NR_IRQS_LEGACY; i++) { + struct irq_desc *desc = irq_to_desc(i); + + desc->status = IRQ_DISABLED; + desc->action = NULL; + desc->depth = 1; + + set_irq_chip_and_handler_name(i, &i8259A_chip, + handle_level_irq, "XT"); } +} +/* Overridden in paravirt.c */ +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) +static void __init smp_intr_init(void) +{ +#ifdef CONFIG_SMP +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper * IPI, driven by wakeup. @@ -160,16 +166,35 @@ void __init native_init_IRQ(void) /* IPI for generic function call */ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - /* IPI for single call function */ + /* IPI for generic single function call */ alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); + call_function_single_interrupt); /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); + + /* IPI used for rebooting/stopping */ + alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt); #endif +#endif /* CONFIG_SMP */ +} + +static void __init apic_intr_init(void) +{ + smp_intr_init(); -#ifdef CONFIG_X86_LOCAL_APIC +#ifdef CONFIG_X86_THERMAL_VECTOR + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD + alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); +#endif +#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) + alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); +#endif + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); @@ -179,16 +204,59 @@ void __init native_init_IRQ(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + + /* Performance monitoring interrupts: */ +# ifdef CONFIG_PERF_COUNTERS + alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); +# endif + #endif +} -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) - /* thermal monitor LVT interrupt */ - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +/** + * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors + * + * Description: + * Perform any necessary interrupt initialisation prior to setting up + * the "ordinary" interrupt call gates. For legacy reasons, the ISA + * interrupts should be initialised here if the machine emulates a PC + * in any way. + **/ +static void __init x86_quirk_pre_intr_init(void) +{ +#ifdef CONFIG_X86_32 + if (x86_quirks->arch_pre_intr_init) { + if (x86_quirks->arch_pre_intr_init()) + return; + } #endif + init_ISA_irqs(); +} + +void __init native_init_IRQ(void) +{ + int i; + + /* Execute any quirks before the call gates are initialised: */ + x86_quirk_pre_intr_init(); + + apic_intr_init(); + + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { + /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ + if (!test_bit(i, used_vectors)) + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); + } if (!acpi_ioapic) setup_irq(2, &irq2); +#ifdef CONFIG_X86_32 /* * Call quirks after call gates are initialised (usually add in * the architecture specific gates): @@ -203,4 +271,5 @@ void __init native_init_IRQ(void) setup_irq(FPU_IRQ, &fpu_irq); irq_ctx_init(smp_processor_id()); +#endif } diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c deleted file mode 100644 index 8cd10537fd4..00000000000 --- a/arch/x86/kernel/irqinit_64.c +++ /dev/null @@ -1,177 +0,0 @@ -#include <linux/linkage.h> -#include <linux/errno.h> -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/ioport.h> -#include <linux/interrupt.h> -#include <linux/timex.h> -#include <linux/slab.h> -#include <linux/random.h> -#include <linux/init.h> -#include <linux/kernel_stat.h> -#include <linux/sysdev.h> -#include <linux/bitops.h> -#include <linux/acpi.h> -#include <linux/io.h> -#include <linux/delay.h> - -#include <asm/atomic.h> -#include <asm/system.h> -#include <asm/hw_irq.h> -#include <asm/pgtable.h> -#include <asm/desc.h> -#include <asm/apic.h> -#include <asm/i8259.h> - -/* - * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: - * (these are usually mapped to vectors 0x30-0x3f) - */ - -/* - * The IO-APIC gives us many more interrupt sources. Most of these - * are unused but an SMP system is supposed to have enough memory ... - * sometimes (mostly wrt. hw bugs) we get corrupted vectors all - * across the spectrum, so we really want to be prepared to get all - * of these. Plus, more powerful systems might have more than 64 - * IO-APIC registers. - * - * (these are usually mapped into the 0x30-0xff vector range) - */ - -/* - * IRQ2 is cascade interrupt to second interrupt controller - */ - -static struct irqaction irq2 = { - .handler = no_action, - .name = "cascade", -}; -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... IRQ0_VECTOR - 1] = -1, - [IRQ0_VECTOR] = 0, - [IRQ1_VECTOR] = 1, - [IRQ2_VECTOR] = 2, - [IRQ3_VECTOR] = 3, - [IRQ4_VECTOR] = 4, - [IRQ5_VECTOR] = 5, - [IRQ6_VECTOR] = 6, - [IRQ7_VECTOR] = 7, - [IRQ8_VECTOR] = 8, - [IRQ9_VECTOR] = 9, - [IRQ10_VECTOR] = 10, - [IRQ11_VECTOR] = 11, - [IRQ12_VECTOR] = 12, - [IRQ13_VECTOR] = 13, - [IRQ14_VECTOR] = 14, - [IRQ15_VECTOR] = 15, - [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 -}; - -int vector_used_by_percpu_irq(unsigned int vector) -{ - int cpu; - - for_each_online_cpu(cpu) { - if (per_cpu(vector_irq, cpu)[vector] != -1) - return 1; - } - - return 0; -} - -static void __init init_ISA_irqs(void) -{ - int i; - - init_bsp_APIC(); - init_8259A(0); - - for (i = 0; i < NR_IRQS_LEGACY; i++) { - struct irq_desc *desc = irq_to_desc(i); - - desc->status = IRQ_DISABLED; - desc->action = NULL; - desc->depth = 1; - - /* - * 16 old-style INTA-cycle interrupts: - */ - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } -} - -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -static void __init smp_intr_init(void) -{ -#ifdef CONFIG_SMP - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPIs for invalidation */ - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); - - /* IPI for generic function call */ - alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* IPI for generic single function call */ - alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); - - /* Low priority IPI to cleanup after moving an irq */ - set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); - set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); -#endif -} - -static void __init apic_intr_init(void) -{ - smp_intr_init(); - - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); - alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); - - /* self generated IPI for local APIC timer */ - alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* generic IPI for platform specific use */ - alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); - - /* IPI vectors for APIC spurious and error interrupts */ - alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); -} - -void __init native_init_IRQ(void) -{ - int i; - - init_ISA_irqs(); - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != IA32_SYSCALL_VECTOR) - set_intr_gate(vector, interrupt[i]); - } - - apic_intr_init(); - - if (!acpi_ioapic) - setup_irq(2, &irq2); -} diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index b1f4dffb919..8d82a77a3f3 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -142,7 +142,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); gdb_regs32[GDB_CS] = __KERNEL_CS; gdb_regs32[GDB_SS] = __KERNEL_DS; - gdb_regs[GDB_PC] = p->thread.ip; + gdb_regs[GDB_PC] = 0; gdb_regs[GDB_R8] = 0; gdb_regs[GDB_R9] = 0; gdb_regs[GDB_R10] = 0; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 33019ddb56b..c664d515f61 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -27,6 +27,7 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/hardirq.h> +#include <asm/timer.h> #define MMU_QUEUE_SIZE 1024 @@ -195,11 +196,11 @@ static void kvm_leave_lazy_mmu(void) struct kvm_para_state *state = kvm_para_state(); mmu_queue_flush(state); - paravirt_leave_lazy(paravirt_get_lazy_mode()); + paravirt_leave_lazy_mmu(); state->mode = paravirt_get_lazy_mode(); } -static void paravirt_ops_setup(void) +static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; pv_info.paravirt_enabled = 1; @@ -230,6 +231,9 @@ static void paravirt_ops_setup(void) pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu; pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu; } +#ifdef CONFIG_X86_IO_APIC + no_timer_check = 1; +#endif } void __init kvm_guest_init(void) diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 846510b78a0..2a62d843f01 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -347,7 +347,7 @@ static irqreturn_t mfgpt_tick(int irq, void *dev_id) static struct irqaction mfgptirq = { .handler = mfgpt_tick, - .flags = IRQF_DISABLED | IRQF_NOBALANCING, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER, .name = "mfgpt-timer" }; diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 453b5795a5c..366baa17991 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -13,25 +13,13 @@ * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. */ -#include <linux/platform_device.h> -#include <linux/capability.h> -#include <linux/miscdevice.h> #include <linux/firmware.h> -#include <linux/spinlock.h> -#include <linux/cpumask.h> #include <linux/pci_ids.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/mutex.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/cpu.h> #include <linux/pci.h> -#include <linux/fs.h> -#include <linux/mm.h> #include <asm/microcode.h> #include <asm/processor.h> @@ -79,9 +67,6 @@ struct microcode_amd { #define UCODE_CONTAINER_SECTION_HDR 8 #define UCODE_CONTAINER_HEADER_SIZE 12 -/* serialize access to the physical write */ -static DEFINE_SPINLOCK(microcode_update_lock); - static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) @@ -144,9 +129,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev) return 1; } -static void apply_microcode_amd(int cpu) +static int apply_microcode_amd(int cpu) { - unsigned long flags; u32 rev, dummy; int cpu_num = raw_smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; @@ -156,25 +140,25 @@ static void apply_microcode_amd(int cpu) BUG_ON(cpu_num != cpu); if (mc_amd == NULL) - return; + return 0; - spin_lock_irqsave(µcode_update_lock, flags); wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); /* get patch id after patching */ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - spin_unlock_irqrestore(µcode_update_lock, flags); /* check current patch id and patch's id for match */ if (rev != mc_amd->hdr.patch_id) { printk(KERN_ERR "microcode: CPU%d: update failed " "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); - return; + return -1; } printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); uci->cpu_sig.rev = rev; + + return 0; } static int get_ucode_data(void *to, const u8 *from, size_t n) @@ -257,13 +241,12 @@ static int install_equiv_cpu_table(const u8 *buf) static void free_equiv_cpu_table(void) { - if (equiv_cpu_table) { - vfree(equiv_cpu_table); - equiv_cpu_table = NULL; - } + vfree(equiv_cpu_table); + equiv_cpu_table = NULL; } -static int generic_load_microcode(int cpu, const u8 *data, size_t size) +static enum ucode_state +generic_load_microcode(int cpu, const u8 *data, size_t size) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; const u8 *ucode_ptr = data; @@ -272,12 +255,13 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size) int new_rev = uci->cpu_sig.rev; unsigned int leftover; unsigned long offset; + enum ucode_state state = UCODE_OK; offset = install_equiv_cpu_table(ucode_ptr); if (!offset) { printk(KERN_ERR "microcode: failed to create " "equivalent cpu table\n"); - return -EINVAL; + return UCODE_ERROR; } ucode_ptr += offset; @@ -293,8 +277,7 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size) mc_header = (struct microcode_header_amd *)mc; if (get_matching_microcode(cpu, mc, new_rev)) { - if (new_mc) - vfree(new_mc); + vfree(new_mc); new_rev = mc_header->patch_id; new_mc = mc; } else @@ -306,34 +289,32 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size) if (new_mc) { if (!leftover) { - if (uci->mc) - vfree(uci->mc); + vfree(uci->mc); uci->mc = new_mc; pr_debug("microcode: CPU%d found a matching microcode " "update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); - } else + } else { vfree(new_mc); - } + state = UCODE_ERROR; + } + } else + state = UCODE_NFOUND; free_equiv_cpu_table(); - return (int)leftover; + return state; } -static int request_microcode_fw(int cpu, struct device *device) +static enum ucode_state request_microcode_fw(int cpu, struct device *device) { const char *fw_name = "amd-ucode/microcode_amd.bin"; const struct firmware *firmware; - int ret; - - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); + enum ucode_state ret; - ret = request_firmware(&firmware, fw_name, device); - if (ret) { + if (request_firmware(&firmware, fw_name, device)) { printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); - return ret; + return UCODE_NFOUND; } ret = generic_load_microcode(cpu, firmware->data, firmware->size); @@ -343,11 +324,12 @@ static int request_microcode_fw(int cpu, struct device *device) return ret; } -static int request_microcode_user(int cpu, const void __user *buf, size_t size) +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) { printk(KERN_INFO "microcode: AMD microcode update via " "/dev/cpu/microcode not supported\n"); - return -1; + return UCODE_ERROR; } static void microcode_fini_cpu_amd(int cpu) diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 98c470c069d..9371448290a 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -71,27 +71,18 @@ * Thanks to Stuart Swales for pointing out this bug. */ #include <linux/platform_device.h> -#include <linux/capability.h> #include <linux/miscdevice.h> -#include <linux/firmware.h> +#include <linux/capability.h> #include <linux/smp_lock.h> -#include <linux/spinlock.h> -#include <linux/cpumask.h> -#include <linux/uaccess.h> -#include <linux/vmalloc.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/mutex.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/slab.h> #include <linux/cpu.h> #include <linux/fs.h> #include <linux/mm.h> #include <asm/microcode.h> #include <asm/processor.h> -#include <asm/msr.h> MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); @@ -101,36 +92,110 @@ MODULE_LICENSE("GPL"); static struct microcode_ops *microcode_ops; -/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ +/* + * Synchronization. + * + * All non cpu-hotplug-callback call sites use: + * + * - microcode_mutex to synchronize with each other; + * - get/put_online_cpus() to synchronize with + * the cpu-hotplug-callback call sites. + * + * We guarantee that only a single cpu is being + * updated at any particular moment of time. + */ static DEFINE_MUTEX(microcode_mutex); struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; EXPORT_SYMBOL_GPL(ucode_cpu_info); +/* + * Operations that are run on a target cpu: + */ + +struct cpu_info_ctx { + struct cpu_signature *cpu_sig; + int err; +}; + +static void collect_cpu_info_local(void *arg) +{ + struct cpu_info_ctx *ctx = arg; + + ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(), + ctx->cpu_sig); +} + +static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig) +{ + struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 }; + int ret; + + ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1); + if (!ret) + ret = ctx.err; + + return ret; +} + +static int collect_cpu_info(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + int ret; + + memset(uci, 0, sizeof(*uci)); + + ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig); + if (!ret) + uci->valid = 1; + + return ret; +} + +struct apply_microcode_ctx { + int err; +}; + +static void apply_microcode_local(void *arg) +{ + struct apply_microcode_ctx *ctx = arg; + + ctx->err = microcode_ops->apply_microcode(smp_processor_id()); +} + +static int apply_microcode_on_target(int cpu) +{ + struct apply_microcode_ctx ctx = { .err = 0 }; + int ret; + + ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1); + if (!ret) + ret = ctx.err; + + return ret; +} + #ifdef CONFIG_MICROCODE_OLD_INTERFACE static int do_microcode_update(const void __user *buf, size_t size) { - cpumask_t old; int error = 0; int cpu; - old = current->cpus_allowed; - for_each_online_cpu(cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + enum ucode_state ustate; if (!uci->valid) continue; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - error = microcode_ops->request_microcode_user(cpu, buf, size); - if (error < 0) - goto out; - if (!error) - microcode_ops->apply_microcode(cpu); + ustate = microcode_ops->request_microcode_user(cpu, buf, size); + if (ustate == UCODE_ERROR) { + error = -1; + break; + } else if (ustate == UCODE_OK) + apply_microcode_on_target(cpu); } -out: - set_cpus_allowed_ptr(current, &old); + return error; } @@ -143,19 +208,17 @@ static int microcode_open(struct inode *unused1, struct file *unused2) static ssize_t microcode_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { - ssize_t ret; + ssize_t ret = -EINVAL; if ((len >> PAGE_SHIFT) > num_physpages) { - printk(KERN_ERR "microcode: too much data (max %ld pages)\n", - num_physpages); - return -EINVAL; + pr_err("microcode: too much data (max %ld pages)\n", num_physpages); + return ret; } get_online_cpus(); mutex_lock(µcode_mutex); - ret = do_microcode_update(buf, len); - if (!ret) + if (do_microcode_update(buf, len) == 0) ret = (ssize_t)len; mutex_unlock(µcode_mutex); @@ -165,15 +228,16 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, } static const struct file_operations microcode_fops = { - .owner = THIS_MODULE, - .write = microcode_write, - .open = microcode_open, + .owner = THIS_MODULE, + .write = microcode_write, + .open = microcode_open, }; static struct miscdevice microcode_dev = { - .minor = MICROCODE_MINOR, - .name = "microcode", - .fops = µcode_fops, + .minor = MICROCODE_MINOR, + .name = "microcode", + .devnode = "cpu/microcode", + .fops = µcode_fops, }; static int __init microcode_dev_init(void) @@ -182,9 +246,7 @@ static int __init microcode_dev_init(void) error = misc_register(µcode_dev); if (error) { - printk(KERN_ERR - "microcode: can't misc_register on minor=%d\n", - MICROCODE_MINOR); + pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR); return error; } @@ -205,42 +267,51 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); /* fake device for request_firmware */ static struct platform_device *microcode_pdev; -static long reload_for_cpu(void *unused) +static int reload_for_cpu(int cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; int err = 0; mutex_lock(µcode_mutex); if (uci->valid) { - err = microcode_ops->request_microcode_fw(smp_processor_id(), - µcode_pdev->dev); - if (!err) - microcode_ops->apply_microcode(smp_processor_id()); + enum ucode_state ustate; + + ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev); + if (ustate == UCODE_OK) + apply_microcode_on_target(cpu); + else + if (ustate == UCODE_ERROR) + err = -EINVAL; } mutex_unlock(µcode_mutex); + return err; } static ssize_t reload_store(struct sys_device *dev, struct sysdev_attribute *attr, - const char *buf, size_t sz) + const char *buf, size_t size) { - char *end; - unsigned long val = simple_strtoul(buf, &end, 0); - int err = 0; + unsigned long val; int cpu = dev->id; + int ret = 0; + char *end; + val = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; + if (val == 1) { get_online_cpus(); if (cpu_online(cpu)) - err = work_on_cpu(cpu, reload_for_cpu, NULL); + ret = reload_for_cpu(cpu); put_online_cpus(); } - if (err) - return err; - return sz; + + if (!ret) + ret = size; + + return ret; } static ssize_t version_show(struct sys_device *dev, @@ -271,11 +342,11 @@ static struct attribute *mc_default_attrs[] = { }; static struct attribute_group mc_attr_group = { - .attrs = mc_default_attrs, - .name = "microcode", + .attrs = mc_default_attrs, + .name = "microcode", }; -static void __microcode_fini_cpu(int cpu) +static void microcode_fini_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -283,103 +354,68 @@ static void __microcode_fini_cpu(int cpu) uci->valid = 0; } -static void microcode_fini_cpu(int cpu) -{ - mutex_lock(µcode_mutex); - __microcode_fini_cpu(cpu); - mutex_unlock(µcode_mutex); -} - -static void collect_cpu_info(int cpu) +static enum ucode_state microcode_resume_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - memset(uci, 0, sizeof(*uci)); - if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig)) - uci->valid = 1; + if (!uci->mc) + return UCODE_NFOUND; + + pr_debug("microcode: CPU%d updated upon resume\n", cpu); + apply_microcode_on_target(cpu); + + return UCODE_OK; } -static int microcode_resume_cpu(int cpu) +static enum ucode_state microcode_init_cpu(int cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - struct cpu_signature nsig; + enum ucode_state ustate; - pr_debug("microcode: CPU%d resumed\n", cpu); + if (collect_cpu_info(cpu)) + return UCODE_ERROR; - if (!uci->mc) - return 1; + /* --dimm. Trigger a delayed update? */ + if (system_state != SYSTEM_RUNNING) + return UCODE_NFOUND; - /* - * Let's verify that the 'cached' ucode does belong - * to this cpu (a bit of paranoia): - */ - if (microcode_ops->collect_cpu_info(cpu, &nsig)) { - __microcode_fini_cpu(cpu); - printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n", - cpu); - return -1; - } + ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev); - if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) { - __microcode_fini_cpu(cpu); - printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n", - cpu); - /* Should we look for a new ucode here? */ - return 1; + if (ustate == UCODE_OK) { + pr_debug("microcode: CPU%d updated upon init\n", cpu); + apply_microcode_on_target(cpu); } - return 0; + return ustate; } -static long microcode_update_cpu(void *unused) +static enum ucode_state microcode_update_cpu(int cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); - int err = 0; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + enum ucode_state ustate; - /* - * Check if the system resume is in progress (uci->valid != NULL), - * otherwise just request a firmware: - */ - if (uci->valid) { - err = microcode_resume_cpu(smp_processor_id()); - } else { - collect_cpu_info(smp_processor_id()); - if (uci->valid && system_state == SYSTEM_RUNNING) - err = microcode_ops->request_microcode_fw( - smp_processor_id(), - µcode_pdev->dev); - } - if (!err) - microcode_ops->apply_microcode(smp_processor_id()); - return err; -} + if (uci->valid) + ustate = microcode_resume_cpu(cpu); + else + ustate = microcode_init_cpu(cpu); -static int microcode_init_cpu(int cpu) -{ - int err; - mutex_lock(µcode_mutex); - err = work_on_cpu(cpu, microcode_update_cpu, NULL); - mutex_unlock(µcode_mutex); - - return err; + return ustate; } static int mc_sysdev_add(struct sys_device *sys_dev) { int err, cpu = sys_dev->id; - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; if (!cpu_online(cpu)) return 0; pr_debug("microcode: CPU%d added\n", cpu); - memset(uci, 0, sizeof(*uci)); err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); if (err) return err; - err = microcode_init_cpu(cpu); + if (microcode_init_cpu(cpu) == UCODE_ERROR) + err = -EINVAL; return err; } @@ -400,19 +436,30 @@ static int mc_sysdev_remove(struct sys_device *sys_dev) static int mc_sysdev_resume(struct sys_device *dev) { int cpu = dev->id; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; if (!cpu_online(cpu)) return 0; - /* only CPU 0 will apply ucode here */ - microcode_update_cpu(NULL); + /* + * All non-bootup cpus are still disabled, + * so only CPU 0 will apply ucode here. + * + * Moreover, there can be no concurrent + * updates from any other places at this point. + */ + WARN_ON(cpu != 0); + + if (uci->valid && uci->mc) + microcode_ops->apply_microcode(cpu); + return 0; } static struct sysdev_driver mc_sysdev_driver = { - .add = mc_sysdev_add, - .remove = mc_sysdev_remove, - .resume = mc_sysdev_resume, + .add = mc_sysdev_add, + .remove = mc_sysdev_remove, + .resume = mc_sysdev_resume, }; static __cpuinit int @@ -425,15 +472,12 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - if (microcode_init_cpu(cpu)) - printk(KERN_ERR "microcode: failed to init CPU%d\n", - cpu); + microcode_update_cpu(cpu); case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: pr_debug("microcode: CPU%d added\n", cpu); if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) - printk(KERN_ERR "microcode: Failed to create the sysfs " - "group for CPU%d\n", cpu); + pr_err("microcode: Failed to create group for CPU%d\n", cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: @@ -465,13 +509,10 @@ static int __init microcode_init(void) microcode_ops = init_amd_microcode(); if (!microcode_ops) { - printk(KERN_ERR "microcode: no support for this CPU vendor\n"); + pr_err("microcode: no support for this CPU vendor\n"); return -ENODEV; } - error = microcode_dev_init(); - if (error) - return error; microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0); if (IS_ERR(microcode_pdev)) { @@ -480,23 +521,31 @@ static int __init microcode_init(void) } get_online_cpus(); + mutex_lock(µcode_mutex); + error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); + + mutex_unlock(µcode_mutex); put_online_cpus(); + if (error) { - microcode_dev_exit(); platform_device_unregister(microcode_pdev); return error; } + error = microcode_dev_init(); + if (error) + return error; + register_hotcpu_notifier(&mc_cpu_notifier); - printk(KERN_INFO - "Microcode Update Driver: v" MICROCODE_VERSION + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>," " Peter Oruba\n"); return 0; } +module_init(microcode_init); static void __exit microcode_exit(void) { @@ -505,16 +554,17 @@ static void __exit microcode_exit(void) unregister_hotcpu_notifier(&mc_cpu_notifier); get_online_cpus(); + mutex_lock(µcode_mutex); + sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); + + mutex_unlock(µcode_mutex); put_online_cpus(); platform_device_unregister(microcode_pdev); microcode_ops = NULL; - printk(KERN_INFO - "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); } - -module_init(microcode_init); module_exit(microcode_exit); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 149b9ec7c1a..0d334ddd0a9 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -70,24 +70,11 @@ * Fix sigmatch() macro to handle old CPUs with pf == 0. * Thanks to Stuart Swales for pointing out this bug. */ -#include <linux/platform_device.h> -#include <linux/capability.h> -#include <linux/miscdevice.h> #include <linux/firmware.h> -#include <linux/smp_lock.h> -#include <linux/spinlock.h> -#include <linux/cpumask.h> #include <linux/uaccess.h> -#include <linux/vmalloc.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/mutex.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/cpu.h> -#include <linux/fs.h> -#include <linux/mm.h> +#include <linux/vmalloc.h> #include <asm/microcode.h> #include <asm/processor.h> @@ -150,13 +137,9 @@ struct extended_sigtable { #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) -/* serialize access to the physical write to MSR 0x79 */ -static DEFINE_SPINLOCK(microcode_update_lock); - static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu_num); - unsigned long flags; unsigned int val[2]; memset(csig, 0, sizeof(*csig)); @@ -176,18 +159,14 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) csig->pf = 1 << ((val[1] >> 18) & 7); } - /* serialize access to the physical write to MSR 0x79 */ - spin_lock_irqsave(µcode_update_lock, flags); - wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* see notes above for revision 1.07. Apparent chip bug */ sync_core(); /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); - spin_unlock_irqrestore(µcode_update_lock, flags); - pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", - csig->sig, csig->pf, csig->rev); + printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", + cpu_num, csig->sig, csig->pf, csig->rev); return 0; } @@ -318,11 +297,10 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev) return 0; } -static void apply_microcode(int cpu) +static int apply_microcode(int cpu) { struct microcode_intel *mc_intel; struct ucode_cpu_info *uci; - unsigned long flags; unsigned int val[2]; int cpu_num; @@ -334,10 +312,7 @@ static void apply_microcode(int cpu) BUG_ON(cpu_num != cpu); if (mc_intel == NULL) - return; - - /* serialize access to the physical write to MSR 0x79 */ - spin_lock_irqsave(µcode_update_lock, flags); + return 0; /* write microcode via MSR 0x79 */ wrmsr(MSR_IA32_UCODE_WRITE, @@ -351,30 +326,32 @@ static void apply_microcode(int cpu) /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - spin_unlock_irqrestore(µcode_update_lock, flags); if (val[1] != mc_intel->hdr.rev) { - printk(KERN_ERR "microcode: CPU%d update from revision " - "0x%x to 0x%x failed\n", - cpu_num, uci->cpu_sig.rev, val[1]); - return; + printk(KERN_ERR "microcode: CPU%d update " + "to revision 0x%x failed\n", + cpu_num, mc_intel->hdr.rev); + return -1; } - printk(KERN_INFO "microcode: CPU%d updated from revision " - "0x%x to 0x%x, date = %04x-%02x-%02x \n", - cpu_num, uci->cpu_sig.rev, val[1], + printk(KERN_INFO "microcode: CPU%d updated to revision " + "0x%x, date = %04x-%02x-%02x \n", + cpu_num, val[1], mc_intel->hdr.date & 0xffff, mc_intel->hdr.date >> 24, (mc_intel->hdr.date >> 16) & 0xff); uci->cpu_sig.rev = val[1]; + + return 0; } -static int generic_load_microcode(int cpu, void *data, size_t size, - int (*get_ucode_data)(void *, const void *, size_t)) +static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, + int (*get_ucode_data)(void *, const void *, size_t)) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; u8 *ucode_ptr = data, *new_mc = NULL, *mc; int new_rev = uci->cpu_sig.rev; unsigned int leftover = size; + enum ucode_state state = UCODE_OK; while (leftover) { struct microcode_header_intel mc_header; @@ -412,11 +389,15 @@ static int generic_load_microcode(int cpu, void *data, size_t size, leftover -= mc_size; } - if (!new_mc) + if (leftover) { + if (new_mc) + vfree(new_mc); + state = UCODE_ERROR; goto out; + } - if (leftover) { - vfree(new_mc); + if (!new_mc) { + state = UCODE_NFOUND; goto out; } @@ -427,9 +408,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size, pr_debug("microcode: CPU%d found a matching microcode update with" " version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); - - out: - return (int)leftover; +out: + return state; } static int get_ucode_fw(void *to, const void *from, size_t n) @@ -438,21 +418,19 @@ static int get_ucode_fw(void *to, const void *from, size_t n) return 0; } -static int request_microcode_fw(int cpu, struct device *device) +static enum ucode_state request_microcode_fw(int cpu, struct device *device) { char name[30]; struct cpuinfo_x86 *c = &cpu_data(cpu); const struct firmware *firmware; - int ret; + enum ucode_state ret; - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); sprintf(name, "intel-ucode/%02x-%02x-%02x", c->x86, c->x86_model, c->x86_mask); - ret = request_firmware(&firmware, name, device); - if (ret) { + + if (request_firmware(&firmware, name, device)) { pr_debug("microcode: data file %s load failed\n", name); - return ret; + return UCODE_NFOUND; } ret = generic_load_microcode(cpu, (void *)firmware->data, @@ -468,11 +446,9 @@ static int get_ucode_user(void *to, const void *from, size_t n) return copy_from_user(to, from, n); } -static int request_microcode_user(int cpu, const void __user *buf, size_t size) +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) { - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); - return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); } diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module.c index c23880b90b5..89f386f044e 100644 --- a/arch/x86/kernel/module_64.c +++ b/arch/x86/kernel/module.c @@ -1,6 +1,5 @@ -/* Kernel module help for x86-64 +/* Kernel module help for x86. Copyright (C) 2001 Rusty Russell. - Copyright (C) 2002,2003 Andi Kleen, SuSE Labs. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -22,23 +21,18 @@ #include <linux/fs.h> #include <linux/string.h> #include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/slab.h> #include <linux/bug.h> +#include <linux/mm.h> #include <asm/system.h> #include <asm/page.h> #include <asm/pgtable.h> +#if 0 +#define DEBUGP printk +#else #define DEBUGP(fmt...) - -#ifndef CONFIG_UML -void module_free(struct module *mod, void *module_region) -{ - vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ -} +#endif void *module_alloc(unsigned long size) { @@ -54,9 +48,15 @@ void *module_alloc(unsigned long size) if (!area) return NULL; - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); + return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM, + PAGE_KERNEL_EXEC); +} + +/* Free memory returned from module_alloc */ +void module_free(struct module *mod, void *module_region) +{ + vfree(module_region); } -#endif /* We don't need anything special. */ int module_frob_arch_sections(Elf_Ehdr *hdr, @@ -67,6 +67,58 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, return 0; } +#ifdef CONFIG_X86_32 +int apply_relocate(Elf32_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + unsigned int i; + Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr; + Elf32_Sym *sym; + uint32_t *location; + + DEBUGP("Applying relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + /* This is where to make the change */ + location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rel[i].r_offset; + /* This is the symbol it is referring to. Note that all + undefined symbols have been resolved. */ + sym = (Elf32_Sym *)sechdrs[symindex].sh_addr + + ELF32_R_SYM(rel[i].r_info); + + switch (ELF32_R_TYPE(rel[i].r_info)) { + case R_386_32: + /* We add the value into the location given */ + *location += sym->st_value; + break; + case R_386_PC32: + /* Add the value, subtract its postition */ + *location += sym->st_value - (uint32_t)location; + break; + default: + printk(KERN_ERR "module %s: Unknown relocation: %u\n", + me->name, ELF32_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; +} + +int apply_relocate_add(Elf32_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n", + me->name); + return -ENOEXEC; +} +#else /*X86_64*/ int apply_relocate_add(Elf64_Shdr *sechdrs, const char *strtab, unsigned int symindex, @@ -147,6 +199,8 @@ int apply_relocate(Elf_Shdr *sechdrs, return -ENOSYS; } +#endif + int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c deleted file mode 100644 index 0edd819050e..00000000000 --- a/arch/x86/kernel/module_32.c +++ /dev/null @@ -1,152 +0,0 @@ -/* Kernel module help for i386. - Copyright (C) 2001 Rusty Russell. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -*/ -#include <linux/moduleloader.h> -#include <linux/elf.h> -#include <linux/vmalloc.h> -#include <linux/fs.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/bug.h> - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(fmt...) -#endif - -void *module_alloc(unsigned long size) -{ - if (size == 0) - return NULL; - return vmalloc_exec(size); -} - - -/* Free memory returned from module_alloc */ -void module_free(struct module *mod, void *module_region) -{ - vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ -} - -/* We don't need anything special. */ -int module_frob_arch_sections(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - char *secstrings, - struct module *mod) -{ - return 0; -} - -int apply_relocate(Elf32_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - unsigned int i; - Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr; - Elf32_Sym *sym; - uint32_t *location; - - DEBUGP("Applying relocate section %u to %u\n", relsec, - sechdrs[relsec].sh_info); - for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { - /* This is where to make the change */ - location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr - + rel[i].r_offset; - /* This is the symbol it is referring to. Note that all - undefined symbols have been resolved. */ - sym = (Elf32_Sym *)sechdrs[symindex].sh_addr - + ELF32_R_SYM(rel[i].r_info); - - switch (ELF32_R_TYPE(rel[i].r_info)) { - case R_386_32: - /* We add the value into the location given */ - *location += sym->st_value; - break; - case R_386_PC32: - /* Add the value, subtract its postition */ - *location += sym->st_value - (uint32_t)location; - break; - default: - printk(KERN_ERR "module %s: Unknown relocation: %u\n", - me->name, ELF32_R_TYPE(rel[i].r_info)); - return -ENOEXEC; - } - } - return 0; -} - -int apply_relocate_add(Elf32_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n", - me->name); - return -ENOEXEC; -} - -int module_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *me) -{ - const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, - *para = NULL; - char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - - for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { - if (!strcmp(".text", secstrings + s->sh_name)) - text = s; - if (!strcmp(".altinstructions", secstrings + s->sh_name)) - alt = s; - if (!strcmp(".smp_locks", secstrings + s->sh_name)) - locks = s; - if (!strcmp(".parainstructions", secstrings + s->sh_name)) - para = s; - } - - if (alt) { - /* patch .altinstructions */ - void *aseg = (void *)alt->sh_addr; - apply_alternatives(aseg, aseg + alt->sh_size); - } - if (locks && text) { - void *lseg = (void *)locks->sh_addr; - void *tseg = (void *)text->sh_addr; - alternatives_smp_module_add(me, me->name, - lseg, lseg + locks->sh_size, - tseg, tseg + text->sh_size); - } - - if (para) { - void *pseg = (void *)para->sh_addr; - apply_paravirt(pseg, pseg + para->sh_size); - } - - return module_bug_finalize(hdr, sechdrs, me); -} - -void module_arch_cleanup(struct module *mod) -{ - alternatives_smp_module_del(mod); - module_bug_cleanup(mod); -} diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 70fd7e414c1..651c93b2886 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -17,6 +17,7 @@ #include <linux/acpi.h> #include <linux/module.h> #include <linux/smp.h> +#include <linux/pci.h> #include <asm/mtrr.h> #include <asm/mpspec.h> @@ -870,24 +871,17 @@ static inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} #endif /* CONFIG_X86_IO_APIC */ -static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, - int count) +static int +check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) { - if (!mpc_new_phys) { - pr_info("No spare slots, try to append...take your risk, " - "new mpc_length %x\n", count); - } else { - if (count <= mpc_new_length) - pr_info("No spare slots, try to append..., " - "new mpc_length %x\n", count); - else { - pr_err("mpc_new_length %lx is too small\n", - mpc_new_length); - return -1; - } + int ret = 0; + + if (!mpc_new_phys || count <= mpc_new_length) { + WARN(1, "update_mptable: No spare slots (length: %x)\n", count); + return -1; } - return 0; + return ret; } static int __init replace_intsrc_all(struct mpc_table *mpc, @@ -946,7 +940,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, } else { struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; count += sizeof(struct mpc_intsrc); - if (!check_slot(mpc_new_phys, mpc_new_length, count)) + if (check_slot(mpc_new_phys, mpc_new_length, count) < 0) goto out; assign_to_mpc_intsrc(&mp_irqs[i], m); mpc->length = count; @@ -963,11 +957,14 @@ out: return 0; } -static int __initdata enable_update_mptable; +int enable_update_mptable; static int __init update_mptable_setup(char *str) { enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif return 0; } early_param("update_mptable", update_mptable_setup); @@ -980,6 +977,9 @@ static int __initdata alloc_mptable; static int __init parse_alloc_mptable_opt(char *p) { enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif alloc_mptable = 1; if (!p) return 0; diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 3cf3413ec62..98fd6cd4e3a 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -196,6 +196,11 @@ static struct notifier_block __refdata msr_class_cpu_notifier = { .notifier_call = msr_class_cpu_callback, }; +static char *msr_nodename(struct device *dev) +{ + return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); +} + static int __init msr_init(void) { int i, err = 0; @@ -212,6 +217,7 @@ static int __init msr_init(void) err = PTR_ERR(msr_class); goto out_chrdev; } + msr_class->nodename = msr_nodename; for_each_online_cpu(i) { err = msr_device_create(i); if (err != 0) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 9faf43bea33..70ec9b951d7 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -248,18 +248,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA static inline void enter_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); - BUG_ON(preemptible()); + BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); - __get_cpu_var(paravirt_lazy_mode) = mode; + percpu_write(paravirt_lazy_mode, mode); } -void paravirt_leave_lazy(enum paravirt_lazy_mode mode) +static void leave_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); - BUG_ON(preemptible()); + BUG_ON(percpu_read(paravirt_lazy_mode) != mode); - __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; + percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); } void paravirt_enter_lazy_mmu(void) @@ -269,22 +267,36 @@ void paravirt_enter_lazy_mmu(void) void paravirt_leave_lazy_mmu(void) { - paravirt_leave_lazy(PARAVIRT_LAZY_MMU); + leave_lazy(PARAVIRT_LAZY_MMU); } -void paravirt_enter_lazy_cpu(void) +void paravirt_start_context_switch(struct task_struct *prev) { + BUG_ON(preemptible()); + + if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { + arch_leave_lazy_mmu_mode(); + set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); + } enter_lazy(PARAVIRT_LAZY_CPU); } -void paravirt_leave_lazy_cpu(void) +void paravirt_end_context_switch(struct task_struct *next) { - paravirt_leave_lazy(PARAVIRT_LAZY_CPU); + BUG_ON(preemptible()); + + leave_lazy(PARAVIRT_LAZY_CPU); + + if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) + arch_enter_lazy_mmu_mode(); } enum paravirt_lazy_mode paravirt_get_lazy_mode(void) { - return __get_cpu_var(paravirt_lazy_mode); + if (in_interrupt()) + return PARAVIRT_LAZY_NONE; + + return percpu_read(paravirt_lazy_mode); } void arch_flush_lazy_mmu_mode(void) @@ -292,7 +304,6 @@ void arch_flush_lazy_mmu_mode(void) preempt_disable(); if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { - WARN_ON(preempt_count() == 1); arch_leave_lazy_mmu_mode(); arch_enter_lazy_mmu_mode(); } @@ -300,19 +311,6 @@ void arch_flush_lazy_mmu_mode(void) preempt_enable(); } -void arch_flush_lazy_cpu_mode(void) -{ - preempt_disable(); - - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { - WARN_ON(preempt_count() == 1); - arch_leave_lazy_cpu_mode(); - arch_enter_lazy_cpu_mode(); - } - - preempt_enable(); -} - struct pv_info pv_info = { .name = "bare hardware", .paravirt_enabled = 0, @@ -404,10 +402,8 @@ struct pv_cpu_ops pv_cpu_ops = { .set_iopl_mask = native_set_iopl_mask, .io_delay = native_io_delay, - .lazy_mode = { - .enter = paravirt_nop, - .leave = paravirt_nop, - }, + .start_context_switch = paravirt_nop, + .end_context_switch = paravirt_nop, }; struct pv_apic_ops pv_apic_ops = { diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 755c21e906f..971a3bec47a 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -186,37 +186,6 @@ static struct cal_chipset_ops calioc2_chip_ops = { static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; -/* enable this to stress test the chip's TCE cache */ -#ifdef CONFIG_IOMMU_DEBUG -static int debugging = 1; - -static inline unsigned long verify_bit_range(unsigned long* bitmap, - int expected, unsigned long start, unsigned long end) -{ - unsigned long idx = start; - - BUG_ON(start >= end); - - while (idx < end) { - if (!!test_bit(idx, bitmap) != expected) - return idx; - ++idx; - } - - /* all bits have the expected value */ - return ~0UL; -} -#else /* debugging is disabled */ -static int debugging; - -static inline unsigned long verify_bit_range(unsigned long* bitmap, - int expected, unsigned long start, unsigned long end) -{ - return ~0UL; -} - -#endif /* CONFIG_IOMMU_DEBUG */ - static inline int translation_enabled(struct iommu_table *tbl) { /* only PHBs with translation enabled have an IOMMU table */ @@ -228,7 +197,6 @@ static void iommu_range_reserve(struct iommu_table *tbl, { unsigned long index; unsigned long end; - unsigned long badbit; unsigned long flags; index = start_addr >> PAGE_SHIFT; @@ -243,14 +211,6 @@ static void iommu_range_reserve(struct iommu_table *tbl, spin_lock_irqsave(&tbl->it_lock, flags); - badbit = verify_bit_range(tbl->it_map, 0, index, end); - if (badbit != ~0UL) { - if (printk_ratelimit()) - printk(KERN_ERR "Calgary: entry already allocated at " - "0x%lx tbl %p dma 0x%lx npages %u\n", - badbit, tbl, start_addr, npages); - } - iommu_area_reserve(tbl->it_map, index, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); @@ -326,7 +286,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned int npages) { unsigned long entry; - unsigned long badbit; unsigned long badend; unsigned long flags; @@ -346,14 +305,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, spin_lock_irqsave(&tbl->it_lock, flags); - badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages); - if (badbit != ~0UL) { - if (printk_ratelimit()) - printk(KERN_ERR "Calgary: bit is off at 0x%lx " - "tbl %p dma 0x%Lx entry 0x%lx npages %u\n", - badbit, tbl, dma_addr, entry, npages); - } - iommu_area_free(tbl->it_map, entry, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); @@ -1488,9 +1439,8 @@ void __init detect_calgary(void) iommu_detected = 1; calgary_detected = 1; printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); - printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " - "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, - debugging ? "enabled" : "disabled"); + printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", + specified_table_size); /* swiotlb for devices that aren't behind the Calgary. */ if (max_pfn > MAX_DMA32_PFN) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 745579bc825..1a041bcf506 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -32,6 +32,8 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; +int iommu_pass_through; + dma_addr_t bad_dma_address __read_mostly = 0; EXPORT_SYMBOL(bad_dma_address); @@ -210,6 +212,10 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "soft", 4)) swiotlb = 1; #endif + if (!strncmp(p, "pt", 2)) { + iommu_pass_through = 1; + return 1; + } gart_parse_options(p); @@ -290,6 +296,8 @@ static int __init pci_iommu_init(void) void pci_iommu_shutdown(void) { gart_iommu_shutdown(); + + amd_iommu_shutdown(); } /* Must execute after PCI subsystem */ fs_initcall(pci_iommu_init); diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index b284b58c035..d2e56b8f48e 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -144,48 +144,21 @@ static void flush_gart(void) } #ifdef CONFIG_IOMMU_LEAK - -#define SET_LEAK(x) \ - do { \ - if (iommu_leak_tab) \ - iommu_leak_tab[x] = __builtin_return_address(0);\ - } while (0) - -#define CLEAR_LEAK(x) \ - do { \ - if (iommu_leak_tab) \ - iommu_leak_tab[x] = NULL; \ - } while (0) - /* Debugging aid for drivers that don't free their IOMMU tables */ -static void **iommu_leak_tab; static int leak_trace; static int iommu_leak_pages = 20; static void dump_leak(void) { - int i; static int dump; - if (dump || !iommu_leak_tab) + if (dump) return; dump = 1; - show_stack(NULL, NULL); - /* Very crude. dump some from the end of the table too */ - printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n", - iommu_leak_pages); - for (i = 0; i < iommu_leak_pages; i += 2) { - printk(KERN_DEBUG "%lu: ", iommu_pages-i); - printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], - 0); - printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); - } - printk(KERN_DEBUG "\n"); + show_stack(NULL, NULL); + debug_dma_dump_mappings(NULL); } -#else -# define SET_LEAK(x) -# define CLEAR_LEAK(x) #endif static void iommu_full(struct device *dev, size_t size, int dir) @@ -248,7 +221,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); - SET_LEAK(iommu_page + i); phys_mem += PAGE_SIZE; } return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); @@ -294,7 +266,6 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr, npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; - CLEAR_LEAK(iommu_page + i); } free_iommu(iommu_page, npages); } @@ -377,7 +348,6 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); while (pages--) { iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); - SET_LEAK(iommu_page); addr += PAGE_SIZE; iommu_page++; } @@ -688,8 +658,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info) agp_gatt_table = gatt; - enable_gart_translations(); - error = sysdev_class_register(&gart_sysdev_class); if (!error) error = sysdev_register(&device_gart); @@ -707,7 +675,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) nommu: /* Should not happen anymore */ printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" - KERN_WARNING "falling back to iommu=soft.\n"); + "falling back to iommu=soft.\n"); return -1; } @@ -801,11 +769,12 @@ void __init gart_iommu_init(void) #ifdef CONFIG_IOMMU_LEAK if (leak_trace) { - iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, - get_order(iommu_pages*sizeof(void *))); - if (!iommu_leak_tab) + int ret; + + ret = dma_debug_resize_entries(iommu_pages); + if (ret) printk(KERN_DEBUG - "PCI-DMA: Cannot allocate leak trace area\n"); + "PCI-DMA: Cannot trace all the entries\n"); } #endif @@ -845,6 +814,14 @@ void __init gart_iommu_init(void) * the pages as Not-Present: */ wbinvd(); + + /* + * Now all caches are flushed and we can safely enable + * GART hardware. Doing it early leaves the possibility + * of stale cache entries that can lead to GART PTE + * errors. + */ + enable_gart_translations(); /* * Try to workaround a bug (thanks to BenH): diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 221a3853e26..6af96ee4420 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -28,7 +28,7 @@ dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) return paddr; } -phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) +phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) { return baddr; } @@ -71,7 +71,8 @@ void __init pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 - if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) + if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) || + iommu_pass_through) swiotlb = 1; #endif if (swiotlb_force) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ca989158e84..071166a4ba8 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -8,12 +8,15 @@ #include <linux/module.h> #include <linux/pm.h> #include <linux/clockchips.h> +#include <linux/random.h> #include <trace/power.h> #include <asm/system.h> #include <asm/apic.h> +#include <asm/syscalls.h> #include <asm/idle.h> #include <asm/uaccess.h> #include <asm/i387.h> +#include <asm/ds.h> unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -45,6 +48,8 @@ void free_thread_xstate(struct task_struct *tsk) kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); tsk->thread.xstate = NULL; } + + WARN(tsk->thread.ds_ctx, "leaking DS context\n"); } void free_thread_info(struct thread_info *ti) @@ -58,7 +63,7 @@ void arch_task_cache_init(void) task_xstate_cachep = kmem_cache_create("task_xstate", xstate_size, __alignof__(union thread_xstate), - SLAB_PANIC, NULL); + SLAB_PANIC | SLAB_NOTRACK, NULL); } /* @@ -83,8 +88,6 @@ void exit_thread(void) put_cpu(); kfree(bp); } - - ds_exit_thread(current); } void flush_thread(void) @@ -516,16 +519,12 @@ static void c1e_idle(void) if (!cpumask_test_cpu(cpu, c1e_mask)) { cpumask_set_cpu(cpu, c1e_mask); /* - * Force broadcast so ACPI can not interfere. Needs - * to run with interrupts enabled as it uses - * smp_function_call. + * Force broadcast so ACPI can not interfere. */ - local_irq_enable(); clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &cpu); printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", cpu); - local_irq_disable(); } clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); @@ -613,3 +612,16 @@ static int __init idle_setup(char *str) } early_param("idle", idle_setup); +unsigned long arch_align_stack(unsigned long sp) +{ + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + sp -= get_random_int() % 8192; + return sp & ~0xf; +} + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long range_end = mm->brk + 0x02000000; + return randomize_range(mm->brk, range_end, 0) ? : mm->brk; +} + diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 76f8f84043a..59f4524984a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -9,8 +9,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include <stdarg.h> - #include <linux/stackprotector.h> #include <linux/cpu.h> #include <linux/errno.h> @@ -33,7 +31,6 @@ #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/ptrace.h> -#include <linux/random.h> #include <linux/personality.h> #include <linux/tick.h> #include <linux/percpu.h> @@ -290,7 +287,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.io_bitmap_max = 0; } - ds_copy_thread(p, current); + clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); + p->thread.ds_ctx = NULL; clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); p->thread.debugctlmsr = 0; @@ -407,7 +405,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * done before math_state_restore, so the TS bit is up * to date. */ - arch_leave_lazy_cpu_mode(); + arch_end_context_switch(next_p); /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the @@ -497,15 +495,3 @@ unsigned long get_wchan(struct task_struct *p) return 0; } -unsigned long arch_align_stack(unsigned long sp) -{ - if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; - return sp & ~0xf; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - unsigned long range_end = mm->brk + 0x02000000; - return randomize_range(mm->brk, range_end, 0) ? : mm->brk; -} diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b751a41392b..ebefb5407b9 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -14,8 +14,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include <stdarg.h> - #include <linux/stackprotector.h> #include <linux/cpu.h> #include <linux/errno.h> @@ -32,7 +30,6 @@ #include <linux/delay.h> #include <linux/module.h> #include <linux/ptrace.h> -#include <linux/random.h> #include <linux/notifier.h> #include <linux/kprobes.h> #include <linux/kdebug.h> @@ -335,7 +332,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, goto out; } - ds_copy_thread(p, me); + clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); + p->thread.ds_ctx = NULL; clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); p->thread.debugctlmsr = 0; @@ -428,7 +426,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * done before math_state_restore, so the TS bit is up * to date. */ - arch_leave_lazy_cpu_mode(); + arch_end_context_switch(next_p); /* * Switch FS and GS. @@ -660,15 +658,3 @@ long sys_arch_prctl(int code, unsigned long addr) return do_arch_prctl(current, code, addr); } -unsigned long arch_align_stack(unsigned long sp) -{ - if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; - return sp & ~0xf; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - unsigned long range_end = mm->brk + 0x02000000; - return randomize_range(mm->brk, range_end, 0) ? : mm->brk; -} diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 23b7c8f017e..09ecbde91c1 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -21,6 +21,7 @@ #include <linux/audit.h> #include <linux/seccomp.h> #include <linux/signal.h> +#include <linux/workqueue.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -578,17 +579,130 @@ static int ioperm_get(struct task_struct *target, } #ifdef CONFIG_X86_PTRACE_BTS +/* + * A branch trace store context. + * + * Contexts may only be installed by ptrace_bts_config() and only for + * ptraced tasks. + * + * Contexts are destroyed when the tracee is detached from the tracer. + * The actual destruction work requires interrupts enabled, so the + * work is deferred and will be scheduled during __ptrace_unlink(). + * + * Contexts hold an additional task_struct reference on the traced + * task, as well as a reference on the tracer's mm. + * + * Ptrace already holds a task_struct for the duration of ptrace operations, + * but since destruction is deferred, it may be executed after both + * tracer and tracee exited. + */ +struct bts_context { + /* The branch trace handle. */ + struct bts_tracer *tracer; + + /* The buffer used to store the branch trace and its size. */ + void *buffer; + unsigned int size; + + /* The mm that paid for the above buffer. */ + struct mm_struct *mm; + + /* The task this context belongs to. */ + struct task_struct *task; + + /* The signal to send on a bts buffer overflow. */ + unsigned int bts_ovfl_signal; + + /* The work struct to destroy a context. */ + struct work_struct work; +}; + +static int alloc_bts_buffer(struct bts_context *context, unsigned int size) +{ + void *buffer = NULL; + int err = -ENOMEM; + + err = account_locked_memory(current->mm, current->signal->rlim, size); + if (err < 0) + return err; + + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) + goto out_refund; + + context->buffer = buffer; + context->size = size; + context->mm = get_task_mm(current); + + return 0; + + out_refund: + refund_locked_memory(current->mm, size); + return err; +} + +static inline void free_bts_buffer(struct bts_context *context) +{ + if (!context->buffer) + return; + + kfree(context->buffer); + context->buffer = NULL; + + refund_locked_memory(context->mm, context->size); + context->size = 0; + + mmput(context->mm); + context->mm = NULL; +} + +static void free_bts_context_work(struct work_struct *w) +{ + struct bts_context *context; + + context = container_of(w, struct bts_context, work); + + ds_release_bts(context->tracer); + put_task_struct(context->task); + free_bts_buffer(context); + kfree(context); +} + +static inline void free_bts_context(struct bts_context *context) +{ + INIT_WORK(&context->work, free_bts_context_work); + schedule_work(&context->work); +} + +static inline struct bts_context *alloc_bts_context(struct task_struct *task) +{ + struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL); + if (context) { + context->task = task; + task->bts = context; + + get_task_struct(task); + } + + return context; +} + static int ptrace_bts_read_record(struct task_struct *child, size_t index, struct bts_struct __user *out) { + struct bts_context *context; const struct bts_trace *trace; struct bts_struct bts; const unsigned char *at; int error; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; at = trace->ds.top - ((index + 1) * trace->ds.size); if ((void *)at < trace->ds.begin) @@ -597,7 +711,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index, if (!trace->read) return -EOPNOTSUPP; - error = trace->read(child->bts, at, &bts); + error = trace->read(context->tracer, at, &bts); if (error < 0) return error; @@ -611,13 +725,18 @@ static int ptrace_bts_drain(struct task_struct *child, long size, struct bts_struct __user *out) { + struct bts_context *context; const struct bts_trace *trace; const unsigned char *at; int error, drained = 0; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; if (!trace->read) return -EOPNOTSUPP; @@ -628,9 +747,8 @@ static int ptrace_bts_drain(struct task_struct *child, for (at = trace->ds.begin; (void *)at < trace->ds.top; out++, drained++, at += trace->ds.size) { struct bts_struct bts; - int error; - error = trace->read(child->bts, at, &bts); + error = trace->read(context->tracer, at, &bts); if (error < 0) return error; @@ -640,35 +758,18 @@ static int ptrace_bts_drain(struct task_struct *child, memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - error = ds_reset_bts(child->bts); + error = ds_reset_bts(context->tracer); if (error < 0) return error; return drained; } -static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size) -{ - child->bts_buffer = alloc_locked_buffer(size); - if (!child->bts_buffer) - return -ENOMEM; - - child->bts_size = size; - - return 0; -} - -static void ptrace_bts_free_buffer(struct task_struct *child) -{ - free_locked_buffer(child->bts_buffer, child->bts_size); - child->bts_buffer = NULL; - child->bts_size = 0; -} - static int ptrace_bts_config(struct task_struct *child, long cfg_size, const struct ptrace_bts_config __user *ucfg) { + struct bts_context *context; struct ptrace_bts_config cfg; unsigned int flags = 0; @@ -678,28 +779,33 @@ static int ptrace_bts_config(struct task_struct *child, if (copy_from_user(&cfg, ucfg, sizeof(cfg))) return -EFAULT; - if (child->bts) { - ds_release_bts(child->bts); - child->bts = NULL; - } + context = child->bts; + if (!context) + context = alloc_bts_context(child); + if (!context) + return -ENOMEM; if (cfg.flags & PTRACE_BTS_O_SIGNAL) { if (!cfg.signal) return -EINVAL; - child->thread.bts_ovfl_signal = cfg.signal; return -EOPNOTSUPP; + context->bts_ovfl_signal = cfg.signal; } - if ((cfg.flags & PTRACE_BTS_O_ALLOC) && - (cfg.size != child->bts_size)) { - int error; + ds_release_bts(context->tracer); + context->tracer = NULL; - ptrace_bts_free_buffer(child); + if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) { + int err; - error = ptrace_bts_allocate_buffer(child, cfg.size); - if (error < 0) - return error; + free_bts_buffer(context); + if (!cfg.size) + return 0; + + err = alloc_bts_buffer(context, cfg.size); + if (err < 0) + return err; } if (cfg.flags & PTRACE_BTS_O_TRACE) @@ -708,15 +814,14 @@ static int ptrace_bts_config(struct task_struct *child, if (cfg.flags & PTRACE_BTS_O_SCHED) flags |= BTS_TIMESTAMPS; - child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size, - /* ovfl = */ NULL, /* th = */ (size_t)-1, - flags); - if (IS_ERR(child->bts)) { - int error = PTR_ERR(child->bts); - - ptrace_bts_free_buffer(child); - child->bts = NULL; + context->tracer = + ds_request_bts_task(child, context->buffer, context->size, + NULL, (size_t)-1, flags); + if (unlikely(IS_ERR(context->tracer))) { + int error = PTR_ERR(context->tracer); + free_bts_buffer(context); + context->tracer = NULL; return error; } @@ -727,20 +832,25 @@ static int ptrace_bts_status(struct task_struct *child, long cfg_size, struct ptrace_bts_config __user *ucfg) { + struct bts_context *context; const struct bts_trace *trace; struct ptrace_bts_config cfg; + context = child->bts; + if (!context) + return -ESRCH; + if (cfg_size < sizeof(cfg)) return -EIO; - trace = ds_read_bts(child->bts); + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; memset(&cfg, 0, sizeof(cfg)); - cfg.size = trace->ds.end - trace->ds.begin; - cfg.signal = child->thread.bts_ovfl_signal; - cfg.bts_size = sizeof(struct bts_struct); + cfg.size = trace->ds.end - trace->ds.begin; + cfg.signal = context->bts_ovfl_signal; + cfg.bts_size = sizeof(struct bts_struct); if (cfg.signal) cfg.flags |= PTRACE_BTS_O_SIGNAL; @@ -759,80 +869,51 @@ static int ptrace_bts_status(struct task_struct *child, static int ptrace_bts_clear(struct task_struct *child) { + struct bts_context *context; const struct bts_trace *trace; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - return ds_reset_bts(child->bts); + return ds_reset_bts(context->tracer); } static int ptrace_bts_size(struct task_struct *child) { + struct bts_context *context; const struct bts_trace *trace; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; return (trace->ds.top - trace->ds.begin) / trace->ds.size; } -static void ptrace_bts_fork(struct task_struct *tsk) -{ - tsk->bts = NULL; - tsk->bts_buffer = NULL; - tsk->bts_size = 0; - tsk->thread.bts_ovfl_signal = 0; -} - -static void ptrace_bts_untrace(struct task_struct *child) +/* + * Called from __ptrace_unlink() after the child has been moved back + * to its original parent. + */ +void ptrace_bts_untrace(struct task_struct *child) { if (unlikely(child->bts)) { - ds_release_bts(child->bts); + free_bts_context(child->bts); child->bts = NULL; - - /* We cannot update total_vm and locked_vm since - child's mm is already gone. But we can reclaim the - memory. */ - kfree(child->bts_buffer); - child->bts_buffer = NULL; - child->bts_size = 0; } } - -static void ptrace_bts_detach(struct task_struct *child) -{ - /* - * Ptrace_detach() races with ptrace_untrace() in case - * the child dies and is reaped by another thread. - * - * We only do the memory accounting at this point and - * leave the buffer deallocation and the bts tracer - * release to ptrace_bts_untrace() which will be called - * later on with tasklist_lock held. - */ - release_locked_buffer(child->bts_buffer, child->bts_size); -} -#else -static inline void ptrace_bts_fork(struct task_struct *tsk) {} -static inline void ptrace_bts_detach(struct task_struct *child) {} -static inline void ptrace_bts_untrace(struct task_struct *child) {} #endif /* CONFIG_X86_PTRACE_BTS */ -void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags) -{ - ptrace_bts_fork(child); -} - -void x86_ptrace_untrace(struct task_struct *child) -{ - ptrace_bts_untrace(child); -} - /* * Called by kernel/ptrace.c when detaching.. * @@ -844,7 +925,6 @@ void ptrace_disable(struct task_struct *child) #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif - ptrace_bts_detach(child); } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 4f9c55f3a7c..03801f2f761 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -60,7 +60,7 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) "adc %5,%%edx ; " : "=A" (product), "=r" (tmp1), "=r" (tmp2) : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); -#elif __x86_64__ +#elif defined(__x86_64__) __asm__ ( "mul %%rdx ; shrd $32,%%rdx,%%rax" : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 7563b31b4f0..af71d06624b 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -491,5 +491,42 @@ void force_hpet_resume(void) break; } } +#endif + +#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) +/* Set correct numa_node information for AMD NB functions */ +static void __init quirk_amd_nb_node(struct pci_dev *dev) +{ + struct pci_dev *nb_ht; + unsigned int devfn; + u32 val; + + devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); + nb_ht = pci_get_slot(dev->bus, devfn); + if (!nb_ht) + return; + + pci_read_config_dword(nb_ht, 0x60, &val); + set_dev_node(&dev->dev, val & 7); + pci_dev_put(dev); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK, + quirk_amd_nb_node); #endif diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 667188e0b5a..a06e8d10184 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -3,6 +3,7 @@ #include <linux/init.h> #include <linux/pm.h> #include <linux/efi.h> +#include <linux/dmi.h> #include <acpi/reboot.h> #include <asm/io.h> #include <asm/apic.h> @@ -17,7 +18,6 @@ #include <asm/cpu.h> #ifdef CONFIG_X86_32 -# include <linux/dmi.h> # include <linux/ctype.h> # include <linux/mc146818rtc.h> #else @@ -192,6 +192,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0KP561"), }, }, + { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 360", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"), + DMI_MATCH(DMI_BOARD_NAME, "0T656F"), + }, + }, { /* Handle problems with rebooting on Dell 2400's */ .callback = set_bios_reboot, .ident = "Dell PowerEdge 2400", @@ -240,6 +249,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), }, }, + { /* Handle problems with rebooting on CompuLab SBC-FITPC2 */ + .callback = set_bios_reboot, + .ident = "CompuLab SBC-FITPC2", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "CompuLab"), + DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), + }, + }, { } }; @@ -387,6 +404,46 @@ EXPORT_SYMBOL(machine_real_restart); #endif /* CONFIG_X86_32 */ +/* + * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot + */ +static int __init set_pci_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_CF9) { + reboot_type = BOOT_CF9; + printk(KERN_INFO "%s series board detected. " + "Selecting PCI-method for reboots.\n", d->ident); + } + return 0; +} + +static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { + { /* Handle problems with rebooting on Apple MacBook5 */ + .callback = set_pci_reboot, + .ident = "Apple MacBook5", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"), + }, + }, + { /* Handle problems with rebooting on Apple MacBookPro5 */ + .callback = set_pci_reboot, + .ident = "Apple MacBookPro5", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), + }, + }, + { } +}; + +static int __init pci_reboot_init(void) +{ + dmi_check_system(pci_reboot_dmi_table); + return 0; +} +core_initcall(pci_reboot_init); + static inline void kb_wait(void) { int i; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4158439bf6..63f32d220ef 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -112,6 +112,14 @@ #define ARCH_SETUP #endif +/* + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. + * The direct mapping extends to max_pfn_mapped, so that we can directly access + * apertures, ACPI and other tables without having to play with fixmaps. + */ +unsigned long max_low_pfn_mapped; +unsigned long max_pfn_mapped; + RESERVE_BRK(dmi_alloc, 65536); unsigned int boot_cpu_id __read_mostly; @@ -214,8 +222,8 @@ unsigned long mmu_cr4_features; unsigned long mmu_cr4_features = X86_CR4_PAE; #endif -/* Boot loader ID as an integer, for the benefit of proc_dointvec */ -int bootloader_type; +/* Boot loader ID and version as integers, for the benefit of proc_dointvec */ +int bootloader_type, bootloader_version; /* * Setup options @@ -281,6 +289,20 @@ void * __init extend_brk(size_t size, size_t align) return ret; } +#ifdef CONFIG_X86_64 +static void __init init_gbpages(void) +{ + if (direct_gbpages && cpu_has_gbpages) + printk(KERN_INFO "Using GB pages for direct mapping\n"); + else + direct_gbpages = 0; +} +#else +static inline void init_gbpages(void) +{ +} +#endif + static void __init reserve_brk(void) { if (_brk_end > _brk_start) @@ -293,15 +315,13 @@ static void __init reserve_brk(void) #ifdef CONFIG_BLK_DEV_INITRD -#ifdef CONFIG_X86_32 - #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) static void __init relocate_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; + u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; u64 ramdisk_here; unsigned long slop, clen, mapaddr; char *p, *q; @@ -357,14 +377,13 @@ static void __init relocate_initrd(void) ramdisk_image, ramdisk_image + ramdisk_size - 1, ramdisk_here, ramdisk_here + ramdisk_size - 1); } -#endif static void __init reserve_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; u64 ramdisk_end = ramdisk_image + ramdisk_size; - u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; + u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) @@ -394,14 +413,8 @@ static void __init reserve_initrd(void) return; } -#ifdef CONFIG_X86_32 relocate_initrd(); -#else - printk(KERN_ERR "initrd extends beyond end of memory " - "(0x%08llx > 0x%08llx)\ndisabling initrd\n", - ramdisk_end, end_of_lowmem); - initrd_start = 0; -#endif + free_early(ramdisk_image, ramdisk_end); } #else @@ -659,6 +672,19 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"), }, }, + { + /* + * AMI BIOS with low memory corruption was found on Intel DG45ID board. + * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will + * match only DMI_BOARD_NAME and see if there is more bad products + * with this vendor. + */ + .callback = dmi_low_memory_corruption, + .ident = "AMI BIOS", + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), + }, + }, #endif {} }; @@ -706,6 +732,12 @@ void __init setup_arch(char **cmdline_p) #endif saved_video_mode = boot_params.hdr.vid_mode; bootloader_type = boot_params.hdr.type_of_loader; + if ((bootloader_type >> 4) == 0xe) { + bootloader_type &= 0xf; + bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4; + } + bootloader_version = bootloader_type & 0xf; + bootloader_version |= boot_params.hdr.ext_loader_ver << 4; #ifdef CONFIG_BLK_DEV_RAM rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; @@ -854,14 +886,20 @@ void __init setup_arch(char **cmdline_p) max_low_pfn = max_pfn; high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; + max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; #endif #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION setup_bios_corruption_check(); #endif + printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", + max_pfn_mapped<<PAGE_SHIFT); + reserve_brk(); + init_gbpages(); + /* max_pfn_mapped is updated here */ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); max_pfn_mapped = max_low_pfn_mapped; @@ -997,24 +1035,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 /** - * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors - * - * Description: - * Perform any necessary interrupt initialisation prior to setting up - * the "ordinary" interrupt call gates. For legacy reasons, the ISA - * interrupts should be initialised here if the machine emulates a PC - * in any way. - **/ -void __init x86_quirk_pre_intr_init(void) -{ - if (x86_quirks->arch_pre_intr_init) { - if (x86_quirks->arch_pre_intr_init()) - return; - } - init_ISA_irqs(); -} - -/** * x86_quirk_intr_init - post gate setup interrupt initialisation * * Description: diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 8f0e13be36b..07d81916f21 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -124,7 +124,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, } /* - * Remap allocator + * Large page remap allocator * * This allocator uses PMD page as unit. A PMD page is allocated for * each cpu and each is remapped into vmalloc area using PMD mapping. @@ -137,105 +137,185 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, * better than only using 4k mappings while still being NUMA friendly. */ #ifdef CONFIG_NEED_MULTIPLE_NODES -static size_t pcpur_size __initdata; -static void **pcpur_ptrs __initdata; +struct pcpul_ent { + unsigned int cpu; + void *ptr; +}; + +static size_t pcpul_size; +static struct pcpul_ent *pcpul_map; +static struct vm_struct pcpul_vm; -static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) +static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) { size_t off = (size_t)pageno << PAGE_SHIFT; - if (off >= pcpur_size) + if (off >= pcpul_size) return NULL; - return virt_to_page(pcpur_ptrs[cpu] + off); + return virt_to_page(pcpul_map[cpu].ptr + off); } -static ssize_t __init setup_pcpu_remap(size_t static_size) +static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { - static struct vm_struct vm; - size_t ptrs_size, dyn_size; + size_t map_size, dyn_size; unsigned int cpu; + int i, j; ssize_t ret; - /* - * If large page isn't supported, there's no benefit in doing - * this. Also, on non-NUMA, embedding is better. - * - * NOTE: disabled for now. - */ - if (true || !cpu_has_pse || !pcpu_need_numa()) + if (!chosen) { + size_t vm_size = VMALLOC_END - VMALLOC_START; + size_t tot_size = nr_cpu_ids * PMD_SIZE; + + /* on non-NUMA, embedding is better */ + if (!pcpu_need_numa()) + return -EINVAL; + + /* don't consume more than 20% of vmalloc area */ + if (tot_size > vm_size / 5) { + pr_info("PERCPU: too large chunk size %zuMB for " + "large page remap\n", tot_size >> 20); + return -EINVAL; + } + } + + /* need PSE */ + if (!cpu_has_pse) { + pr_warning("PERCPU: lpage allocator requires PSE\n"); return -EINVAL; + } /* * Currently supports only single page. Supporting multiple * pages won't be too difficult if it ever becomes necessary. */ - pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE); - if (pcpur_size > PMD_SIZE) { + if (pcpul_size > PMD_SIZE) { pr_warning("PERCPU: static data is larger than large page, " "can't use large page\n"); return -EINVAL; } - dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; + dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; /* allocate pointer array and alloc large pages */ - ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); - pcpur_ptrs = alloc_bootmem(ptrs_size); + map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0])); + pcpul_map = alloc_bootmem(map_size); for_each_possible_cpu(cpu) { - pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); - if (!pcpur_ptrs[cpu]) + pcpul_map[cpu].cpu = cpu; + pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, + PMD_SIZE); + if (!pcpul_map[cpu].ptr) { + pr_warning("PERCPU: failed to allocate large page " + "for cpu%u\n", cpu); goto enomem; + } /* - * Only use pcpur_size bytes and give back the rest. + * Only use pcpul_size bytes and give back the rest. * * Ingo: The 2MB up-rounding bootmem is needed to make * sure the partial 2MB page is still fully RAM - it's * not well-specified to have a PAT-incompatible area * (unmapped RAM, device memory, etc.) in that hole. */ - free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), - PMD_SIZE - pcpur_size); + free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size), + PMD_SIZE - pcpul_size); - memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); + memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size); } /* allocate address and map */ - vm.flags = VM_ALLOC; - vm.size = num_possible_cpus() * PMD_SIZE; - vm_area_register_early(&vm, PMD_SIZE); + pcpul_vm.flags = VM_ALLOC; + pcpul_vm.size = nr_cpu_ids * PMD_SIZE; + vm_area_register_early(&pcpul_vm, PMD_SIZE); for_each_possible_cpu(cpu) { - pmd_t *pmd; + pmd_t *pmd, pmd_v; - pmd = populate_extra_pmd((unsigned long)vm.addr - + cpu * PMD_SIZE); - set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), - PAGE_KERNEL_LARGE)); + pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr + + cpu * PMD_SIZE); + pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)), + PAGE_KERNEL_LARGE); + set_pmd(pmd, pmd_v); } /* we're ready, commit */ pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", vm.addr, static_size); + "%zu bytes\n", pcpul_vm.addr, static_size); - ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, + ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - PMD_SIZE, vm.addr, NULL); - goto out_free_ar; + PMD_SIZE, pcpul_vm.addr, NULL); + + /* sort pcpul_map array for pcpu_lpage_remapped() */ + for (i = 0; i < nr_cpu_ids - 1; i++) + for (j = i + 1; j < nr_cpu_ids; j++) + if (pcpul_map[i].ptr > pcpul_map[j].ptr) { + struct pcpul_ent tmp = pcpul_map[i]; + pcpul_map[i] = pcpul_map[j]; + pcpul_map[j] = tmp; + } + + return ret; enomem: for_each_possible_cpu(cpu) - if (pcpur_ptrs[cpu]) - free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); - ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpur_ptrs), ptrs_size); - return ret; + if (pcpul_map[cpu].ptr) + free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); + free_bootmem(__pa(pcpul_map), map_size); + return -ENOMEM; +} + +/** + * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area + * @kaddr: the kernel address in question + * + * Determine whether @kaddr falls in the pcpul recycled area. This is + * used by pageattr to detect VM aliases and break up the pcpu PMD + * mapping such that the same physical page is not mapped under + * different attributes. + * + * The recycled area is always at the tail of a partially used PMD + * page. + * + * RETURNS: + * Address of corresponding remapped pcpu address if match is found; + * otherwise, NULL. + */ +void *pcpu_lpage_remapped(void *kaddr) +{ + void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); + unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; + int left = 0, right = nr_cpu_ids - 1; + int pos; + + /* pcpul in use at all? */ + if (!pcpul_map) + return NULL; + + /* okay, perform binary search */ + while (left <= right) { + pos = (left + right) / 2; + + if (pcpul_map[pos].ptr < pmd_addr) + left = pos + 1; + else if (pcpul_map[pos].ptr > pmd_addr) + right = pos - 1; + else { + /* it shouldn't be in the area for the first chunk */ + WARN_ON(offset < pcpul_size); + + return pcpul_vm.addr + + pcpul_map[pos].cpu * PMD_SIZE + offset; + } + } + + return NULL; } #else -static ssize_t __init setup_pcpu_remap(size_t static_size) +static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { return -EINVAL; } @@ -249,7 +329,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) * mapping so that it can use PMD mapping without additional TLB * pressure. */ -static ssize_t __init setup_pcpu_embed(size_t static_size) +static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; @@ -258,7 +338,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) * this. Also, embedding allocation doesn't play well with * NUMA. */ - if (!cpu_has_pse || pcpu_need_numa()) + if (!chosen && (!cpu_has_pse || pcpu_need_numa())) return -EINVAL; return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, @@ -297,7 +377,7 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) pcpu4k_nr_static_pages = PFN_UP(static_size); /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() + pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids * sizeof(pcpu4k_pages[0])); pcpu4k_pages = alloc_bootmem(pages_size); @@ -308,8 +388,11 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) void *ptr; ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); - if (!ptr) + if (!ptr) { + pr_warning("PERCPU: failed to allocate " + "4k page for cpu%u\n", cpu); goto enomem; + } memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); pcpu4k_pages[j++] = virt_to_page(ptr); @@ -333,6 +416,16 @@ out_free_ar: return ret; } +/* for explicit first chunk allocator selection */ +static char pcpu_chosen_alloc[16] __initdata; + +static int __init percpu_alloc_setup(char *str) +{ + strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1); + return 0; +} +early_param("percpu_alloc", percpu_alloc_setup); + static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -346,11 +439,6 @@ static inline void setup_percpu_segment(int cpu) #endif } -/* - * Great future plan: - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. - * Always point %gs to its beginning - */ void __init setup_per_cpu_areas(void) { size_t static_size = __per_cpu_end - __per_cpu_start; @@ -367,9 +455,26 @@ void __init setup_per_cpu_areas(void) * of large page mappings. Please read comments on top of * each allocator for details. */ - ret = setup_pcpu_remap(static_size); - if (ret < 0) - ret = setup_pcpu_embed(static_size); + ret = -EINVAL; + if (strlen(pcpu_chosen_alloc)) { + if (strcmp(pcpu_chosen_alloc, "4k")) { + if (!strcmp(pcpu_chosen_alloc, "lpage")) + ret = setup_pcpu_lpage(static_size, true); + else if (!strcmp(pcpu_chosen_alloc, "embed")) + ret = setup_pcpu_embed(static_size, true); + else + pr_warning("PERCPU: unknown allocator %s " + "specified\n", pcpu_chosen_alloc); + if (ret < 0) + pr_warning("PERCPU: %s allocator failed (%zd), " + "falling back to 4k\n", + pcpu_chosen_alloc, ret); + } + } else { + ret = setup_pcpu_lpage(static_size, false); + if (ret < 0) + ret = setup_pcpu_embed(static_size, false); + } if (ret < 0) ret = setup_pcpu_4k(static_size); if (ret < 0) @@ -425,6 +530,14 @@ void __init setup_per_cpu_areas(void) early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; #endif +#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) + /* + * make sure boot cpu node_number is right, when boot cpu is on the + * node that doesn't have mem installed + */ + per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id); +#endif + /* Setup node to cpumask map */ setup_node_to_cpumask_map(); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 14425166b8e..4c578751e94 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -6,7 +6,6 @@ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen */ - #include <linux/sched.h> #include <linux/mm.h> #include <linux/smp.h> @@ -25,11 +24,11 @@ #include <asm/ucontext.h> #include <asm/i387.h> #include <asm/vdso.h> +#include <asm/mce.h> #ifdef CONFIG_X86_64 #include <asm/proto.h> #include <asm/ia32_unistd.h> -#include <asm/mce.h> #endif /* CONFIG_X86_64 */ #include <asm/syscall.h> @@ -857,10 +856,10 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { -#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) +#ifdef CONFIG_X86_NEW_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) - mce_notify_user(); + mce_notify_process(); #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ /* deal with pending signal delivery */ diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 13f33ea8cca..ec1de97600e 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -150,14 +150,40 @@ void native_send_call_func_ipi(const struct cpumask *mask) * this function calls the 'stop' function on all other CPUs in the system. */ +asmlinkage void smp_reboot_interrupt(void) +{ + ack_APIC_irq(); + irq_enter(); + stop_this_cpu(NULL); + irq_exit(); +} + static void native_smp_send_stop(void) { unsigned long flags; + unsigned long wait; if (reboot_force) return; - smp_call_function(stop_this_cpu, NULL, 0); + /* + * Use an own vector here because smp_call_function + * does lots of things not suitable in a panic situation. + * On most systems we could also use an NMI here, + * but there are a few systems around where NMI + * is problematic so stay with an non NMI for now + * (this implies we cannot stop CPUs spinning with irq off + * currently) + */ + if (num_online_cpus() > 1) { + apic->send_IPI_allbutself(REBOOT_VECTOR); + + /* Don't wait longer than a second */ + wait = USEC_PER_SEC; + while (num_online_cpus() > 1 && wait--) + udelay(1); + } + local_irq_save(flags); disable_local_APIC(); local_irq_restore(flags); @@ -172,6 +198,9 @@ void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); inc_irq_stat(irq_resched_count); + /* + * KVM uses this interrupt to force a cpu out of guest mode + */ } void smp_call_function_interrupt(struct pt_regs *regs) @@ -193,19 +222,19 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) } struct smp_ops smp_ops = { - .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, - .smp_prepare_cpus = native_smp_prepare_cpus, - .smp_cpus_done = native_smp_cpus_done, + .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, + .smp_prepare_cpus = native_smp_prepare_cpus, + .smp_cpus_done = native_smp_cpus_done, - .smp_send_stop = native_smp_send_stop, - .smp_send_reschedule = native_smp_send_reschedule, + .smp_send_stop = native_smp_send_stop, + .smp_send_reschedule = native_smp_send_reschedule, - .cpu_up = native_cpu_up, - .cpu_die = native_cpu_die, - .cpu_disable = native_cpu_disable, - .play_dead = native_play_dead, + .cpu_up = native_cpu_up, + .cpu_die = native_cpu_die, + .cpu_disable = native_cpu_disable, + .play_dead = native_play_dead, - .send_call_func_ipi = native_send_call_func_ipi, + .send_call_func_ipi = native_send_call_func_ipi, .send_call_func_single_ipi = native_send_call_func_single_ipi, }; EXPORT_SYMBOL_GPL(smp_ops); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 58d24ef917d..2fecda69ee6 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -504,7 +504,7 @@ void __inquire_remote_apic(int apicid) * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this * won't ... remember to clear down the APIC, etc later. */ -int __devinit +int __cpuinit wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; @@ -538,7 +538,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) return (send_status | accept_status); } -int __devinit +static int __cpuinit wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; @@ -822,10 +822,12 @@ do_rest: /* mark "stuck" area as not stuck */ *((volatile unsigned long *)trampoline_base) = 0; - /* - * Cleanup possible dangling ends... - */ - smpboot_restore_warm_reset_vector(); + if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { + /* + * Cleanup possible dangling ends... + */ + smpboot_restore_warm_reset_vector(); + } return boot_error; } @@ -871,7 +873,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) err = do_boot_cpu(apicid, cpu); - zap_low_mappings(); + zap_low_mappings(false); low_mappings = 0; #else err = do_boot_cpu(apicid, cpu); @@ -990,10 +992,12 @@ static int __init smp_sanity_check(unsigned max_cpus) */ if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) { - printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", - boot_cpu_physical_apicid); - printk(KERN_ERR "... forcing use of dummy APIC emulation." + if (!disable_apic) { + pr_err("BIOS bug, local APIC #%d not detected!...\n", + boot_cpu_physical_apicid); + pr_err("... forcing use of dummy APIC emulation." "(tell your hw vendor)\n"); + } smpboot_clear_io_apic(); arch_disable_smp_support(); return -1; diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index f7bddc2e37d..c3eb207181f 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -20,7 +20,7 @@ save_stack_warning_symbol(void *data, char *msg, unsigned long symbol) static int save_stack_stack(void *data, char *name) { - return -1; + return 0; } static void save_stack_address(void *data, unsigned long addr, int reliable) @@ -77,6 +77,13 @@ void save_stack_trace(struct stack_trace *trace) } EXPORT_SYMBOL_GPL(save_stack_trace); +void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp) +{ + dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace); + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} + void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index ff5c8736b49..d51321ddafd 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -334,3 +334,5 @@ ENTRY(sys_call_table) .long sys_inotify_init1 .long sys_preadv .long sys_pwritev + .long sys_rt_tgsigqueueinfo /* 335 */ + .long sys_perf_counter_open diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 8c7b03b0cfc..77b9689f8ed 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -711,25 +711,31 @@ uv_activation_descriptor_init(int node, int pnode) unsigned long pa; unsigned long m; unsigned long n; - unsigned long mmr_image; struct bau_desc *adp; struct bau_desc *ad2; - adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node); + /* + * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) + * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade + */ + adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* + UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); BUG_ON(!adp); pa = uv_gpa(adp); /* need the real nasid*/ n = pa >> uv_nshift; m = pa & uv_mmask; - mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE); - if (mmr_image) { - uv_write_global_mmr64(pnode, (unsigned long) - UVH_LB_BAU_SB_DESCRIPTOR_BASE, - (n << UV_DESC_BASE_PNODE_SHIFT | m)); - } + uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, + (n << UV_DESC_BASE_PNODE_SHIFT | m)); - for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) { + /* + * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each + * cpu even though we only use the first one; one descriptor can + * describe a broadcast to 256 nodes. + */ + for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); + i++, ad2++) { memset(ad2, 0, sizeof(struct bau_desc)); ad2->header.sw_ack_flag = 1; /* @@ -738,6 +744,7 @@ uv_activation_descriptor_init(int node, int pnode) * note that base_dest_nodeid is actually a nasid. */ ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1; + ad2->header.dest_subnodeid = 0x10; /* the LB */ ad2->header.command = UV_NET_ENDPOINT_INTD; ad2->header.int_both = 1; /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a1d288327ff..5204332f475 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -45,6 +45,7 @@ #include <linux/edac.h> #endif +#include <asm/kmemcheck.h> #include <asm/stacktrace.h> #include <asm/processor.h> #include <asm/debugreg.h> @@ -53,6 +54,7 @@ #include <asm/traps.h> #include <asm/desc.h> #include <asm/i387.h> +#include <asm/mce.h> #include <asm/mach_traps.h> @@ -64,8 +66,6 @@ #include <asm/setup.h> #include <asm/traps.h> -#include "cpu/mcheck/mce.h" - asmlinkage int system_call(void); /* Do we ignore FPU interrupts ? */ @@ -346,6 +346,9 @@ io_check_error(unsigned char reason, struct pt_regs *regs) printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); show_registers(regs); + if (panic_on_io_nmi) + panic("NMI IOCK error: Not continuing"); + /* Re-enable the IOCK line, wait for a few seconds */ reason = (reason & 0xf) | 8; outb(reason, 0x61); @@ -534,6 +537,10 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) get_debugreg(condition, 6); + /* Catch kmemcheck conditions first of all! */ + if (condition & DR_STEP && kmemcheck_trap(regs)) + return; + /* * The processor cleared BTF, so don't mark that we need it set. */ @@ -798,15 +805,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) return new_kesp; } -#else +#endif + asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) { } -asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) +asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) { } -#endif /* * 'math_state_restore()' saves the current math information in the @@ -839,9 +846,6 @@ asmlinkage void math_state_restore(void) } clts(); /* Allow maths ops (or we recurse) */ -#ifdef CONFIG_X86_32 - restore_fpu(tsk); -#else /* * Paranoid restore. send a SIGSEGV if we fail to restore the state. */ @@ -850,7 +854,7 @@ asmlinkage void math_state_restore(void) force_sig(SIGSEGV, tsk); return; } -#endif + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ tsk->fpu_counter++; } @@ -945,8 +949,13 @@ void __init trap_init(void) #endif set_intr_gate(19, &simd_coprocessor_error); + /* Reserve all the builtin and the syscall vector: */ + for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) + set_bit(i, used_vectors); + #ifdef CONFIG_IA32_EMULATION set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); + set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif #ifdef CONFIG_X86_32 @@ -963,17 +972,9 @@ void __init trap_init(void) } set_system_trap_gate(SYSCALL_VECTOR, &system_call); -#endif - - /* Reserve all the builtin and the syscall vector: */ - for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) - set_bit(i, used_vectors); - -#ifdef CONFIG_X86_64 - set_bit(IA32_SYSCALL_VECTOR, used_vectors); -#else set_bit(SYSCALL_VECTOR, used_vectors); #endif + /* * Should be a barrier for any external CPU state: */ diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index d57de05dc43..71f4368b357 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -9,6 +9,7 @@ #include <linux/delay.h> #include <linux/clocksource.h> #include <linux/percpu.h> +#include <linux/timex.h> #include <asm/hpet.h> #include <asm/timer.h> @@ -274,15 +275,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) * use the TSC value at the transitions to calculate a pretty * good value for the TSC frequencty. */ +static inline int pit_verify_msb(unsigned char val) +{ + /* Ignore LSB */ + inb(0x42); + return inb(0x42) == val; +} + static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) { int count; u64 tsc = 0; for (count = 0; count < 50000; count++) { - /* Ignore LSB */ - inb(0x42); - if (inb(0x42) != val) + if (!pit_verify_msb(val)) break; tsc = get_cycles(); } @@ -335,8 +341,7 @@ static unsigned long quick_pit_calibrate(void) * to do that is to just read back the 16-bit counter * once from the PIT. */ - inb(0x42); - inb(0x42); + pit_verify_msb(0); if (pit_expect_msb(0xff, &tsc, &d1)) { for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) { @@ -347,8 +352,19 @@ static unsigned long quick_pit_calibrate(void) * Iterate until the error is less than 500 ppm */ delta -= tsc; - if (d1+d2 < delta >> 11) - goto success; + if (d1+d2 >= delta >> 11) + continue; + + /* + * Check the PIT one more time to verify that + * all TSC reads were stable wrt the PIT. + * + * This also guarantees serialization of the + * last cycle read ('d2') in pit_expect_msb. + */ + if (!pit_verify_msb(0xfe - i)) + break; + goto success; } } printk("Fast TSC calibration failed\n"); @@ -384,13 +400,13 @@ unsigned long native_calibrate_tsc(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate, tsc_khz; + unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz; int hpet = is_hpet_enabled(), i, loopmin; - tsc_khz = get_hypervisor_tsc_freq(); - if (tsc_khz) { + hv_tsc_khz = get_hypervisor_tsc_freq(); + if (hv_tsc_khz) { printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); - return tsc_khz; + return hv_tsc_khz; } local_irq_save(flags); @@ -589,22 +605,26 @@ EXPORT_SYMBOL(recalibrate_cpu_khz); */ DEFINE_PER_CPU(unsigned long, cyc2ns); +DEFINE_PER_CPU(unsigned long long, cyc2ns_offset); static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) { - unsigned long long tsc_now, ns_now; + unsigned long long tsc_now, ns_now, *offset; unsigned long flags, *scale; local_irq_save(flags); sched_clock_idle_sleep_event(); scale = &per_cpu(cyc2ns, cpu); + offset = &per_cpu(cyc2ns_offset, cpu); rdtscll(tsc_now); ns_now = __cycles_2_ns(tsc_now); - if (cpu_khz) + if (cpu_khz) { *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; + *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR); + } sched_clock_idle_wakeup_event(0); local_irq_restore(flags); @@ -631,17 +651,15 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; - unsigned long *lpj, dummy; + unsigned long *lpj; if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC)) return 0; - lpj = &dummy; - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + lpj = &boot_cpu_data.loops_per_jiffy; #ifdef CONFIG_SMP + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) lpj = &cpu_data(freq->cpu).loops_per_jiffy; -#else - lpj = &boot_cpu_data.loops_per_jiffy; #endif if (!ref_freq) { @@ -710,7 +728,16 @@ static cycle_t read_tsc(struct clocksource *cs) #ifdef CONFIG_X86_64 static cycle_t __vsyscall_fn vread_tsc(void) { - cycle_t ret = (cycle_t)vget_cycles(); + cycle_t ret; + + /* + * Surround the RDTSC by barriers, to make sure it's not + * speculated to outside the seqlock critical section and + * does not cause time warps: + */ + rdtsc_barrier(); + ret = (cycle_t)vget_cycles(); + rdtsc_barrier(); return ret >= __vsyscall_gtod_data.clock.cycle_last ? ret : __vsyscall_gtod_data.clock.cycle_last; diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index bf36328f6ef..027b5b49899 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -34,6 +34,7 @@ static __cpuinitdata atomic_t stop_count; * of a critical section, to be able to prove TSC time-warps: */ static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; + static __cpuinitdata cycles_t last_tsc; static __cpuinitdata cycles_t max_warp; static __cpuinitdata int nr_warps; @@ -113,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu) return; if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { - printk(KERN_INFO - "Skipping synchronization checks as TSC is reliable.\n"); + pr_info("Skipping synchronization checks as TSC is reliable.\n"); return; } - printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", - smp_processor_id(), cpu); + pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:", + smp_processor_id(), cpu); /* * Reset it - in case this is a second bootup: @@ -143,8 +143,8 @@ void __cpuinit check_tsc_sync_source(int cpu) if (nr_warps) { printk("\n"); - printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," - " turning off TSC clock.\n", max_warp); + pr_warning("Measured %Ld cycles TSC warp between CPUs, " + "turning off TSC clock.\n", max_warp); mark_tsc_unstable("check_tsc_sync_source failed"); } else { printk(" passed.\n"); @@ -195,5 +195,3 @@ void __cpuinit check_tsc_sync_target(void) while (atomic_read(&stop_count) != cpus) cpu_relax(); } -#undef NR_LOOPS - diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index d7ac84e7fc1..9c4e6253905 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -287,10 +287,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk info->regs.pt.ds = 0; info->regs.pt.es = 0; info->regs.pt.fs = 0; - -/* we are clearing gs later just before "jmp resume_userspace", - * because it is not saved/restored. - */ +#ifndef CONFIG_X86_32_LAZY_GS + info->regs.pt.gs = 0; +#endif /* * The flags register is also special: we cannot trust that the user @@ -318,9 +317,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk } /* - * Save old state, set default return value (%ax) to 0 + * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL) */ - info->regs32->ax = 0; + info->regs32->ax = VM86_SIGNAL; tsk->thread.saved_sp0 = tsk->thread.sp0; tsk->thread.saved_fs = info->regs32->fs; tsk->thread.saved_gs = get_user_gs(info->regs32); @@ -343,7 +342,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk __asm__ __volatile__( "movl %0,%%esp\n\t" "movl %1,%%ebp\n\t" +#ifdef CONFIG_X86_32_LAZY_GS "mov %2, %%gs\n\t" +#endif "jmp resume_userspace" : /* no outputs */ :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 95deb9f2211..95a7289e4b0 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -441,7 +441,7 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, ap.ds = __USER_DS; ap.es = __USER_DS; ap.fs = __KERNEL_PERCPU; - ap.gs = 0; + ap.gs = __KERNEL_STACK_CANARY; ap.eflags = 0; @@ -462,22 +462,28 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, } #endif -static void vmi_enter_lazy_cpu(void) +static void vmi_start_context_switch(struct task_struct *prev) { - paravirt_enter_lazy_cpu(); + paravirt_start_context_switch(prev); vmi_ops.set_lazy_mode(2); } +static void vmi_end_context_switch(struct task_struct *next) +{ + vmi_ops.set_lazy_mode(0); + paravirt_end_context_switch(next); +} + static void vmi_enter_lazy_mmu(void) { paravirt_enter_lazy_mmu(); vmi_ops.set_lazy_mode(1); } -static void vmi_leave_lazy(void) +static void vmi_leave_lazy_mmu(void) { - paravirt_leave_lazy(paravirt_get_lazy_mode()); vmi_ops.set_lazy_mode(0); + paravirt_leave_lazy_mmu(); } static inline int __init check_vmi_rom(struct vrom_header *rom) @@ -711,14 +717,14 @@ static inline int __init activate_vmi(void) para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); para_fill(pv_cpu_ops.io_delay, IODelay); - para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, + para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch, set_lazy_mode, SetLazyMode); - para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy, + para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch, set_lazy_mode, SetLazyMode); para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, set_lazy_mode, SetLazyMode); - para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy, + para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu, set_lazy_mode, SetLazyMode); /* user and kernel flush are just handled with different flags to FlushTLB */ diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 849ee611f01..9fc178255c0 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -1,5 +1,394 @@ +/* + * ld script for the x86 kernel + * + * Historic 32-bit version written by Martin Mares <mj@atrey.karlin.mff.cuni.cz> + * + * Modernisation, unification and other changes and fixes: + * Copyright (C) 2007-2009 Sam Ravnborg <sam@ravnborg.org> + * + * + * Don't define absolute symbols until and unless you know that symbol + * value is should remain constant even if kernel image is relocated + * at run time. Absolute symbols are not relocated. If symbol value should + * change if kernel is relocated, make the symbol section relative and + * put it inside the section definition. + */ + #ifdef CONFIG_X86_32 -# include "vmlinux_32.lds.S" +#define LOAD_OFFSET __PAGE_OFFSET #else -# include "vmlinux_64.lds.S" +#define LOAD_OFFSET __START_KERNEL_map #endif + +#include <asm-generic/vmlinux.lds.h> +#include <asm/asm-offsets.h> +#include <asm/thread_info.h> +#include <asm/page_types.h> +#include <asm/cache.h> +#include <asm/boot.h> + +#undef i386 /* in case the preprocessor is a 32bit one */ + +OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) + +#ifdef CONFIG_X86_32 +OUTPUT_ARCH(i386) +ENTRY(phys_startup_32) +jiffies = jiffies_64; +#else +OUTPUT_ARCH(i386:x86-64) +ENTRY(phys_startup_64) +jiffies_64 = jiffies; +#endif + +PHDRS { + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ +#ifdef CONFIG_X86_64 + user PT_LOAD FLAGS(7); /* RWE */ +#ifdef CONFIG_SMP + percpu PT_LOAD FLAGS(7); /* RWE */ +#endif + init PT_LOAD FLAGS(7); /* RWE */ +#endif + note PT_NOTE FLAGS(0); /* ___ */ +} + +SECTIONS +{ +#ifdef CONFIG_X86_32 + . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; + phys_startup_32 = startup_32 - LOAD_OFFSET; +#else + . = __START_KERNEL; + phys_startup_64 = startup_64 - LOAD_OFFSET; +#endif + + /* Text and read-only data */ + + /* bootstrapping code */ + .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { + _text = .; + *(.text.head) + } :text = 0x9090 + + /* The rest of the text */ + .text : AT(ADDR(.text) - LOAD_OFFSET) { +#ifdef CONFIG_X86_32 + /* not really needed, already page aligned */ + . = ALIGN(PAGE_SIZE); + *(.text.page_aligned) +#endif + . = ALIGN(8); + _stext = .; + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + IRQENTRY_TEXT + *(.fixup) + *(.gnu.warning) + /* End of text section */ + _etext = .; + } :text = 0x9090 + + NOTES :text :note + + /* Exception table */ + . = ALIGN(16); + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; + } :text = 0x9090 + + RO_DATA(PAGE_SIZE) + + /* Data */ + .data : AT(ADDR(.data) - LOAD_OFFSET) { + /* Start of data section */ + _sdata = .; + + /* init_task */ + INIT_TASK_DATA(THREAD_SIZE) + +#ifdef CONFIG_X86_32 + /* 32 bit has nosave before _edata */ + NOSAVE_DATA +#endif + + PAGE_ALIGNED_DATA(PAGE_SIZE) + *(.data.idt) + + CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) + + DATA_DATA + CONSTRUCTORS + + /* rarely changed data like cpu maps */ + READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES) + + /* End of data section */ + _edata = .; + } :data + +#ifdef CONFIG_X86_64 + +#define VSYSCALL_ADDR (-10*1024*1024) +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data) + SIZEOF(.data) + \ + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)) +#define VSYSCALL_VIRT_ADDR ((ADDR(.data) + SIZEOF(.data) + \ + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)) + +#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) +#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) + +#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) +#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) + + . = VSYSCALL_ADDR; + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { + *(.vsyscall_0) + } :user + + __vsyscall_0 = VSYSCALL_VIRT_ADDR; + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { + *(.vsyscall_fn) + } + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { + *(.vsyscall_gtod_data) + } + + vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) { + *(.vsyscall_clock) + } + vsyscall_clock = VVIRT(.vsyscall_clock); + + + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { + *(.vsyscall_1) + } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { + *(.vsyscall_2) + } + + .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { + *(.vgetcpu_mode) + } + vgetcpu_mode = VVIRT(.vgetcpu_mode); + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .jiffies : AT(VLOAD(.jiffies)) { + *(.jiffies) + } + jiffies = VVIRT(.jiffies); + + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { + *(.vsyscall_3) + } + + . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; + +#undef VSYSCALL_ADDR +#undef VSYSCALL_PHYS_ADDR +#undef VSYSCALL_VIRT_ADDR +#undef VLOAD_OFFSET +#undef VLOAD +#undef VVIRT_OFFSET +#undef VVIRT + +#endif /* CONFIG_X86_64 */ + + /* Init code and data - will be freed after init */ + . = ALIGN(PAGE_SIZE); + .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) { + __init_begin = .; /* paired with __init_end */ + } + +#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) + /* + * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the + * output PHDR, so the next output section - .init.text - should + * start another segment - init. + */ + PERCPU_VADDR(0, :percpu) +#endif + + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + _sinittext = .; + INIT_TEXT + _einittext = .; + } +#ifdef CONFIG_X86_64 + :init +#endif + + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { + INIT_DATA + } + + . = ALIGN(16); + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { + __setup_start = .; + *(.init.setup) + __setup_end = .; + } + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { + __initcall_start = .; + INITCALLS + __initcall_end = .; + } + + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + __con_initcall_start = .; + *(.con_initcall.init) + __con_initcall_end = .; + } + + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + __x86_cpu_dev_start = .; + *(.x86_cpu_dev.init) + __x86_cpu_dev_end = .; + } + + SECURITY_INIT + + . = ALIGN(8); + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + __parainstructions = .; + *(.parainstructions) + __parainstructions_end = .; + } + + . = ALIGN(8); + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } + + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } + + /* + * .exit.text is discard at runtime, not link time, to deal with + * references from .altinstructions and .eh_frame + */ + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { + EXIT_TEXT + } + + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { + EXIT_DATA + } + +#ifdef CONFIG_BLK_DEV_INITRD + . = ALIGN(PAGE_SIZE); + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { + __initramfs_start = .; + *(.init.ramfs) + __initramfs_end = .; + } +#endif + +#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) + PERCPU(PAGE_SIZE) +#endif + + . = ALIGN(PAGE_SIZE); + + /* freed after init ends here */ + .init.end : AT(ADDR(.init.end) - LOAD_OFFSET) { + __init_end = .; + } + + /* + * smp_locks might be freed after init + * start/end must be page aligned + */ + . = ALIGN(PAGE_SIZE); + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + __smp_locks = .; + *(.smp_locks) + __smp_locks_end = .; + . = ALIGN(PAGE_SIZE); + } + +#ifdef CONFIG_X86_64 + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + NOSAVE_DATA + } +#endif + + /* BSS */ + . = ALIGN(PAGE_SIZE); + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + __bss_start = .; + *(.bss.page_aligned) + *(.bss) + . = ALIGN(4); + __bss_stop = .; + } + + . = ALIGN(PAGE_SIZE); + .brk : AT(ADDR(.brk) - LOAD_OFFSET) { + __brk_base = .; + . += 64 * 1024; /* 64k alignment slop space */ + *(.brk_reservation) /* areas brk users have reserved */ + __brk_limit = .; + } + + .end : AT(ADDR(.end) - LOAD_OFFSET) { + _end = .; + } + + /* Sections to be discarded */ + /DISCARD/ : { + *(.exitcall.exit) + *(.eh_frame) + *(.discard) + } + + STABS_DEBUG + DWARF_DEBUG +} + + +#ifdef CONFIG_X86_32 +. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE"); +#else +/* + * Per-cpu symbols which need to be offset from __per_cpu_load + * for the boot processor. + */ +#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load +INIT_PER_CPU(gdt_page); +INIT_PER_CPU(irq_stack_union); + +/* + * Build-time check on the image size: + */ +. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE"); + +#ifdef CONFIG_SMP +. = ASSERT((per_cpu__irq_stack_union == 0), + "irq_stack_union is not at start of per-cpu area"); +#endif + +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_KEXEC +#include <asm/kexec.h> + +. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big"); +#endif + diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S deleted file mode 100644 index 62ad500d55f..00000000000 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ /dev/null @@ -1,229 +0,0 @@ -/* ld script to make i386 Linux kernel - * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; - * - * Don't define absolute symbols until and unless you know that symbol - * value is should remain constant even if kernel image is relocated - * at run time. Absolute symbols are not relocated. If symbol value should - * change if kernel is relocated, make the symbol section relative and - * put it inside the section definition. - */ - -#define LOAD_OFFSET __PAGE_OFFSET - -#include <asm-generic/vmlinux.lds.h> -#include <asm/thread_info.h> -#include <asm/page_types.h> -#include <asm/cache.h> -#include <asm/boot.h> - -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") -OUTPUT_ARCH(i386) -ENTRY(phys_startup_32) -jiffies = jiffies_64; - -PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(0); /* ___ */ -} -SECTIONS -{ - . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; - phys_startup_32 = startup_32 - LOAD_OFFSET; - - .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { - _text = .; /* Text and read-only data */ - *(.text.head) - } :text = 0x9090 - - /* read-only */ - .text : AT(ADDR(.text) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); /* not really needed, already page aligned */ - *(.text.page_aligned) - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - IRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) - _etext = .; /* End of text section */ - } :text = 0x9090 - - NOTES :text :note - - . = ALIGN(16); /* Exception table */ - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 - - RODATA - - /* writeable */ - . = ALIGN(PAGE_SIZE); - .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ - DATA_DATA - CONSTRUCTORS - } :data - - . = ALIGN(PAGE_SIZE); - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } - - . = ALIGN(PAGE_SIZE); - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - *(.data.page_aligned) - *(.data.idt) - } - - . = ALIGN(32); - .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - *(.data.cacheline_aligned) - } - - /* rarely changed data like cpu maps */ - . = ALIGN(32); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - _edata = .; /* End of data section */ - } - - . = ALIGN(THREAD_SIZE); /* init_task */ - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - *(.data.init_task) - } - - /* might get freed after init */ - . = ALIGN(PAGE_SIZE); - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - } - /* will be freed after init - * Following ALIGN() is required to make sure no other data falls on the - * same page where __smp_alt_end is pointing as that page might be freed - * after boot. Always make sure that ALIGN() directive is present after - * the section which contains __smp_alt_end. - */ - . = ALIGN(PAGE_SIZE); - - /* will be freed after init */ - . = ALIGN(PAGE_SIZE); /* Init code and data */ - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - __init_begin = .; - _sinittext = .; - INIT_TEXT - _einittext = .; - } - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - INIT_DATA - } - . = ALIGN(16); - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { - __x86_cpu_dev_start = .; - *(.x86_cpu_dev.init) - __x86_cpu_dev_end = .; - } - SECURITY_INIT - . = ALIGN(4); - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { - __alt_instructions = .; - *(.altinstructions) - __alt_instructions_end = .; - } - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - . = ALIGN(4); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } - /* .exit.text is discard at runtime, not link time, to deal with references - from .altinstructions and .eh_frame */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { - EXIT_TEXT - } - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { - EXIT_DATA - } -#if defined(CONFIG_BLK_DEV_INITRD) - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } -#endif - PERCPU(PAGE_SIZE) - . = ALIGN(PAGE_SIZE); - /* freed after init ends here */ - - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - __init_end = .; - __bss_start = .; /* BSS */ - *(.bss.page_aligned) - *(.bss) - . = ALIGN(4); - __bss_stop = .; - } - - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __brk_base = . ; - . += 64 * 1024 ; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ - __brk_limit = . ; - } - - .end : AT(ADDR(.end) - LOAD_OFFSET) { - _end = . ; - } - - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) - } - - STABS_DEBUG - - DWARF_DEBUG -} - -/* - * Build-time check on the image size: - */ -ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE") - -#ifdef CONFIG_KEXEC -/* Link time checks */ -#include <asm/kexec.h> - -ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big") -#endif diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S deleted file mode 100644 index c8742507b03..00000000000 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ /dev/null @@ -1,298 +0,0 @@ -/* ld script to make x86-64 Linux kernel - * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; - */ - -#define LOAD_OFFSET __START_KERNEL_map - -#include <asm-generic/vmlinux.lds.h> -#include <asm/asm-offsets.h> -#include <asm/page_types.h> - -#undef i386 /* in case the preprocessor is a 32bit one */ - -OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") -OUTPUT_ARCH(i386:x86-64) -ENTRY(phys_startup_64) -jiffies_64 = jiffies; -PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ - user PT_LOAD FLAGS(7); /* RWE */ - data.init PT_LOAD FLAGS(7); /* RWE */ -#ifdef CONFIG_SMP - percpu PT_LOAD FLAGS(7); /* RWE */ -#endif - data.init2 PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(0); /* ___ */ -} -SECTIONS -{ - . = __START_KERNEL; - phys_startup_64 = startup_64 - LOAD_OFFSET; - .text : AT(ADDR(.text) - LOAD_OFFSET) { - _text = .; /* Text and read-only data */ - /* First the code that has to be first for bootstrapping */ - *(.text.head) - _stext = .; - /* Then the rest */ - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - IRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) - _etext = .; /* End of text section */ - } :text = 0x9090 - - NOTES :text :note - - . = ALIGN(16); /* Exception table */ - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 - - RODATA - - . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ - /* Data */ - .data : AT(ADDR(.data) - LOAD_OFFSET) { - DATA_DATA - CONSTRUCTORS - _edata = .; /* End of data section */ - } :data - - - .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - *(.data.cacheline_aligned) - } - . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - } - -#define VSYSCALL_ADDR (-10*1024*1024) -#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) -#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) - -#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) -#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) - -#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) -#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) - - . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user - __vsyscall_0 = VSYSCALL_VIRT_ADDR; - - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) - { *(.vsyscall_gtod_data) } - vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); - .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) - { *(.vsyscall_clock) } - vsyscall_clock = VVIRT(.vsyscall_clock); - - - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) - { *(.vsyscall_1) } - .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) - { *(.vsyscall_2) } - - .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } - vgetcpu_mode = VVIRT(.vgetcpu_mode); - - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } - jiffies = VVIRT(.jiffies); - - .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) - { *(.vsyscall_3) } - - . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; - -#undef VSYSCALL_ADDR -#undef VSYSCALL_PHYS_ADDR -#undef VSYSCALL_VIRT_ADDR -#undef VLOAD_OFFSET -#undef VLOAD -#undef VVIRT_OFFSET -#undef VVIRT - - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - . = ALIGN(THREAD_SIZE); /* init_task */ - *(.data.init_task) - }:data.init - - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - *(.data.page_aligned) - } - - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - /* might get freed after init */ - . = ALIGN(PAGE_SIZE); - __smp_alt_begin = .; - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - . = ALIGN(PAGE_SIZE); - __smp_alt_end = .; - } - - . = ALIGN(PAGE_SIZE); /* Init code and data */ - __init_begin = .; /* paired with __init_end */ - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - _sinittext = .; - INIT_TEXT - _einittext = .; - } - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - __initdata_begin = .; - INIT_DATA - __initdata_end = .; - } - - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - . = ALIGN(16); - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { - __x86_cpu_dev_start = .; - *(.x86_cpu_dev.init) - __x86_cpu_dev_end = .; - } - SECURITY_INIT - - . = ALIGN(8); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } - - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { - . = ALIGN(8); - __alt_instructions = .; - *(.altinstructions) - __alt_instructions_end = .; - } - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - /* .exit.text is discard at runtime, not link time, to deal with references - from .altinstructions and .eh_frame */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { - EXIT_TEXT - } - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { - EXIT_DATA - } - -#ifdef CONFIG_BLK_DEV_INITRD - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } -#endif - -#ifdef CONFIG_SMP - /* - * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the - * output PHDR, so the next output section - __data_nosave - should - * start another section data.init2. Also, pda should be at the head of - * percpu area. Preallocate it and define the percpu offset symbol - * so that it can be accessed as a percpu variable. - */ - . = ALIGN(PAGE_SIZE); - PERCPU_VADDR(0, :percpu) -#else - PERCPU(PAGE_SIZE) -#endif - - . = ALIGN(PAGE_SIZE); - __init_end = .; - - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */ - - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __bss_start = .; /* BSS */ - *(.bss.page_aligned) - *(.bss) - __bss_stop = .; - } - - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __brk_base = . ; - . += 64 * 1024 ; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ - __brk_limit = . ; - } - - _end = . ; - - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.eh_frame) - *(.discard) - } - - STABS_DEBUG - - DWARF_DEBUG -} - - /* - * Per-cpu symbols which need to be offset from __per_cpu_load - * for the boot processor. - */ -#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load -INIT_PER_CPU(gdt_page); -INIT_PER_CPU(irq_stack_union); - -/* - * Build-time check on the image size: - */ -ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE") - -#ifdef CONFIG_SMP -ASSERT((per_cpu__irq_stack_union == 0), - "irq_stack_union is not at start of per-cpu area"); -#endif - -#ifdef CONFIG_KEXEC -#include <asm/kexec.h> - -ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big") -#endif diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 44153afc906..25ee06a80aa 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -132,15 +132,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) return; } - /* - * Surround the RDTSC by barriers, to make sure it's not - * speculated to outside the seqlock critical section and - * does not cause time warps: - */ - rdtsc_barrier(); now = vread(); - rdtsc_barrier(); - base = __vsyscall_gtod_data.clock.cycle_last; mask = __vsyscall_gtod_data.clock.mask; mult = __vsyscall_gtod_data.clock.mult; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index a58504ea78c..8600a09e0c6 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -50,6 +50,9 @@ config KVM_INTEL Provides support for KVM on Intel processors equipped with the VT extensions. + To compile this as a module, choose M here: the module + will be called kvm-intel. + config KVM_AMD tristate "KVM for AMD processors support" depends on KVM @@ -57,6 +60,9 @@ config KVM_AMD Provides support for KVM on AMD processors equipped with the AMD-V (SVM) extensions. + To compile this as a module, choose M here: the module + will be called kvm-amd. + config KVM_TRACE bool "KVM trace support" depends on KVM && SYSFS diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index d3ec292f00f..b43c4efafe8 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -14,7 +14,7 @@ endif EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ - i8254.o + i8254.o timer.o obj-$(CONFIG_KVM) += kvm.o kvm-intel-objs = vmx.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index c13bb92d315..21f68e00524 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -98,6 +98,40 @@ static int pit_get_gate(struct kvm *kvm, int channel) return kvm->arch.vpit->pit_state.channels[channel].gate; } +static s64 __kpit_elapsed(struct kvm *kvm) +{ + s64 elapsed; + ktime_t remaining; + struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; + + if (!ps->pit_timer.period) + return 0; + + /* + * The Counter does not stop when it reaches zero. In + * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to + * the highest count, either FFFF hex for binary counting + * or 9999 for BCD counting, and continues counting. + * Modes 2 and 3 are periodic; the Counter reloads + * itself with the initial count and continues counting + * from there. + */ + remaining = hrtimer_expires_remaining(&ps->pit_timer.timer); + elapsed = ps->pit_timer.period - ktime_to_ns(remaining); + elapsed = mod_64(elapsed, ps->pit_timer.period); + + return elapsed; +} + +static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c, + int channel) +{ + if (channel == 0) + return __kpit_elapsed(kvm); + + return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); +} + static int pit_get_count(struct kvm *kvm, int channel) { struct kvm_kpit_channel_state *c = @@ -107,7 +141,7 @@ static int pit_get_count(struct kvm *kvm, int channel) WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); + t = kpit_elapsed(kvm, c, channel); d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); switch (c->mode) { @@ -137,7 +171,7 @@ static int pit_get_out(struct kvm *kvm, int channel) WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); + t = kpit_elapsed(kvm, c, channel); d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); switch (c->mode) { @@ -193,28 +227,6 @@ static void pit_latch_status(struct kvm *kvm, int channel) } } -static int __pit_timer_fn(struct kvm_kpit_state *ps) -{ - struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0]; - struct kvm_kpit_timer *pt = &ps->pit_timer; - - if (!atomic_inc_and_test(&pt->pending)) - set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); - - if (!pt->reinject) - atomic_set(&pt->pending, 1); - - if (vcpu0 && waitqueue_active(&vcpu0->wq)) - wake_up_interruptible(&vcpu0->wq); - - hrtimer_add_expires_ns(&pt->timer, pt->period); - pt->scheduled = hrtimer_get_expires_ns(&pt->timer); - if (pt->period) - ps->channels[0].count_load_time = ktime_get(); - - return (pt->period == 0 ? 0 : 1); -} - int pit_has_pending_timer(struct kvm_vcpu *vcpu) { struct kvm_pit *pit = vcpu->kvm->arch.vpit; @@ -235,21 +247,6 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) spin_unlock(&ps->inject_lock); } -static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) -{ - struct kvm_kpit_state *ps; - int restart_timer = 0; - - ps = container_of(data, struct kvm_kpit_state, pit_timer.timer); - - restart_timer = __pit_timer_fn(ps); - - if (restart_timer) - return HRTIMER_RESTART; - else - return HRTIMER_NORESTART; -} - void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) { struct kvm_pit *pit = vcpu->kvm->arch.vpit; @@ -263,15 +260,26 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } -static void destroy_pit_timer(struct kvm_kpit_timer *pt) +static void destroy_pit_timer(struct kvm_timer *pt) { pr_debug("pit: execute del timer!\n"); hrtimer_cancel(&pt->timer); } +static bool kpit_is_periodic(struct kvm_timer *ktimer) +{ + struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state, + pit_timer); + return ps->is_periodic; +} + +static struct kvm_timer_ops kpit_ops = { + .is_periodic = kpit_is_periodic, +}; + static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) { - struct kvm_kpit_timer *pt = &ps->pit_timer; + struct kvm_timer *pt = &ps->pit_timer; s64 interval; interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); @@ -280,8 +288,14 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) /* TODO The new value only affected after the retriggered */ hrtimer_cancel(&pt->timer); - pt->period = (is_period == 0) ? 0 : interval; - pt->timer.function = pit_timer_fn; + pt->period = interval; + ps->is_periodic = is_period; + + pt->timer.function = kvm_timer_fn; + pt->t_ops = &kpit_ops; + pt->kvm = ps->pit->kvm; + pt->vcpu_id = 0; + atomic_set(&pt->pending, 0); ps->irq_ack = 1; @@ -298,23 +312,23 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) pr_debug("pit: load_count val is %d, channel is %d\n", val, channel); /* - * Though spec said the state of 8254 is undefined after power-up, - * seems some tricky OS like Windows XP depends on IRQ0 interrupt - * when booting up. - * So here setting initialize rate for it, and not a specific number + * The largest possible initial count is 0; this is equivalent + * to 216 for binary counting and 104 for BCD counting. */ if (val == 0) val = 0x10000; - ps->channels[channel].count_load_time = ktime_get(); ps->channels[channel].count = val; - if (channel != 0) + if (channel != 0) { + ps->channels[channel].count_load_time = ktime_get(); return; + } /* Two types of timer * mode 1 is one shot, mode 2 is period, otherwise del timer */ switch (ps->channels[0].mode) { + case 0: case 1: /* FIXME: enhance mode 4 precision */ case 4: diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 6acbe4b505d..bbd863ff60b 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -3,15 +3,6 @@ #include "iodev.h" -struct kvm_kpit_timer { - struct hrtimer timer; - int irq; - s64 period; /* unit: ns */ - s64 scheduled; - atomic_t pending; - bool reinject; -}; - struct kvm_kpit_channel_state { u32 count; /* can be 65536 */ u16 latched_count; @@ -30,7 +21,8 @@ struct kvm_kpit_channel_state { struct kvm_kpit_state { struct kvm_kpit_channel_state channels[3]; - struct kvm_kpit_timer pit_timer; + struct kvm_timer pit_timer; + bool is_periodic; u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index cf17ed52f6f..96dfbb6ad2a 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -24,6 +24,7 @@ #include "irq.h" #include "i8254.h" +#include "x86.h" /* * check if there are pending timer events @@ -48,6 +49,9 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) { struct kvm_pic *s; + if (!irqchip_in_kernel(v->kvm)) + return v->arch.interrupt.pending; + if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ if (kvm_apic_accept_pic_intr(v)) { s = pic_irqchip(v->kvm); /* PIC */ @@ -67,6 +71,9 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) struct kvm_pic *s; int vector; + if (!irqchip_in_kernel(v->kvm)) + return v->arch.interrupt.nr; + vector = kvm_get_apic_interrupt(v); /* APIC */ if (vector == -1) { if (kvm_apic_accept_pic_intr(v)) { diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h new file mode 100644 index 00000000000..26bd6ba74e1 --- /dev/null +++ b/arch/x86/kvm/kvm_timer.h @@ -0,0 +1,18 @@ + +struct kvm_timer { + struct hrtimer timer; + s64 period; /* unit: ns */ + atomic_t pending; /* accumulated triggered timers */ + bool reinject; + struct kvm_timer_ops *t_ops; + struct kvm *kvm; + int vcpu_id; +}; + +struct kvm_timer_ops { + bool (*is_periodic)(struct kvm_timer *); +}; + + +enum hrtimer_restart kvm_timer_fn(struct hrtimer *data); + diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index f0b67f2cdd6..ae99d83f81a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -196,20 +196,15 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig) +static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, + int vector, int level, int trig_mode); + +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic_test_and_set_irr(vec, apic)) { - /* a new pending irq is set in IRR */ - if (trig) - apic_set_vector(vec, apic->regs + APIC_TMR); - else - apic_clear_vector(vec, apic->regs + APIC_TMR); - kvm_vcpu_kick(apic->vcpu); - return 1; - } - return 0; + return __apic_accept_irq(apic, irq->delivery_mode, irq->vector, + irq->level, irq->trig_mode); } static inline int apic_find_highest_isr(struct kvm_lapic *apic) @@ -250,7 +245,7 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) { - return kvm_apic_id(apic) == dest; + return dest == 0xff || kvm_apic_id(apic) == dest; } int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) @@ -279,37 +274,34 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) return result; } -static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, +int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, int dest, int dest_mode) { int result = 0; struct kvm_lapic *target = vcpu->arch.apic; apic_debug("target %p, source %p, dest 0x%x, " - "dest_mode 0x%x, short_hand 0x%x", + "dest_mode 0x%x, short_hand 0x%x\n", target, source, dest, dest_mode, short_hand); ASSERT(!target); switch (short_hand) { case APIC_DEST_NOSHORT: - if (dest_mode == 0) { + if (dest_mode == 0) /* Physical mode. */ - if ((dest == 0xFF) || (dest == kvm_apic_id(target))) - result = 1; - } else + result = kvm_apic_match_physical_addr(target, dest); + else /* Logical mode. */ result = kvm_apic_match_logical_addr(target, dest); break; case APIC_DEST_SELF: - if (target == source) - result = 1; + result = (target == source); break; case APIC_DEST_ALLINC: result = 1; break; case APIC_DEST_ALLBUT: - if (target != source) - result = 1; + result = (target != source); break; default: printk(KERN_WARNING "Bad dest shorthand value %x\n", @@ -327,20 +319,22 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode) { - int orig_irr, result = 0; + int result = 0; struct kvm_vcpu *vcpu = apic->vcpu; switch (delivery_mode) { - case APIC_DM_FIXED: case APIC_DM_LOWEST: + vcpu->arch.apic_arb_prio++; + case APIC_DM_FIXED: /* FIXME add logic for vcpu on reset */ if (unlikely(!apic_enabled(apic))) break; - orig_irr = apic_test_and_set_irr(vector, apic); - if (orig_irr && trig_mode) { - apic_debug("level trig mode repeatedly for vector %d", - vector); + result = !apic_test_and_set_irr(vector, apic); + if (!result) { + if (trig_mode) + apic_debug("level trig mode repeatedly for " + "vector %d", vector); break; } @@ -349,10 +343,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, apic_set_vector(vector, apic->regs + APIC_TMR); } else apic_clear_vector(vector, apic->regs + APIC_TMR); - kvm_vcpu_kick(vcpu); - - result = (orig_irr == 0); break; case APIC_DM_REMRD: @@ -364,12 +355,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; case APIC_DM_NMI: + result = 1; kvm_inject_nmi(vcpu); kvm_vcpu_kick(vcpu); break; case APIC_DM_INIT: if (level) { + result = 1; if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) printk(KERN_DEBUG "INIT on a runnable vcpu %d\n", @@ -386,6 +379,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, apic_debug("SIPI to vcpu %d vector 0x%02x\n", vcpu->vcpu_id, vector); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { + result = 1; vcpu->arch.sipi_vector = vector; vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; kvm_vcpu_kick(vcpu); @@ -408,43 +402,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, return result; } -static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, - unsigned long bitmap) -{ - int last; - int next; - struct kvm_lapic *apic = NULL; - - last = kvm->arch.round_robin_prev_vcpu; - next = last; - - do { - if (++next == KVM_MAX_VCPUS) - next = 0; - if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap)) - continue; - apic = kvm->vcpus[next]->arch.apic; - if (apic && apic_enabled(apic)) - break; - apic = NULL; - } while (next != last); - kvm->arch.round_robin_prev_vcpu = next; - - if (!apic) - printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n"); - - return apic; -} - -struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, - unsigned long bitmap) +int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) { - struct kvm_lapic *apic; - - apic = kvm_apic_round_robin(kvm, vector, bitmap); - if (apic) - return apic->vcpu; - return NULL; + return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; } static void apic_set_eoi(struct kvm_lapic *apic) @@ -472,47 +432,24 @@ static void apic_send_ipi(struct kvm_lapic *apic) { u32 icr_low = apic_get_reg(apic, APIC_ICR); u32 icr_high = apic_get_reg(apic, APIC_ICR2); + struct kvm_lapic_irq irq; - unsigned int dest = GET_APIC_DEST_FIELD(icr_high); - unsigned int short_hand = icr_low & APIC_SHORT_MASK; - unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG; - unsigned int level = icr_low & APIC_INT_ASSERT; - unsigned int dest_mode = icr_low & APIC_DEST_MASK; - unsigned int delivery_mode = icr_low & APIC_MODE_MASK; - unsigned int vector = icr_low & APIC_VECTOR_MASK; - - struct kvm_vcpu *target; - struct kvm_vcpu *vcpu; - unsigned long lpr_map = 0; - int i; + irq.vector = icr_low & APIC_VECTOR_MASK; + irq.delivery_mode = icr_low & APIC_MODE_MASK; + irq.dest_mode = icr_low & APIC_DEST_MASK; + irq.level = icr_low & APIC_INT_ASSERT; + irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; + irq.shorthand = icr_low & APIC_SHORT_MASK; + irq.dest_id = GET_APIC_DEST_FIELD(icr_high); apic_debug("icr_high 0x%x, icr_low 0x%x, " "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", - icr_high, icr_low, short_hand, dest, - trig_mode, level, dest_mode, delivery_mode, vector); - - for (i = 0; i < KVM_MAX_VCPUS; i++) { - vcpu = apic->vcpu->kvm->vcpus[i]; - if (!vcpu) - continue; - - if (vcpu->arch.apic && - apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) { - if (delivery_mode == APIC_DM_LOWEST) - set_bit(vcpu->vcpu_id, &lpr_map); - else - __apic_accept_irq(vcpu->arch.apic, delivery_mode, - vector, level, trig_mode); - } - } + icr_high, icr_low, irq.shorthand, irq.dest_id, + irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, + irq.vector); - if (delivery_mode == APIC_DM_LOWEST) { - target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map); - if (target != NULL) - __apic_accept_irq(target->arch.apic, delivery_mode, - vector, level, trig_mode); - } + kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); } static u32 apic_get_tmcct(struct kvm_lapic *apic) @@ -527,12 +464,13 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) if (apic_get_reg(apic, APIC_TMICT) == 0) return 0; - remaining = hrtimer_expires_remaining(&apic->timer.dev); + remaining = hrtimer_expires_remaining(&apic->lapic_timer.timer); if (ktime_to_ns(remaining) < 0) remaining = ktime_set(0, 0); - ns = mod_64(ktime_to_ns(remaining), apic->timer.period); - tmcct = div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->timer.divide_count)); + ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period); + tmcct = div64_u64(ns, + (APIC_BUS_CYCLE_NS * apic->divide_count)); return tmcct; } @@ -619,25 +557,25 @@ static void update_divide_count(struct kvm_lapic *apic) tdcr = apic_get_reg(apic, APIC_TDCR); tmp1 = tdcr & 0xf; tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; - apic->timer.divide_count = 0x1 << (tmp2 & 0x7); + apic->divide_count = 0x1 << (tmp2 & 0x7); apic_debug("timer divide count is 0x%x\n", - apic->timer.divide_count); + apic->divide_count); } static void start_apic_timer(struct kvm_lapic *apic) { - ktime_t now = apic->timer.dev.base->get_time(); + ktime_t now = apic->lapic_timer.timer.base->get_time(); - apic->timer.period = apic_get_reg(apic, APIC_TMICT) * - APIC_BUS_CYCLE_NS * apic->timer.divide_count; - atomic_set(&apic->timer.pending, 0); + apic->lapic_timer.period = apic_get_reg(apic, APIC_TMICT) * + APIC_BUS_CYCLE_NS * apic->divide_count; + atomic_set(&apic->lapic_timer.pending, 0); - if (!apic->timer.period) + if (!apic->lapic_timer.period) return; - hrtimer_start(&apic->timer.dev, - ktime_add_ns(now, apic->timer.period), + hrtimer_start(&apic->lapic_timer.timer, + ktime_add_ns(now, apic->lapic_timer.period), HRTIMER_MODE_ABS); apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" @@ -646,9 +584,9 @@ static void start_apic_timer(struct kvm_lapic *apic) "expire @ 0x%016" PRIx64 ".\n", __func__, APIC_BUS_CYCLE_NS, ktime_to_ns(now), apic_get_reg(apic, APIC_TMICT), - apic->timer.period, + apic->lapic_timer.period, ktime_to_ns(ktime_add_ns(now, - apic->timer.period))); + apic->lapic_timer.period))); } static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) @@ -730,7 +668,7 @@ static void apic_mmio_write(struct kvm_io_device *this, apic_set_reg(apic, APIC_LVTT + 0x10 * i, lvt_val | APIC_LVT_MASKED); } - atomic_set(&apic->timer.pending, 0); + atomic_set(&apic->lapic_timer.pending, 0); } break; @@ -762,7 +700,7 @@ static void apic_mmio_write(struct kvm_io_device *this, break; case APIC_TMICT: - hrtimer_cancel(&apic->timer.dev); + hrtimer_cancel(&apic->lapic_timer.timer); apic_set_reg(apic, APIC_TMICT, val); start_apic_timer(apic); return; @@ -802,7 +740,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) if (!vcpu->arch.apic) return; - hrtimer_cancel(&vcpu->arch.apic->timer.dev); + hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); if (vcpu->arch.apic->regs_page) __free_page(vcpu->arch.apic->regs_page); @@ -880,7 +818,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) ASSERT(apic != NULL); /* Stop the timer in case it's a reset to an active apic */ - hrtimer_cancel(&apic->timer.dev); + hrtimer_cancel(&apic->lapic_timer.timer); apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); apic_set_reg(apic, APIC_LVR, APIC_VERSION); @@ -905,11 +843,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } update_divide_count(apic); - atomic_set(&apic->timer.pending, 0); + atomic_set(&apic->lapic_timer.pending, 0); if (vcpu->vcpu_id == 0) vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; apic_update_ppr(apic); + vcpu->arch.apic_arb_prio = 0; + apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, vcpu, kvm_apic_id(apic), @@ -917,16 +857,14 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_reset); -int kvm_lapic_enabled(struct kvm_vcpu *vcpu) +bool kvm_apic_present(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; - int ret = 0; - - if (!apic) - return 0; - ret = apic_enabled(apic); + return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic); +} - return ret; +int kvm_lapic_enabled(struct kvm_vcpu *vcpu) +{ + return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic); } EXPORT_SYMBOL_GPL(kvm_lapic_enabled); @@ -936,22 +874,11 @@ EXPORT_SYMBOL_GPL(kvm_lapic_enabled); *---------------------------------------------------------------------- */ -/* TODO: make sure __apic_timer_fn runs in current pCPU */ -static int __apic_timer_fn(struct kvm_lapic *apic) +static bool lapic_is_periodic(struct kvm_timer *ktimer) { - int result = 0; - wait_queue_head_t *q = &apic->vcpu->wq; - - if(!atomic_inc_and_test(&apic->timer.pending)) - set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); - if (waitqueue_active(q)) - wake_up_interruptible(q); - - if (apic_lvtt_period(apic)) { - result = 1; - hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period); - } - return result; + struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, + lapic_timer); + return apic_lvtt_period(apic); } int apic_has_pending_timer(struct kvm_vcpu *vcpu) @@ -959,7 +886,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu) struct kvm_lapic *lapic = vcpu->arch.apic; if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT)) - return atomic_read(&lapic->timer.pending); + return atomic_read(&lapic->lapic_timer.pending); return 0; } @@ -986,20 +913,9 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu) kvm_apic_local_deliver(apic, APIC_LVT0); } -static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) -{ - struct kvm_lapic *apic; - int restart_timer = 0; - - apic = container_of(data, struct kvm_lapic, timer.dev); - - restart_timer = __apic_timer_fn(apic); - - if (restart_timer) - return HRTIMER_RESTART; - else - return HRTIMER_NORESTART; -} +static struct kvm_timer_ops lapic_timer_ops = { + .is_periodic = lapic_is_periodic, +}; int kvm_create_lapic(struct kvm_vcpu *vcpu) { @@ -1024,8 +940,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) memset(apic->regs, 0, PAGE_SIZE); apic->vcpu = vcpu; - hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - apic->timer.dev.function = apic_timer_fn; + hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); + apic->lapic_timer.timer.function = kvm_timer_fn; + apic->lapic_timer.t_ops = &lapic_timer_ops; + apic->lapic_timer.kvm = vcpu->kvm; + apic->lapic_timer.vcpu_id = vcpu->vcpu_id; + apic->base_address = APIC_DEFAULT_PHYS_BASE; vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; @@ -1078,9 +999,9 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (apic && atomic_read(&apic->timer.pending) > 0) { + if (apic && atomic_read(&apic->lapic_timer.pending) > 0) { if (kvm_apic_local_deliver(apic, APIC_LVTT)) - atomic_dec(&apic->timer.pending); + atomic_dec(&apic->lapic_timer.pending); } } @@ -1106,7 +1027,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) MSR_IA32_APICBASE_BASE; apic_set_reg(apic, APIC_LVR, APIC_VERSION); apic_update_ppr(apic); - hrtimer_cancel(&apic->timer.dev); + hrtimer_cancel(&apic->lapic_timer.timer); update_divide_count(apic); start_apic_timer(apic); } @@ -1119,7 +1040,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) if (!apic) return; - timer = &apic->timer.dev; + timer = &apic->lapic_timer.timer; if (hrtimer_cancel(timer)) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 45ab6ee7120..a587f8349c4 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -2,18 +2,15 @@ #define __KVM_X86_LAPIC_H #include "iodev.h" +#include "kvm_timer.h" #include <linux/kvm_host.h> struct kvm_lapic { unsigned long base_address; struct kvm_io_device dev; - struct { - atomic_t pending; - s64 period; /* unit: ns */ - u32 divide_count; - struct hrtimer dev; - } timer; + struct kvm_timer lapic_timer; + u32 divide_count; struct kvm_vcpu *vcpu; struct page *regs_page; void *regs; @@ -34,12 +31,13 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig); +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); int kvm_lapic_enabled(struct kvm_vcpu *vcpu); +bool kvm_apic_present(struct kvm_vcpu *vcpu); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 32cf11e5728..0ef5bb2b404 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -126,6 +126,7 @@ module_param(oos_shadow, bool, 0644); #define PFERR_PRESENT_MASK (1U << 0) #define PFERR_WRITE_MASK (1U << 1) #define PFERR_USER_MASK (1U << 2) +#define PFERR_RSVD_MASK (1U << 3) #define PFERR_FETCH_MASK (1U << 4) #define PT_DIRECTORY_LEVEL 2 @@ -177,7 +178,11 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ static u64 __read_mostly shadow_user_mask; static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; -static u64 __read_mostly shadow_mt_mask; + +static inline u64 rsvd_bits(int s, int e) +{ + return ((1ULL << (e - s + 1)) - 1) << s; +} void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) { @@ -193,14 +198,13 @@ void kvm_mmu_set_base_ptes(u64 base_pte) EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask) + u64 dirty_mask, u64 nx_mask, u64 x_mask) { shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; shadow_dirty_mask = dirty_mask; shadow_nx_mask = nx_mask; shadow_x_mask = x_mask; - shadow_mt_mask = mt_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); @@ -219,11 +223,6 @@ static int is_nx(struct kvm_vcpu *vcpu) return vcpu->arch.shadow_efer & EFER_NX; } -static int is_present_pte(unsigned long pte) -{ - return pte & PT_PRESENT_MASK; -} - static int is_shadow_present_pte(u64 pte) { return pte != shadow_trap_nonpresent_pte @@ -490,16 +489,20 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) * * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc * containing more mappings. + * + * Returns the number of rmap entries before the spte was added or zero if + * the spte was not added. + * */ -static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) +static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) { struct kvm_mmu_page *sp; struct kvm_rmap_desc *desc; unsigned long *rmapp; - int i; + int i, count = 0; if (!is_rmap_pte(*spte)) - return; + return count; gfn = unalias_gfn(vcpu->kvm, gfn); sp = page_header(__pa(spte)); sp->gfns[spte - sp->spt] = gfn; @@ -516,8 +519,10 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) } else { rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) + while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) { desc = desc->more; + count += RMAP_EXT; + } if (desc->shadow_ptes[RMAP_EXT-1]) { desc->more = mmu_alloc_rmap_desc(vcpu); desc = desc->more; @@ -526,6 +531,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) ; desc->shadow_ptes[i] = spte; } + return count; } static void rmap_desc_remove_entry(unsigned long *rmapp, @@ -755,6 +761,19 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) return young; } +#define RMAP_RECYCLE_THRESHOLD 1000 + +static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage) +{ + unsigned long *rmapp; + + gfn = unalias_gfn(vcpu->kvm, gfn); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); + + kvm_unmap_rmapp(vcpu->kvm, rmapp); + kvm_flush_remote_tlbs(vcpu->kvm); +} + int kvm_age_hva(struct kvm *kvm, unsigned long hva) { return kvm_handle_hva(kvm, hva, kvm_age_rmapp); @@ -1074,18 +1093,10 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) return NULL; } -static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - list_del(&sp->oos_link); - --kvm->stat.mmu_unsync_global; -} - static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { WARN_ON(!sp->unsync); sp->unsync = 0; - if (sp->global) - kvm_unlink_unsync_global(kvm, sp); --kvm->stat.mmu_unsync; } @@ -1248,7 +1259,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word); sp->gfn = gfn; sp->role = role; - sp->global = 0; hlist_add_head(&sp->hash_link, bucket); if (!direct) { if (rmap_write_protect(vcpu->kvm, gfn)) @@ -1417,24 +1427,25 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) */ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) { + int used_pages; + + used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; + used_pages = max(0, used_pages); + /* * If we set the number of mmu pages to be smaller be than the * number of actived pages , we must to free some mmu pages before we * change the value */ - if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) > - kvm_nr_mmu_pages) { - int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages - - kvm->arch.n_free_mmu_pages; - - while (n_used_mmu_pages > kvm_nr_mmu_pages) { + if (used_pages > kvm_nr_mmu_pages) { + while (used_pages > kvm_nr_mmu_pages) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); kvm_mmu_zap_page(kvm, page); - n_used_mmu_pages--; + used_pages--; } kvm->arch.n_free_mmu_pages = 0; } @@ -1616,7 +1627,7 @@ static int get_mtrr_type(struct mtrr_state_type *mtrr_state, return mtrr_state->def_type; } -static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) +u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) { u8 mtrr; @@ -1626,6 +1637,7 @@ static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) mtrr = MTRR_TYPE_WRBACK; return mtrr; } +EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -1646,11 +1658,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) ++vcpu->kvm->stat.mmu_unsync; sp->unsync = 1; - if (sp->global) { - list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages); - ++vcpu->kvm->stat.mmu_unsync_global; - } else - kvm_mmu_mark_parents_unsync(vcpu, sp); + kvm_mmu_mark_parents_unsync(vcpu, sp); mmu_convert_notrap(sp); return 0; @@ -1677,21 +1685,11 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, unsigned pte_access, int user_fault, int write_fault, int dirty, int largepage, - int global, gfn_t gfn, pfn_t pfn, bool speculative, + gfn_t gfn, pfn_t pfn, bool speculative, bool can_unsync) { u64 spte; int ret = 0; - u64 mt_mask = shadow_mt_mask; - struct kvm_mmu_page *sp = page_header(__pa(shadow_pte)); - - if (!global && sp->global) { - sp->global = 0; - if (sp->unsync) { - kvm_unlink_unsync_global(vcpu->kvm, sp); - kvm_mmu_mark_parents_unsync(vcpu, sp); - } - } /* * We don't set the accessed bit, since we sometimes want to see @@ -1711,16 +1709,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, spte |= shadow_user_mask; if (largepage) spte |= PT_PAGE_SIZE_MASK; - if (mt_mask) { - if (!kvm_is_mmio_pfn(pfn)) { - mt_mask = get_memory_type(vcpu, gfn) << - kvm_x86_ops->get_mt_mask_shift(); - mt_mask |= VMX_EPT_IGMT_BIT; - } else - mt_mask = MTRR_TYPE_UNCACHABLE << - kvm_x86_ops->get_mt_mask_shift(); - spte |= mt_mask; - } + if (tdp_enabled) + spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, + kvm_is_mmio_pfn(pfn)); spte |= (u64)pfn << PAGE_SHIFT; @@ -1765,11 +1756,12 @@ set_pte: static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, unsigned pt_access, unsigned pte_access, int user_fault, int write_fault, int dirty, - int *ptwrite, int largepage, int global, - gfn_t gfn, pfn_t pfn, bool speculative) + int *ptwrite, int largepage, gfn_t gfn, + pfn_t pfn, bool speculative) { int was_rmapped = 0; int was_writeble = is_writeble_pte(*shadow_pte); + int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" " user_fault %d gfn %lx\n", @@ -1795,7 +1787,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, was_rmapped = 1; } if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, - dirty, largepage, global, gfn, pfn, speculative, true)) { + dirty, largepage, gfn, pfn, speculative, true)) { if (write_fault) *ptwrite = 1; kvm_x86_ops->tlb_flush(vcpu); @@ -1811,9 +1803,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, page_header_update_slot(vcpu->kvm, shadow_pte, gfn); if (!was_rmapped) { - rmap_add(vcpu, shadow_pte, gfn, largepage); + rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage); if (!is_rmap_pte(*shadow_pte)) kvm_release_pfn_clean(pfn); + if (rmap_count > RMAP_RECYCLE_THRESHOLD) + rmap_recycle(vcpu, gfn, largepage); } else { if (was_writeble) kvm_release_pfn_dirty(pfn); @@ -1843,7 +1837,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) { mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 0, write, 1, &pt_write, - largepage, 0, gfn, pfn, false); + largepage, gfn, pfn, false); ++vcpu->stat.pf_fixed; break; } @@ -1942,7 +1936,19 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) vcpu->arch.mmu.root_hpa = INVALID_PAGE; } -static void mmu_alloc_roots(struct kvm_vcpu *vcpu) +static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) +{ + int ret = 0; + + if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + ret = 1; + } + + return ret; +} + +static int mmu_alloc_roots(struct kvm_vcpu *vcpu) { int i; gfn_t root_gfn; @@ -1957,13 +1963,15 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); if (tdp_enabled) direct = 1; + if (mmu_check_root(vcpu, root_gfn)) + return 1; sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, direct, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; vcpu->arch.mmu.root_hpa = root; - return; + return 0; } direct = !is_paging(vcpu); if (tdp_enabled) @@ -1980,6 +1988,8 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; } else if (vcpu->arch.mmu.root_level == 0) root_gfn = 0; + if (mmu_check_root(vcpu, root_gfn)) + return 1; sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, direct, ACC_ALL, NULL); @@ -1988,6 +1998,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; } vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); + return 0; } static void mmu_sync_roots(struct kvm_vcpu *vcpu) @@ -2006,7 +2017,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; - if (root) { + if (root && VALID_PAGE(root)) { root &= PT64_BASE_ADDR_MASK; sp = page_header(root); mmu_sync_children(vcpu, sp); @@ -2014,15 +2025,6 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) } } -static void mmu_sync_global(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = vcpu->kvm; - struct kvm_mmu_page *sp, *n; - - list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link) - kvm_sync_page(vcpu, sp); -} - void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) { spin_lock(&vcpu->kvm->mmu_lock); @@ -2030,13 +2032,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->kvm->mmu_lock); } -void kvm_mmu_sync_global(struct kvm_vcpu *vcpu) -{ - spin_lock(&vcpu->kvm->mmu_lock); - mmu_sync_global(vcpu); - spin_unlock(&vcpu->kvm->mmu_lock); -} - static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) { return vaddr; @@ -2151,6 +2146,14 @@ static void paging_free(struct kvm_vcpu *vcpu) nonpaging_free(vcpu); } +static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) +{ + int bit7; + + bit7 = (gpte >> 7) & 1; + return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0; +} + #define PTTYPE 64 #include "paging_tmpl.h" #undef PTTYPE @@ -2159,6 +2162,59 @@ static void paging_free(struct kvm_vcpu *vcpu) #include "paging_tmpl.h" #undef PTTYPE +static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + int maxphyaddr = cpuid_maxphyaddr(vcpu); + u64 exb_bit_rsvd = 0; + + if (!is_nx(vcpu)) + exb_bit_rsvd = rsvd_bits(63, 63); + switch (level) { + case PT32_ROOT_LEVEL: + /* no rsvd bits for 2 level 4K page table entries */ + context->rsvd_bits_mask[0][1] = 0; + context->rsvd_bits_mask[0][0] = 0; + if (is_cpuid_PSE36()) + /* 36bits PSE 4MB page */ + context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); + else + /* 32 bits PSE 4MB page */ + context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; + break; + case PT32E_ROOT_LEVEL: + context->rsvd_bits_mask[0][2] = + rsvd_bits(maxphyaddr, 63) | + rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */ + context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 62); /* PDE */ + context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 62); /* PTE */ + context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 62) | + rsvd_bits(13, 20); /* large page */ + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; + break; + case PT64_ROOT_LEVEL: + context->rsvd_bits_mask[0][3] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); + context->rsvd_bits_mask[0][2] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); + context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51); + context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51); + context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; + context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2]; + context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | + rsvd_bits(13, 20); /* large page */ + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; + break; + } +} + static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) { struct kvm_mmu *context = &vcpu->arch.mmu; @@ -2179,6 +2235,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) static int paging64_init_context(struct kvm_vcpu *vcpu) { + reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); } @@ -2186,6 +2243,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = &vcpu->arch.mmu; + reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; @@ -2201,6 +2259,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) static int paging32E_init_context(struct kvm_vcpu *vcpu) { + reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); } @@ -2221,12 +2280,15 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->gva_to_gpa = nonpaging_gva_to_gpa; context->root_level = 0; } else if (is_long_mode(vcpu)) { + reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT64_ROOT_LEVEL; } else if (is_pae(vcpu)) { + reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT32E_ROOT_LEVEL; } else { + reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); context->gva_to_gpa = paging32_gva_to_gpa; context->root_level = PT32_ROOT_LEVEL; } @@ -2290,9 +2352,11 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) goto out; spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); - mmu_alloc_roots(vcpu); + r = mmu_alloc_roots(vcpu); mmu_sync_roots(vcpu); spin_unlock(&vcpu->kvm->mmu_lock); + if (r) + goto out; kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); kvm_mmu_flush_tlb(vcpu); out: @@ -2638,14 +2702,6 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp); static void free_mmu_pages(struct kvm_vcpu *vcpu) { - struct kvm_mmu_page *sp; - - while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) { - sp = container_of(vcpu->kvm->arch.active_mmu_pages.next, - struct kvm_mmu_page, link); - kvm_mmu_zap_page(vcpu->kvm, sp); - cond_resched(); - } free_page((unsigned long)vcpu->arch.mmu.pae_root); } @@ -2710,7 +2766,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) { struct kvm_mmu_page *sp; - spin_lock(&kvm->mmu_lock); list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { int i; u64 *pt; @@ -2725,7 +2780,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) pt[i] &= ~PT_WRITABLE_MASK; } kvm_flush_remote_tlbs(kvm); - spin_unlock(&kvm->mmu_lock); } void kvm_mmu_zap_all(struct kvm *kvm) @@ -3007,11 +3061,13 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, " in nonleaf level: levels %d gva %lx" " level %d pte %llx\n", audit_msg, vcpu->arch.mmu.root_level, va, level, ent); - - audit_mappings_page(vcpu, ent, va, level - 1); + else + audit_mappings_page(vcpu, ent, va, level - 1); } else { gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); - hpa_t hpa = (hpa_t)gpa_to_pfn(vcpu, gpa) << PAGE_SHIFT; + gfn_t gfn = gpa >> PAGE_SHIFT; + pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); + hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; if (is_shadow_present_pte(ent) && (ent & PT64_BASE_ADDR_MASK) != hpa) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index eaab2145f62..3494a2fb136 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -75,4 +75,9 @@ static inline int is_paging(struct kvm_vcpu *vcpu) return vcpu->arch.cr0 & X86_CR0_PG; } +static inline int is_present_pte(unsigned long pte) +{ + return pte & PT_PRESENT_MASK; +} + #endif diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6bd70206c56..67785f63539 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -123,6 +123,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, gfn_t table_gfn; unsigned index, pt_access, pte_access; gpa_t pte_gpa; + int rsvd_fault = 0; pgprintk("%s: addr %lx\n", __func__, addr); walk: @@ -157,6 +158,10 @@ walk: if (!is_present_pte(pte)) goto not_present; + rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); + if (rsvd_fault) + goto access_error; + if (write_fault && !is_writeble_pte(pte)) if (user_fault || is_write_protection(vcpu)) goto access_error; @@ -209,7 +214,6 @@ walk: if (ret) goto walk; pte |= PT_DIRTY_MASK; - kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0); walker->ptes[walker->level - 1] = pte; } @@ -233,6 +237,8 @@ err: walker->error_code |= PFERR_USER_MASK; if (fetch_fault) walker->error_code |= PFERR_FETCH_MASK; + if (rsvd_fault) + walker->error_code |= PFERR_RSVD_MASK; return 0; } @@ -262,8 +268,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, kvm_get_pfn(pfn); mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, gpte & PT_DIRTY_MASK, NULL, largepage, - gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte), - pfn, true); + gpte_to_gfn(gpte), pfn, true); } /* @@ -276,7 +281,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, { unsigned access = gw->pt_access; struct kvm_mmu_page *shadow_page; - u64 spte, *sptep; + u64 spte, *sptep = NULL; int direct; gfn_t table_gfn; int r; @@ -297,7 +302,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, user_fault, write_fault, gw->ptes[gw->level-1] & PT_DIRTY_MASK, ptwrite, largepage, - gw->ptes[gw->level-1] & PT_GLOBAL_MASK, gw->gfn, pfn, false); break; } @@ -380,7 +384,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, return r; /* - * Look up the shadow pte for the faulting address. + * Look up the guest pte for the faulting address. */ r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, fetch_fault); @@ -586,7 +590,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) nr_present++; pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, - is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn, + is_dirty_pte(gpte), 0, gfn, spte_to_pfn(sp->spt[i]), true, false); } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1f8510c51d6..b1f658ad2f0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -19,6 +19,7 @@ #include "irq.h" #include "mmu.h" #include "kvm_cache_regs.h" +#include "x86.h" #include <linux/module.h> #include <linux/kernel.h> @@ -69,7 +70,6 @@ module_param(npt, int, S_IRUGO); static int nested = 0; module_param(nested, int, S_IRUGO); -static void kvm_reput_irq(struct vcpu_svm *svm); static void svm_flush_tlb(struct kvm_vcpu *vcpu); static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); @@ -132,24 +132,6 @@ static inline u32 svm_has(u32 feat) return svm_features & feat; } -static inline u8 pop_irq(struct kvm_vcpu *vcpu) -{ - int word_index = __ffs(vcpu->arch.irq_summary); - int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); - int irq = word_index * BITS_PER_LONG + bit_index; - - clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); - if (!vcpu->arch.irq_pending[word_index]) - clear_bit(word_index, &vcpu->arch.irq_summary); - return irq; -} - -static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) -{ - set_bit(irq, vcpu->arch.irq_pending); - set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); -} - static inline void clgi(void) { asm volatile (__ex(SVM_CLGI)); @@ -214,17 +196,31 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, svm->vmcb->control.event_inj_err = error_code; } -static bool svm_exception_injected(struct kvm_vcpu *vcpu) +static int is_external_interrupt(u32 info) +{ + info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; + return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); +} + +static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) { struct vcpu_svm *svm = to_svm(vcpu); + u32 ret = 0; - return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID); + if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) + ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS; + return ret & mask; } -static int is_external_interrupt(u32 info) +static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) { - info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; - return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); + struct vcpu_svm *svm = to_svm(vcpu); + + if (mask == 0) + svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; + else + svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK; + } static void skip_emulated_instruction(struct kvm_vcpu *vcpu) @@ -232,7 +228,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); if (!svm->next_rip) { - printk(KERN_DEBUG "%s: NOP\n", __func__); + if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) != + EMULATE_DONE) + printk(KERN_DEBUG "%s: NOP\n", __func__); return; } if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) @@ -240,9 +238,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) __func__, kvm_rip_read(vcpu), svm->next_rip); kvm_rip_write(vcpu, svm->next_rip); - svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; - - vcpu->arch.interrupt_window_open = (svm->vcpu.arch.hflags & HF_GIF_MASK); + svm_set_interrupt_shadow(vcpu, 0); } static int has_svm(void) @@ -715,6 +711,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) svm->vmcb->control.tsc_offset += delta; vcpu->cpu = cpu; kvm_migrate_timers(vcpu); + svm->asid_generation = 0; } for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) @@ -830,6 +827,15 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, if (!var->unusable) var->type |= 0x1; break; + case VCPU_SREG_SS: + /* On AMD CPUs sometimes the DB bit in the segment + * descriptor is left as 1, although the whole segment has + * been made unusable. Clear it here to pass an Intel VMX + * entry check when cross vendor migrating. + */ + if (var->unusable) + var->db = 0; + break; } } @@ -960,15 +966,16 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, } -static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) +static void update_db_intercept(struct kvm_vcpu *vcpu) { - int old_debug = vcpu->guest_debug; struct vcpu_svm *svm = to_svm(vcpu); - vcpu->guest_debug = dbg->control; - svm->vmcb->control.intercept_exceptions &= ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); + + if (vcpu->arch.singlestep) + svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); + if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { if (vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) @@ -979,6 +986,16 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) 1 << BP_VECTOR; } else vcpu->guest_debug = 0; +} + +static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) +{ + int old_debug = vcpu->guest_debug; + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu->guest_debug = dbg->control; + + update_db_intercept(vcpu); if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; @@ -993,16 +1010,6 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) return 0; } -static int svm_get_irq(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u32 exit_int_info = svm->vmcb->control.exit_int_info; - - if (is_external_interrupt(exit_int_info)) - return exit_int_info & SVM_EVTINJ_VEC_MASK; - return -1; -} - static void load_host_msrs(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 @@ -1025,7 +1032,6 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data) svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; } - svm->vcpu.cpu = svm_data->cpu; svm->asid_generation = svm_data->asid_generation; svm->vmcb->control.asid = svm_data->next_asid++; } @@ -1107,17 +1113,8 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - u32 exit_int_info = svm->vmcb->control.exit_int_info; - struct kvm *kvm = svm->vcpu.kvm; u64 fault_address; u32 error_code; - bool event_injection = false; - - if (!irqchip_in_kernel(kvm) && - is_external_interrupt(exit_int_info)) { - event_injection = true; - push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); - } fault_address = svm->vmcb->control.exit_info_2; error_code = svm->vmcb->control.exit_info_1; @@ -1137,23 +1134,40 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) */ if (npt_enabled) svm_flush_tlb(&svm->vcpu); - - if (!npt_enabled && event_injection) - kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); + else { + if (kvm_event_needs_reinjection(&svm->vcpu)) + kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); + } return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); } static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { if (!(svm->vcpu.guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && + !svm->vcpu.arch.singlestep) { kvm_queue_exception(&svm->vcpu, DB_VECTOR); return 1; } - kvm_run->exit_reason = KVM_EXIT_DEBUG; - kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; - kvm_run->debug.arch.exception = DB_VECTOR; - return 0; + + if (svm->vcpu.arch.singlestep) { + svm->vcpu.arch.singlestep = false; + if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) + svm->vmcb->save.rflags &= + ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + update_db_intercept(&svm->vcpu); + } + + if (svm->vcpu.guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){ + kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->debug.arch.pc = + svm->vmcb->save.cs.base + svm->vmcb->save.rip; + kvm_run->debug.arch.exception = DB_VECTOR; + return 0; + } + + return 1; } static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) @@ -1842,17 +1856,51 @@ static int task_switch_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { u16 tss_selector; + int reason; + int int_type = svm->vmcb->control.exit_int_info & + SVM_EXITINTINFO_TYPE_MASK; + int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK; + uint32_t type = + svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; + uint32_t idt_v = + svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; tss_selector = (u16)svm->vmcb->control.exit_info_1; + if (svm->vmcb->control.exit_info_2 & (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET)) - return kvm_task_switch(&svm->vcpu, tss_selector, - TASK_SWITCH_IRET); - if (svm->vmcb->control.exit_info_2 & - (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) - return kvm_task_switch(&svm->vcpu, tss_selector, - TASK_SWITCH_JMP); - return kvm_task_switch(&svm->vcpu, tss_selector, TASK_SWITCH_CALL); + reason = TASK_SWITCH_IRET; + else if (svm->vmcb->control.exit_info_2 & + (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) + reason = TASK_SWITCH_JMP; + else if (idt_v) + reason = TASK_SWITCH_GATE; + else + reason = TASK_SWITCH_CALL; + + if (reason == TASK_SWITCH_GATE) { + switch (type) { + case SVM_EXITINTINFO_TYPE_NMI: + svm->vcpu.arch.nmi_injected = false; + break; + case SVM_EXITINTINFO_TYPE_EXEPT: + kvm_clear_exception_queue(&svm->vcpu); + break; + case SVM_EXITINTINFO_TYPE_INTR: + kvm_clear_interrupt_queue(&svm->vcpu); + break; + default: + break; + } + } + + if (reason != TASK_SWITCH_GATE || + int_type == SVM_EXITINTINFO_TYPE_SOFT || + (int_type == SVM_EXITINTINFO_TYPE_EXEPT && + (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) + skip_emulated_instruction(&svm->vcpu); + + return kvm_task_switch(&svm->vcpu, tss_selector, reason); } static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) @@ -1862,6 +1910,14 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } +static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + ++svm->vcpu.stat.nmi_window_exits; + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); + svm->vcpu.arch.hflags |= HF_IRET_MASK; + return 1; +} + static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) @@ -1879,8 +1935,14 @@ static int emulate_on_interception(struct vcpu_svm *svm, static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { + u8 cr8_prev = kvm_get_cr8(&svm->vcpu); + /* instruction emulation calls kvm_set_cr8() */ emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); - if (irqchip_in_kernel(svm->vcpu.kvm)) + if (irqchip_in_kernel(svm->vcpu.kvm)) { + svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; + return 1; + } + if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) return 1; kvm_run->exit_reason = KVM_EXIT_SET_TPR; return 0; @@ -2090,8 +2152,9 @@ static int interrupt_window_interception(struct vcpu_svm *svm, * If the user space waits to inject interrupts, exit as soon as * possible */ - if (kvm_run->request_interrupt_window && - !svm->vcpu.arch.irq_summary) { + if (!irqchip_in_kernel(svm->vcpu.kvm) && + kvm_run->request_interrupt_window && + !kvm_cpu_has_interrupt(&svm->vcpu)) { ++svm->vcpu.stat.irq_window_exits; kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; return 0; @@ -2134,6 +2197,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_VINTR] = interrupt_window_interception, /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ [SVM_EXIT_CPUID] = cpuid_interception, + [SVM_EXIT_IRET] = iret_interception, [SVM_EXIT_INVD] = emulate_on_interception, [SVM_EXIT_HLT] = halt_interception, [SVM_EXIT_INVLPG] = invlpg_interception, @@ -2194,7 +2258,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } } - kvm_reput_irq(svm); if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; @@ -2205,7 +2268,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) if (is_external_interrupt(svm->vmcb->control.exit_int_info) && exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && - exit_code != SVM_EXIT_NPF) + exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH) printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " "exit_code 0x%x\n", __func__, svm->vmcb->control.exit_int_info, @@ -2237,11 +2300,20 @@ static void pre_svm_run(struct vcpu_svm *svm) struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; - if (svm->vcpu.cpu != cpu || - svm->asid_generation != svm_data->asid_generation) + /* FIXME: handle wraparound of asid_generation */ + if (svm->asid_generation != svm_data->asid_generation) new_asid(svm, svm_data); } +static void svm_inject_nmi(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; + vcpu->arch.hflags |= HF_NMI_MASK; + svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET); + ++vcpu->stat.nmi_injections; +} static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) { @@ -2257,134 +2329,71 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); } -static void svm_set_irq(struct kvm_vcpu *vcpu, int irq) +static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr) { struct vcpu_svm *svm = to_svm(vcpu); - nested_svm_intr(svm); - - svm_inject_irq(svm, irq); + svm->vmcb->control.event_inj = nr | + SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; } -static void update_cr8_intercept(struct kvm_vcpu *vcpu) +static void svm_set_irq(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb *vmcb = svm->vmcb; - int max_irr, tpr; - if (!irqchip_in_kernel(vcpu->kvm) || vcpu->arch.apic->vapic_addr) - return; + nested_svm_intr(svm); - vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; + svm_queue_irq(vcpu, vcpu->arch.interrupt.nr); +} - max_irr = kvm_lapic_find_highest_irr(vcpu); - if (max_irr == -1) - return; +static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) +{ + struct vcpu_svm *svm = to_svm(vcpu); - tpr = kvm_lapic_get_cr8(vcpu) << 4; + if (irr == -1) + return; - if (tpr >= (max_irr & 0xf0)) - vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; + if (tpr >= irr) + svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; } -static void svm_intr_assist(struct kvm_vcpu *vcpu) +static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - int intr_vector = -1; - - if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) && - ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) { - intr_vector = vmcb->control.exit_int_info & - SVM_EVTINJ_VEC_MASK; - vmcb->control.exit_int_info = 0; - svm_inject_irq(svm, intr_vector); - goto out; - } - - if (vmcb->control.int_ctl & V_IRQ_MASK) - goto out; - - if (!kvm_cpu_has_interrupt(vcpu)) - goto out; - - if (nested_svm_intr(svm)) - goto out; - - if (!(svm->vcpu.arch.hflags & HF_GIF_MASK)) - goto out; - - if (!(vmcb->save.rflags & X86_EFLAGS_IF) || - (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || - (vmcb->control.event_inj & SVM_EVTINJ_VALID)) { - /* unable to deliver irq, set pending irq */ - svm_set_vintr(svm); - svm_inject_irq(svm, 0x0); - goto out; - } - /* Okay, we can deliver the interrupt: grab it and update PIC state. */ - intr_vector = kvm_cpu_get_interrupt(vcpu); - svm_inject_irq(svm, intr_vector); -out: - update_cr8_intercept(vcpu); + return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && + !(svm->vcpu.arch.hflags & HF_NMI_MASK); } -static void kvm_reput_irq(struct vcpu_svm *svm) +static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) { - struct vmcb_control_area *control = &svm->vmcb->control; - - if ((control->int_ctl & V_IRQ_MASK) - && !irqchip_in_kernel(svm->vcpu.kvm)) { - control->int_ctl &= ~V_IRQ_MASK; - push_irq(&svm->vcpu, control->int_vector); - } - - svm->vcpu.arch.interrupt_window_open = - !(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && - (svm->vcpu.arch.hflags & HF_GIF_MASK); + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; + return (vmcb->save.rflags & X86_EFLAGS_IF) && + !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && + (svm->vcpu.arch.hflags & HF_GIF_MASK); } -static void svm_do_inject_vector(struct vcpu_svm *svm) +static void enable_irq_window(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &svm->vcpu; - int word_index = __ffs(vcpu->arch.irq_summary); - int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); - int irq = word_index * BITS_PER_LONG + bit_index; - - clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); - if (!vcpu->arch.irq_pending[word_index]) - clear_bit(word_index, &vcpu->arch.irq_summary); - svm_inject_irq(svm, irq); + svm_set_vintr(to_svm(vcpu)); + svm_inject_irq(to_svm(vcpu), 0x0); } -static void do_interrupt_requests(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static void enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb_control_area *control = &svm->vmcb->control; - - if (nested_svm_intr(svm)) - return; - svm->vcpu.arch.interrupt_window_open = - (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && - (svm->vmcb->save.rflags & X86_EFLAGS_IF) && - (svm->vcpu.arch.hflags & HF_GIF_MASK)); + if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) + == HF_NMI_MASK) + return; /* IRET will cause a vm exit */ - if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary) - /* - * If interrupts enabled, and not blocked by sti or mov ss. Good. - */ - svm_do_inject_vector(svm); - - /* - * Interrupts blocked. Wait for unblock. - */ - if (!svm->vcpu.arch.interrupt_window_open && - (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window)) - svm_set_vintr(svm); - else - svm_clear_vintr(svm); + /* Something prevents NMI from been injected. Single step over + possible problem (IRET or exception injection or interrupt + shadow) */ + vcpu->arch.singlestep = true; + svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); + update_db_intercept(vcpu); } static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) @@ -2407,7 +2416,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; - kvm_lapic_set_tpr(vcpu, cr8); + kvm_set_cr8(vcpu, cr8); } } @@ -2416,14 +2425,54 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); u64 cr8; - if (!irqchip_in_kernel(vcpu->kvm)) - return; - cr8 = kvm_get_cr8(vcpu); svm->vmcb->control.int_ctl &= ~V_TPR_MASK; svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; } +static void svm_complete_interrupts(struct vcpu_svm *svm) +{ + u8 vector; + int type; + u32 exitintinfo = svm->vmcb->control.exit_int_info; + + if (svm->vcpu.arch.hflags & HF_IRET_MASK) + svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); + + svm->vcpu.arch.nmi_injected = false; + kvm_clear_exception_queue(&svm->vcpu); + kvm_clear_interrupt_queue(&svm->vcpu); + + if (!(exitintinfo & SVM_EXITINTINFO_VALID)) + return; + + vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; + type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; + + switch (type) { + case SVM_EXITINTINFO_TYPE_NMI: + svm->vcpu.arch.nmi_injected = true; + break; + case SVM_EXITINTINFO_TYPE_EXEPT: + /* In case of software exception do not reinject an exception + vector, but re-execute and instruction instead */ + if (kvm_exception_is_soft(vector)) + break; + if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { + u32 err = svm->vmcb->control.exit_int_info_err; + kvm_queue_exception_e(&svm->vcpu, vector, err); + + } else + kvm_queue_exception(&svm->vcpu, vector); + break; + case SVM_EXITINTINFO_TYPE_INTR: + kvm_queue_interrupt(&svm->vcpu, vector, false); + break; + default: + break; + } +} + #ifdef CONFIG_X86_64 #define R "r" #else @@ -2552,6 +2601,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) sync_cr8_to_lapic(vcpu); svm->next_rip = 0; + + svm_complete_interrupts(svm); } #undef R @@ -2617,7 +2668,7 @@ static int get_npt_level(void) #endif } -static int svm_get_mt_mask_shift(void) +static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { return 0; } @@ -2667,17 +2718,21 @@ static struct kvm_x86_ops svm_x86_ops = { .run = svm_vcpu_run, .handle_exit = handle_exit, .skip_emulated_instruction = skip_emulated_instruction, + .set_interrupt_shadow = svm_set_interrupt_shadow, + .get_interrupt_shadow = svm_get_interrupt_shadow, .patch_hypercall = svm_patch_hypercall, - .get_irq = svm_get_irq, .set_irq = svm_set_irq, + .set_nmi = svm_inject_nmi, .queue_exception = svm_queue_exception, - .exception_injected = svm_exception_injected, - .inject_pending_irq = svm_intr_assist, - .inject_pending_vectors = do_interrupt_requests, + .interrupt_allowed = svm_interrupt_allowed, + .nmi_allowed = svm_nmi_allowed, + .enable_nmi_window = enable_nmi_window, + .enable_irq_window = enable_irq_window, + .update_cr8_intercept = update_cr8_intercept, .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, - .get_mt_mask_shift = svm_get_mt_mask_shift, + .get_mt_mask = svm_get_mt_mask, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c new file mode 100644 index 00000000000..86dbac072d0 --- /dev/null +++ b/arch/x86/kvm/timer.c @@ -0,0 +1,46 @@ +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/hrtimer.h> +#include <asm/atomic.h> +#include "kvm_timer.h" + +static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) +{ + int restart_timer = 0; + wait_queue_head_t *q = &vcpu->wq; + + /* FIXME: this code should not know anything about vcpus */ + if (!atomic_inc_and_test(&ktimer->pending)) + set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + + if (!ktimer->reinject) + atomic_set(&ktimer->pending, 1); + + if (waitqueue_active(q)) + wake_up_interruptible(q); + + if (ktimer->t_ops->is_periodic(ktimer)) { + hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); + restart_timer = 1; + } + + return restart_timer; +} + +enum hrtimer_restart kvm_timer_fn(struct hrtimer *data) +{ + int restart_timer; + struct kvm_vcpu *vcpu; + struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); + + vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id]; + if (!vcpu) + return HRTIMER_NORESTART; + + restart_timer = __kvm_timer_fn(vcpu, ktimer); + if (restart_timer) + return HRTIMER_RESTART; + else + return HRTIMER_NORESTART; +} + diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bb481330716..29f912927a5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -32,26 +32,27 @@ #include <asm/desc.h> #include <asm/vmx.h> #include <asm/virtext.h> +#include <asm/mce.h> #define __ex(x) __kvm_handle_fault_on_reboot(x) MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); -static int bypass_guest_pf = 1; -module_param(bypass_guest_pf, bool, 0); +static int __read_mostly bypass_guest_pf = 1; +module_param(bypass_guest_pf, bool, S_IRUGO); -static int enable_vpid = 1; -module_param(enable_vpid, bool, 0); +static int __read_mostly enable_vpid = 1; +module_param_named(vpid, enable_vpid, bool, 0444); -static int flexpriority_enabled = 1; -module_param(flexpriority_enabled, bool, 0); +static int __read_mostly flexpriority_enabled = 1; +module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); -static int enable_ept = 1; -module_param(enable_ept, bool, 0); +static int __read_mostly enable_ept = 1; +module_param_named(ept, enable_ept, bool, S_IRUGO); -static int emulate_invalid_guest_state = 0; -module_param(emulate_invalid_guest_state, bool, 0); +static int __read_mostly emulate_invalid_guest_state = 0; +module_param(emulate_invalid_guest_state, bool, S_IRUGO); struct vmcs { u32 revision_id; @@ -97,6 +98,7 @@ struct vcpu_vmx { int soft_vnmi_blocked; ktime_t entry_time; s64 vnmi_blocked_time; + u32 exit_reason; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -111,9 +113,10 @@ static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); -static struct page *vmx_io_bitmap_a; -static struct page *vmx_io_bitmap_b; -static struct page *vmx_msr_bitmap; +static unsigned long *vmx_io_bitmap_a; +static unsigned long *vmx_io_bitmap_b; +static unsigned long *vmx_msr_bitmap_legacy; +static unsigned long *vmx_msr_bitmap_longmode; static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); static DEFINE_SPINLOCK(vmx_vpid_lock); @@ -213,70 +216,78 @@ static inline int is_external_interrupt(u32 intr_info) == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); } +static inline int is_machine_check(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); +} + static inline int cpu_has_vmx_msr_bitmap(void) { - return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS); + return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; } static inline int cpu_has_vmx_tpr_shadow(void) { - return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW); + return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; } static inline int vm_need_tpr_shadow(struct kvm *kvm) { - return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); + return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); } static inline int cpu_has_secondary_exec_ctrls(void) { - return (vmcs_config.cpu_based_exec_ctrl & - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); + return vmcs_config.cpu_based_exec_ctrl & + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; } static inline bool cpu_has_vmx_virtualize_apic_accesses(void) { - return flexpriority_enabled - && (vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; +} + +static inline bool cpu_has_vmx_flexpriority(void) +{ + return cpu_has_vmx_tpr_shadow() && + cpu_has_vmx_virtualize_apic_accesses(); } static inline int cpu_has_vmx_invept_individual_addr(void) { - return (!!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT)); + return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); } static inline int cpu_has_vmx_invept_context(void) { - return (!!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT)); + return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT); } static inline int cpu_has_vmx_invept_global(void) { - return (!!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT)); + return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT); } static inline int cpu_has_vmx_ept(void) { - return (vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_ENABLE_EPT); -} - -static inline int vm_need_ept(void) -{ - return (cpu_has_vmx_ept() && enable_ept); + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_EPT; } static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) { - return ((cpu_has_vmx_virtualize_apic_accesses()) && - (irqchip_in_kernel(kvm))); + return flexpriority_enabled && + (cpu_has_vmx_virtualize_apic_accesses()) && + (irqchip_in_kernel(kvm)); } static inline int cpu_has_vmx_vpid(void) { - return (vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_ENABLE_VPID); + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_VPID; } static inline int cpu_has_virtual_nmis(void) @@ -284,6 +295,11 @@ static inline int cpu_has_virtual_nmis(void) return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; } +static inline bool report_flexpriority(void) +{ + return flexpriority_enabled; +} + static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -381,7 +397,7 @@ static inline void ept_sync_global(void) static inline void ept_sync_context(u64 eptp) { - if (vm_need_ept()) { + if (enable_ept) { if (cpu_has_vmx_invept_context()) __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); else @@ -391,7 +407,7 @@ static inline void ept_sync_context(u64 eptp) static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) { - if (vm_need_ept()) { + if (enable_ept) { if (cpu_has_vmx_invept_individual_addr()) __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR, eptp, gpa); @@ -478,7 +494,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; - eb = (1u << PF_VECTOR) | (1u << UD_VECTOR); + eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); if (!vcpu->fpu_active) eb |= 1u << NM_VECTOR; if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { @@ -488,9 +504,9 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) eb |= 1u << BP_VECTOR; } - if (vcpu->arch.rmode.active) + if (vcpu->arch.rmode.vm86_active) eb = ~0; - if (vm_need_ept()) + if (enable_ept) eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -724,29 +740,50 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - if (vcpu->arch.rmode.active) + if (vcpu->arch.rmode.vm86_active) rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; vmcs_writel(GUEST_RFLAGS, rflags); } +static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +{ + u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + int ret = 0; + + if (interruptibility & GUEST_INTR_STATE_STI) + ret |= X86_SHADOW_INT_STI; + if (interruptibility & GUEST_INTR_STATE_MOV_SS) + ret |= X86_SHADOW_INT_MOV_SS; + + return ret & mask; +} + +static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +{ + u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + u32 interruptibility = interruptibility_old; + + interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); + + if (mask & X86_SHADOW_INT_MOV_SS) + interruptibility |= GUEST_INTR_STATE_MOV_SS; + if (mask & X86_SHADOW_INT_STI) + interruptibility |= GUEST_INTR_STATE_STI; + + if ((interruptibility != interruptibility_old)) + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); +} + static void skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rip; - u32 interruptibility; rip = kvm_rip_read(vcpu); rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); kvm_rip_write(vcpu, rip); - /* - * We emulated an instruction, so temporary interrupt blocking - * should be removed, if set. - */ - interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - if (interruptibility & 3) - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, - interruptibility & ~3); - vcpu->arch.interrupt_window_open = 1; + /* skipping an emulated instruction also counts */ + vmx_set_interrupt_shadow(vcpu, 0); } static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, @@ -760,7 +797,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, intr_info |= INTR_INFO_DELIVER_CODE_MASK; } - if (vcpu->arch.rmode.active) { + if (vcpu->arch.rmode.vm86_active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = nr; vmx->rmode.irq.rip = kvm_rip_read(vcpu); @@ -773,8 +810,9 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, return; } - if (nr == BP_VECTOR || nr == OF_VECTOR) { - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); + if (kvm_exception_is_soft(nr)) { + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmx->vcpu.arch.event_exit_inst_len); intr_info |= INTR_TYPE_SOFT_EXCEPTION; } else intr_info |= INTR_TYPE_HARD_EXCEPTION; @@ -782,11 +820,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); } -static bool vmx_exception_injected(struct kvm_vcpu *vcpu) -{ - return false; -} - /* * Swap MSR entry in host/guest MSR entry array. */ @@ -812,6 +845,7 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) static void setup_msrs(struct vcpu_vmx *vmx) { int save_nmsrs; + unsigned long *msr_bitmap; vmx_load_host_state(vmx); save_nmsrs = 0; @@ -847,6 +881,15 @@ static void setup_msrs(struct vcpu_vmx *vmx) __find_msr_index(vmx, MSR_KERNEL_GS_BASE); #endif vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER); + + if (cpu_has_vmx_msr_bitmap()) { + if (is_long_mode(&vmx->vcpu)) + msr_bitmap = vmx_msr_bitmap_longmode; + else + msr_bitmap = vmx_msr_bitmap_legacy; + + vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); + } } /* @@ -1034,13 +1077,6 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) return 0; } -static int vmx_get_irq(struct kvm_vcpu *vcpu) -{ - if (!vcpu->arch.interrupt.pending) - return -1; - return vcpu->arch.interrupt.nr; -} - static __init int cpu_has_kvm_support(void) { return cpu_has_vmx(); @@ -1241,7 +1277,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) struct page *pages; struct vmcs *vmcs; - pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); + pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order); if (!pages) return NULL; vmcs = page_address(pages); @@ -1294,6 +1330,18 @@ static __init int hardware_setup(void) if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); + if (!cpu_has_vmx_vpid()) + enable_vpid = 0; + + if (!cpu_has_vmx_ept()) + enable_ept = 0; + + if (!cpu_has_vmx_flexpriority()) + flexpriority_enabled = 0; + + if (!cpu_has_vmx_tpr_shadow()) + kvm_x86_ops->update_cr8_intercept = NULL; + return alloc_kvm_area(); } @@ -1324,7 +1372,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); vmx->emulation_required = 1; - vcpu->arch.rmode.active = 0; + vcpu->arch.rmode.vm86_active = 0; vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); @@ -1386,7 +1434,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); vmx->emulation_required = 1; - vcpu->arch.rmode.active = 1; + vcpu->arch.rmode.vm86_active = 1; vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); @@ -1485,7 +1533,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu) static void vmx_flush_tlb(struct kvm_vcpu *vcpu) { vpid_sync_vcpu_all(to_vmx(vcpu)); - if (vm_need_ept()) + if (enable_ept) ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); } @@ -1555,10 +1603,10 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmx_fpu_deactivate(vcpu); - if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE)) + if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); - if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE)) + if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE)) enter_rmode(vcpu); #ifdef CONFIG_X86_64 @@ -1570,7 +1618,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif - if (vm_need_ept()) + if (enable_ept) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); vmcs_writel(CR0_READ_SHADOW, cr0); @@ -1599,7 +1647,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) u64 eptp; guest_cr3 = cr3; - if (vm_need_ept()) { + if (enable_ept) { eptp = construct_eptp(cr3); vmcs_write64(EPT_POINTER, eptp); ept_sync_context(eptp); @@ -1616,11 +1664,11 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.active ? + unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ? KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); vcpu->arch.cr4 = cr4; - if (vm_need_ept()) + if (enable_ept) ept_update_paging_mode_cr4(&hw_cr4, vcpu); vmcs_writel(CR4_READ_SHADOW, cr4); @@ -1699,7 +1747,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; u32 ar; - if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) { + if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) { vcpu->arch.rmode.tr.selector = var->selector; vcpu->arch.rmode.tr.base = var->base; vcpu->arch.rmode.tr.limit = var->limit; @@ -1709,7 +1757,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, vmcs_writel(sf->base, var->base); vmcs_write32(sf->limit, var->limit); vmcs_write16(sf->selector, var->selector); - if (vcpu->arch.rmode.active && var->s) { + if (vcpu->arch.rmode.vm86_active && var->s) { /* * Hack real-mode segments into vm86 compatibility. */ @@ -1982,7 +2030,7 @@ static int init_rmode_identity_map(struct kvm *kvm) pfn_t identity_map_pfn; u32 tmp; - if (!vm_need_ept()) + if (!enable_ept) return 1; if (unlikely(!kvm->arch.ept_identity_pagetable)) { printk(KERN_ERR "EPT: identity-mapping pagetable " @@ -2071,7 +2119,7 @@ static void allocate_vpid(struct vcpu_vmx *vmx) int vpid; vmx->vpid = 0; - if (!enable_vpid || !cpu_has_vmx_vpid()) + if (!enable_vpid) return; spin_lock(&vmx_vpid_lock); vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); @@ -2082,9 +2130,9 @@ static void allocate_vpid(struct vcpu_vmx *vmx) spin_unlock(&vmx_vpid_lock); } -static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) +static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) { - void *va; + int f = sizeof(unsigned long); if (!cpu_has_vmx_msr_bitmap()) return; @@ -2094,16 +2142,21 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) * have the write-low and read-high bitmap offsets the wrong way round. * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. */ - va = kmap(msr_bitmap); if (msr <= 0x1fff) { - __clear_bit(msr, va + 0x000); /* read-low */ - __clear_bit(msr, va + 0x800); /* write-low */ + __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */ + __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { msr &= 0x1fff; - __clear_bit(msr, va + 0x400); /* read-high */ - __clear_bit(msr, va + 0xc00); /* write-high */ + __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */ + __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */ } - kunmap(msr_bitmap); +} + +static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) +{ + if (!longmode_only) + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr); } /* @@ -2121,11 +2174,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) u32 exec_control; /* I/O */ - vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); - vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); + vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); + vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b)); if (cpu_has_vmx_msr_bitmap()) - vmcs_write64(MSR_BITMAP, page_to_phys(vmx_msr_bitmap)); + vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ @@ -2141,7 +2194,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) CPU_BASED_CR8_LOAD_EXITING; #endif } - if (!vm_need_ept()) + if (!enable_ept) exec_control |= CPU_BASED_CR3_STORE_EXITING | CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_INVLPG_EXITING; @@ -2154,7 +2207,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; - if (!vm_need_ept()) + if (!enable_ept) exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -2273,7 +2326,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) goto out; } - vmx->vcpu.arch.rmode.active = 0; + vmx->vcpu.arch.rmode.vm86_active = 0; vmx->soft_vnmi_blocked = 0; @@ -2402,14 +2455,16 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); } -static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) +static void vmx_inject_irq(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + uint32_t intr; + int irq = vcpu->arch.interrupt.nr; KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); ++vcpu->stat.irq_injections; - if (vcpu->arch.rmode.active) { + if (vcpu->arch.rmode.vm86_active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = irq; vmx->rmode.irq.rip = kvm_rip_read(vcpu); @@ -2419,8 +2474,14 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); return; } - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); + intr = irq | INTR_INFO_VALID_MASK; + if (vcpu->arch.interrupt.soft) { + intr |= INTR_TYPE_SOFT_INTR; + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmx->vcpu.arch.event_exit_inst_len); + } else + intr |= INTR_TYPE_EXT_INTR; + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); } static void vmx_inject_nmi(struct kvm_vcpu *vcpu) @@ -2441,7 +2502,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) } ++vcpu->stat.nmi_injections; - if (vcpu->arch.rmode.active) { + if (vcpu->arch.rmode.vm86_active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = NMI_VECTOR; vmx->rmode.irq.rip = kvm_rip_read(vcpu); @@ -2456,76 +2517,21 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); } -static void vmx_update_window_states(struct kvm_vcpu *vcpu) +static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) { - u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - - vcpu->arch.nmi_window_open = - !(guest_intr & (GUEST_INTR_STATE_STI | - GUEST_INTR_STATE_MOV_SS | - GUEST_INTR_STATE_NMI)); if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) - vcpu->arch.nmi_window_open = 0; - - vcpu->arch.interrupt_window_open = - ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && - !(guest_intr & (GUEST_INTR_STATE_STI | - GUEST_INTR_STATE_MOV_SS))); -} - -static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) -{ - int word_index = __ffs(vcpu->arch.irq_summary); - int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); - int irq = word_index * BITS_PER_LONG + bit_index; + return 0; - clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); - if (!vcpu->arch.irq_pending[word_index]) - clear_bit(word_index, &vcpu->arch.irq_summary); - kvm_queue_interrupt(vcpu, irq); + return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS | + GUEST_INTR_STATE_NMI)); } -static void do_interrupt_requests(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { - vmx_update_window_states(vcpu); - - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_STI | - GUEST_INTR_STATE_MOV_SS); - - if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { - if (vcpu->arch.interrupt.pending) { - enable_nmi_window(vcpu); - } else if (vcpu->arch.nmi_window_open) { - vcpu->arch.nmi_pending = false; - vcpu->arch.nmi_injected = true; - } else { - enable_nmi_window(vcpu); - return; - } - } - if (vcpu->arch.nmi_injected) { - vmx_inject_nmi(vcpu); - if (vcpu->arch.nmi_pending) - enable_nmi_window(vcpu); - else if (vcpu->arch.irq_summary - || kvm_run->request_interrupt_window) - enable_irq_window(vcpu); - return; - } - - if (vcpu->arch.interrupt_window_open) { - if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) - kvm_do_inject_irq(vcpu); - - if (vcpu->arch.interrupt.pending) - vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); - } - if (!vcpu->arch.interrupt_window_open && - (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) - enable_irq_window(vcpu); + return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); } static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) @@ -2585,6 +2591,31 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, return 0; } +/* + * Trigger machine check on the host. We assume all the MSRs are already set up + * by the CPU and that we still run on the same CPU as the MCE occurred on. + * We pass a fake environment to the machine check handler because we want + * the guest to be always treated like user space, no matter what context + * it used internally. + */ +static void kvm_machine_check(void) +{ +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) + struct pt_regs regs = { + .cs = 3, /* Fake ring 3 no matter what the guest ran on */ + .flags = X86_EFLAGS_IF, + }; + + do_machine_check(®s, 0); +#endif +} + +static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + /* already handled by vcpu_run */ + return 1; +} + static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2596,17 +2627,14 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vect_info = vmx->idt_vectoring_info; intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + if (is_machine_check(intr_info)) + return handle_machine_check(vcpu, kvm_run); + if ((vect_info & VECTORING_INFO_VALID_MASK) && !is_page_fault(intr_info)) printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " "intr info 0x%x\n", __func__, vect_info, intr_info); - if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { - int irq = vect_info & VECTORING_INFO_VECTOR_MASK; - set_bit(irq, vcpu->arch.irq_pending); - set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); - } - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) return 1; /* already handled by vmx_vcpu_run() */ @@ -2628,17 +2656,17 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (is_page_fault(intr_info)) { /* EPT won't cause page fault directly */ - if (vm_need_ept()) + if (enable_ept) BUG(); cr2 = vmcs_readl(EXIT_QUALIFICATION); KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, (u32)((u64)cr2 >> 32), handler); - if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending) + if (kvm_event_needs_reinjection(vcpu)) kvm_mmu_unprotect_page_virt(vcpu, cr2); return kvm_mmu_page_fault(vcpu, cr2, error_code); } - if (vcpu->arch.rmode.active && + if (vcpu->arch.rmode.vm86_active && handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, error_code)) { if (vcpu->arch.halt_request) { @@ -2753,13 +2781,18 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); skip_emulated_instruction(vcpu); return 1; - case 8: - kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg)); - skip_emulated_instruction(vcpu); - if (irqchip_in_kernel(vcpu->kvm)) - return 1; - kvm_run->exit_reason = KVM_EXIT_SET_TPR; - return 0; + case 8: { + u8 cr8_prev = kvm_get_cr8(vcpu); + u8 cr8 = kvm_register_read(vcpu, reg); + kvm_set_cr8(vcpu, cr8); + skip_emulated_instruction(vcpu); + if (irqchip_in_kernel(vcpu->kvm)) + return 1; + if (cr8_prev <= cr8) + return 1; + kvm_run->exit_reason = KVM_EXIT_SET_TPR; + return 0; + } }; break; case 2: /* clts */ @@ -2957,8 +2990,9 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, * If the user space waits to inject interrupts, exit as soon as * possible */ - if (kvm_run->request_interrupt_window && - !vcpu->arch.irq_summary) { + if (!irqchip_in_kernel(vcpu->kvm) && + kvm_run->request_interrupt_window && + !kvm_cpu_has_interrupt(vcpu)) { kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; return 0; } @@ -2978,9 +3012,15 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } +static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); kvm_mmu_invlpg(vcpu, exit_qualification); skip_emulated_instruction(vcpu); @@ -2996,11 +3036,11 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - u64 exit_qualification; + unsigned long exit_qualification; enum emulation_result er; unsigned long offset; - exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); offset = exit_qualification & 0xffful; er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); @@ -3019,22 +3059,41 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification; u16 tss_selector; - int reason; + int reason, type, idt_v; + + idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); + type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); exit_qualification = vmcs_readl(EXIT_QUALIFICATION); reason = (u32)exit_qualification >> 30; - if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected && - (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && - (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK) - == INTR_TYPE_NMI_INTR) { - vcpu->arch.nmi_injected = false; - if (cpu_has_virtual_nmis()) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); + if (reason == TASK_SWITCH_GATE && idt_v) { + switch (type) { + case INTR_TYPE_NMI_INTR: + vcpu->arch.nmi_injected = false; + if (cpu_has_virtual_nmis()) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + break; + case INTR_TYPE_EXT_INTR: + case INTR_TYPE_SOFT_INTR: + kvm_clear_interrupt_queue(vcpu); + break; + case INTR_TYPE_HARD_EXCEPTION: + case INTR_TYPE_SOFT_EXCEPTION: + kvm_clear_exception_queue(vcpu); + break; + default: + break; + } } tss_selector = exit_qualification; + if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && + type != INTR_TYPE_EXT_INTR && + type != INTR_TYPE_NMI_INTR)) + skip_emulated_instruction(vcpu); + if (!kvm_task_switch(vcpu, tss_selector, reason)) return 0; @@ -3051,11 +3110,11 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - u64 exit_qualification; + unsigned long exit_qualification; gpa_t gpa; int gla_validity; - exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); if (exit_qualification & (1 << 6)) { printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); @@ -3067,7 +3126,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), - (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS)); + vmcs_readl(GUEST_LINEAR_ADDRESS)); printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", (long unsigned int)exit_qualification); kvm_run->exit_reason = KVM_EXIT_UNKNOWN; @@ -3098,8 +3157,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx = to_vmx(vcpu); enum emulation_result err = EMULATE_DONE; - preempt_enable(); local_irq_enable(); + preempt_enable(); while (!guest_state_valid(vcpu)) { err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); @@ -3109,7 +3168,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, if (err != EMULATE_DONE) { kvm_report_emulation_failure(vcpu, "emulation failure"); - return; + break; } if (signal_pending(current)) @@ -3118,8 +3177,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, schedule(); } - local_irq_disable(); preempt_disable(); + local_irq_disable(); vmx->invalid_state_emulation_result = err; } @@ -3145,11 +3204,21 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_HLT] = handle_halt, [EXIT_REASON_INVLPG] = handle_invlpg, [EXIT_REASON_VMCALL] = handle_vmcall, + [EXIT_REASON_VMCLEAR] = handle_vmx_insn, + [EXIT_REASON_VMLAUNCH] = handle_vmx_insn, + [EXIT_REASON_VMPTRLD] = handle_vmx_insn, + [EXIT_REASON_VMPTRST] = handle_vmx_insn, + [EXIT_REASON_VMREAD] = handle_vmx_insn, + [EXIT_REASON_VMRESUME] = handle_vmx_insn, + [EXIT_REASON_VMWRITE] = handle_vmx_insn, + [EXIT_REASON_VMOFF] = handle_vmx_insn, + [EXIT_REASON_VMON] = handle_vmx_insn, [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_WBINVD] = handle_wbinvd, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, + [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, }; static const int kvm_vmx_max_exit_handlers = @@ -3159,10 +3228,10 @@ static const int kvm_vmx_max_exit_handlers = * The guest has exited. See if we can fix it or if we need userspace * assistance. */ -static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { - u32 exit_reason = vmcs_read32(VM_EXIT_REASON); struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), @@ -3178,7 +3247,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) /* Access CR3 don't cause VMExit in paging mode, so we need * to sync with guest real CR3. */ - if (vm_need_ept() && is_paging(vcpu)) { + if (enable_ept && is_paging(vcpu)) { vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); ept_load_pdptrs(vcpu); } @@ -3199,9 +3268,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) __func__, vectoring_info, exit_reason); if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { - if (vcpu->arch.interrupt_window_open) { + if (vmx_interrupt_allowed(vcpu)) { vmx->soft_vnmi_blocked = 0; - vcpu->arch.nmi_window_open = 1; } else if (vmx->vnmi_blocked_time > 1000000000LL && vcpu->arch.nmi_pending) { /* @@ -3214,7 +3282,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) "state on VCPU %d after 1 s timeout\n", __func__, vcpu->vcpu_id); vmx->soft_vnmi_blocked = 0; - vmx->vcpu.arch.nmi_window_open = 1; } } @@ -3228,122 +3295,107 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return 0; } -static void update_tpr_threshold(struct kvm_vcpu *vcpu) +static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { - int max_irr, tpr; - - if (!vm_need_tpr_shadow(vcpu->kvm)) - return; - - if (!kvm_lapic_enabled(vcpu) || - ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) { + if (irr == -1 || tpr < irr) { vmcs_write32(TPR_THRESHOLD, 0); return; } - tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4; - vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4); + vmcs_write32(TPR_THRESHOLD, irr); } static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { u32 exit_intr_info; - u32 idt_vectoring_info; + u32 idt_vectoring_info = vmx->idt_vectoring_info; bool unblock_nmi; u8 vector; int type; bool idtv_info_valid; - u32 error; exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + + /* Handle machine checks before interrupts are enabled */ + if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) + || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI + && is_machine_check(exit_intr_info))) + kvm_machine_check(); + + /* We need to handle NMIs before interrupts are enabled */ + if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && + (exit_intr_info & INTR_INFO_VALID_MASK)) { + KVMTRACE_0D(NMI, &vmx->vcpu, handler); + asm("int $2"); + } + + idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; + if (cpu_has_virtual_nmis()) { unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; vector = exit_intr_info & INTR_INFO_VECTOR_MASK; /* - * SDM 3: 25.7.1.2 + * SDM 3: 27.7.1.2 (September 2008) * Re-set bit "block by NMI" before VM entry if vmexit caused by * a guest IRET fault. + * SDM 3: 23.2.2 (September 2008) + * Bit 12 is undefined in any of the following cases: + * If the VM exit sets the valid bit in the IDT-vectoring + * information field. + * If the VM exit is due to a double fault. */ - if (unblock_nmi && vector != DF_VECTOR) + if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && + vector != DF_VECTOR && !idtv_info_valid) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); } else if (unlikely(vmx->soft_vnmi_blocked)) vmx->vnmi_blocked_time += ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); - idt_vectoring_info = vmx->idt_vectoring_info; - idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; + vmx->vcpu.arch.nmi_injected = false; + kvm_clear_exception_queue(&vmx->vcpu); + kvm_clear_interrupt_queue(&vmx->vcpu); + + if (!idtv_info_valid) + return; + vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; - if (vmx->vcpu.arch.nmi_injected) { + + switch (type) { + case INTR_TYPE_NMI_INTR: + vmx->vcpu.arch.nmi_injected = true; /* - * SDM 3: 25.7.1.2 - * Clear bit "block by NMI" before VM entry if a NMI delivery - * faulted. + * SDM 3: 27.7.1.2 (September 2008) + * Clear bit "block by NMI" before VM entry if a NMI + * delivery faulted. */ - if (idtv_info_valid && type == INTR_TYPE_NMI_INTR) - vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - else - vmx->vcpu.arch.nmi_injected = false; - } - kvm_clear_exception_queue(&vmx->vcpu); - if (idtv_info_valid && (type == INTR_TYPE_HARD_EXCEPTION || - type == INTR_TYPE_SOFT_EXCEPTION)) { + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + break; + case INTR_TYPE_SOFT_EXCEPTION: + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + /* fall through */ + case INTR_TYPE_HARD_EXCEPTION: if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { - error = vmcs_read32(IDT_VECTORING_ERROR_CODE); - kvm_queue_exception_e(&vmx->vcpu, vector, error); + u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE); + kvm_queue_exception_e(&vmx->vcpu, vector, err); } else kvm_queue_exception(&vmx->vcpu, vector); - vmx->idt_vectoring_info = 0; - } - kvm_clear_interrupt_queue(&vmx->vcpu); - if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) { - kvm_queue_interrupt(&vmx->vcpu, vector); - vmx->idt_vectoring_info = 0; - } -} - -static void vmx_intr_assist(struct kvm_vcpu *vcpu) -{ - update_tpr_threshold(vcpu); - - vmx_update_window_states(vcpu); - - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_STI | - GUEST_INTR_STATE_MOV_SS); - - if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { - if (vcpu->arch.interrupt.pending) { - enable_nmi_window(vcpu); - } else if (vcpu->arch.nmi_window_open) { - vcpu->arch.nmi_pending = false; - vcpu->arch.nmi_injected = true; - } else { - enable_nmi_window(vcpu); - return; - } - } - if (vcpu->arch.nmi_injected) { - vmx_inject_nmi(vcpu); - if (vcpu->arch.nmi_pending) - enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu)) - enable_irq_window(vcpu); - return; - } - if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) { - if (vcpu->arch.interrupt_window_open) - kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); - else - enable_irq_window(vcpu); - } - if (vcpu->arch.interrupt.pending) { - vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); - if (kvm_cpu_has_interrupt(vcpu)) - enable_irq_window(vcpu); + break; + case INTR_TYPE_SOFT_INTR: + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + /* fall through */ + case INTR_TYPE_EXT_INTR: + kvm_queue_interrupt(&vmx->vcpu, vector, + type == INTR_TYPE_SOFT_INTR); + break; + default: + break; } } @@ -3381,7 +3433,6 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx) static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 intr_info; /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) @@ -3505,20 +3556,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vmx->rmode.irq.pending) fixup_rmode_irq(vmx); - vmx_update_window_states(vcpu); - asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->launched = 1; - intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - /* We need to handle NMIs before interrupts are enabled */ - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && - (intr_info & INTR_INFO_VALID_MASK)) { - KVMTRACE_0D(NMI, vcpu, handler); - asm("int $2"); - } - vmx_complete_interrupts(vmx); } @@ -3593,7 +3633,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (alloc_apic_access_page(kvm) != 0) goto free_vmcs; - if (vm_need_ept()) + if (enable_ept) if (alloc_identity_pagetable(kvm) != 0) goto free_vmcs; @@ -3631,9 +3671,32 @@ static int get_ept_level(void) return VMX_EPT_DEFAULT_GAW + 1; } -static int vmx_get_mt_mask_shift(void) +static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { - return VMX_EPT_MT_EPTE_SHIFT; + u64 ret; + + /* For VT-d and EPT combination + * 1. MMIO: always map as UC + * 2. EPT with VT-d: + * a. VT-d without snooping control feature: can't guarantee the + * result, try to trust guest. + * b. VT-d with snooping control feature: snooping control feature of + * VT-d engine can guarantee the cache correctness. Just set it + * to WB to keep consistent with host. So the same as item 3. + * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep + * consistent with host MTRR + */ + if (is_mmio) + ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; + else if (vcpu->kvm->arch.iommu_domain && + !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)) + ret = kvm_get_guest_memory_type(vcpu, gfn) << + VMX_EPT_MT_EPTE_SHIFT; + else + ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) + | VMX_EPT_IGMT_BIT; + + return ret; } static struct kvm_x86_ops vmx_x86_ops = { @@ -3644,7 +3707,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_processor_compatibility = vmx_check_processor_compat, .hardware_enable = hardware_enable, .hardware_disable = hardware_disable, - .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses, + .cpu_has_accelerated_tpr = report_flexpriority, .vcpu_create = vmx_create_vcpu, .vcpu_free = vmx_free_vcpu, @@ -3678,78 +3741,82 @@ static struct kvm_x86_ops vmx_x86_ops = { .tlb_flush = vmx_flush_tlb, .run = vmx_vcpu_run, - .handle_exit = kvm_handle_exit, + .handle_exit = vmx_handle_exit, .skip_emulated_instruction = skip_emulated_instruction, + .set_interrupt_shadow = vmx_set_interrupt_shadow, + .get_interrupt_shadow = vmx_get_interrupt_shadow, .patch_hypercall = vmx_patch_hypercall, - .get_irq = vmx_get_irq, .set_irq = vmx_inject_irq, + .set_nmi = vmx_inject_nmi, .queue_exception = vmx_queue_exception, - .exception_injected = vmx_exception_injected, - .inject_pending_irq = vmx_intr_assist, - .inject_pending_vectors = do_interrupt_requests, + .interrupt_allowed = vmx_interrupt_allowed, + .nmi_allowed = vmx_nmi_allowed, + .enable_nmi_window = enable_nmi_window, + .enable_irq_window = enable_irq_window, + .update_cr8_intercept = update_cr8_intercept, .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, - .get_mt_mask_shift = vmx_get_mt_mask_shift, + .get_mt_mask = vmx_get_mt_mask, }; static int __init vmx_init(void) { - void *va; int r; - vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_io_bitmap_a) return -ENOMEM; - vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_io_bitmap_b) { r = -ENOMEM; goto out; } - vmx_msr_bitmap = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); - if (!vmx_msr_bitmap) { + vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_legacy) { r = -ENOMEM; goto out1; } + vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_longmode) { + r = -ENOMEM; + goto out2; + } + /* * Allow direct access to the PC debug port (it is often used for I/O * delays, but the vmexits simply slow things down). */ - va = kmap(vmx_io_bitmap_a); - memset(va, 0xff, PAGE_SIZE); - clear_bit(0x80, va); - kunmap(vmx_io_bitmap_a); + memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); + clear_bit(0x80, vmx_io_bitmap_a); - va = kmap(vmx_io_bitmap_b); - memset(va, 0xff, PAGE_SIZE); - kunmap(vmx_io_bitmap_b); + memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); - va = kmap(vmx_msr_bitmap); - memset(va, 0xff, PAGE_SIZE); - kunmap(vmx_msr_bitmap); + memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); + memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); if (r) - goto out2; + goto out3; - vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_FS_BASE); - vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_GS_BASE); - vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_CS); - vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP); - vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP); + vmx_disable_intercept_for_msr(MSR_FS_BASE, false); + vmx_disable_intercept_for_msr(MSR_GS_BASE, false); + vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); - if (vm_need_ept()) { + if (enable_ept) { bypass_guest_pf = 0; kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK); kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, - VMX_EPT_EXECUTABLE_MASK, - VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); + VMX_EPT_EXECUTABLE_MASK); kvm_enable_tdp(); } else kvm_disable_tdp(); @@ -3761,20 +3828,23 @@ static int __init vmx_init(void) return 0; +out3: + free_page((unsigned long)vmx_msr_bitmap_longmode); out2: - __free_page(vmx_msr_bitmap); + free_page((unsigned long)vmx_msr_bitmap_legacy); out1: - __free_page(vmx_io_bitmap_b); + free_page((unsigned long)vmx_io_bitmap_b); out: - __free_page(vmx_io_bitmap_a); + free_page((unsigned long)vmx_io_bitmap_a); return r; } static void __exit vmx_exit(void) { - __free_page(vmx_msr_bitmap); - __free_page(vmx_io_bitmap_b); - __free_page(vmx_io_bitmap_a); + free_page((unsigned long)vmx_msr_bitmap_legacy); + free_page((unsigned long)vmx_msr_bitmap_longmode); + free_page((unsigned long)vmx_io_bitmap_b); + free_page((unsigned long)vmx_io_bitmap_a); kvm_exit(); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3944e917e79..3d452901182 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -91,7 +91,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "hypercalls", VCPU_STAT(hypercalls) }, { "request_irq", VCPU_STAT(request_irq_exits) }, - { "request_nmi", VCPU_STAT(request_nmi_exits) }, { "irq_exits", VCPU_STAT(irq_exits) }, { "host_state_reload", VCPU_STAT(host_state_reload) }, { "efer_reload", VCPU_STAT(efer_reload) }, @@ -108,7 +107,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_recycled", VM_STAT(mmu_recycled) }, { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, { "mmu_unsync", VM_STAT(mmu_unsync) }, - { "mmu_unsync_global", VM_STAT(mmu_unsync_global) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "largepages", VM_STAT(lpages) }, { NULL } @@ -234,7 +232,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) goto out; } for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { - if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { + if (is_present_pte(pdpte[i]) && + (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { ret = 0; goto out; } @@ -321,7 +320,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) kvm_x86_ops->set_cr0(vcpu, cr0); vcpu->arch.cr0 = cr0; - kvm_mmu_sync_global(vcpu); kvm_mmu_reset_context(vcpu); return; } @@ -370,7 +368,6 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) kvm_x86_ops->set_cr4(vcpu, cr4); vcpu->arch.cr4 = cr4; vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; - kvm_mmu_sync_global(vcpu); kvm_mmu_reset_context(vcpu); } EXPORT_SYMBOL_GPL(kvm_set_cr4); @@ -523,6 +520,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) efer |= vcpu->arch.shadow_efer & EFER_LMA; vcpu->arch.shadow_efer = efer; + + vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; + kvm_mmu_reset_context(vcpu); } void kvm_enable_efer_bits(u64 mask) @@ -630,14 +630,17 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) unsigned long flags; struct kvm_vcpu_arch *vcpu = &v->arch; void *shared_kaddr; + unsigned long this_tsc_khz; if ((!vcpu->time_page)) return; - if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) { - kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock); - vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz); + this_tsc_khz = get_cpu_var(cpu_tsc_khz); + if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { + kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); + vcpu->hv_clock_tsc_khz = this_tsc_khz; } + put_cpu_var(cpu_tsc_khz); /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); @@ -701,11 +704,48 @@ static bool msr_mtrr_valid(unsigned msr) return false; } +static bool valid_pat_type(unsigned t) +{ + return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ +} + +static bool valid_mtrr_type(unsigned t) +{ + return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ +} + +static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + int i; + + if (!msr_mtrr_valid(msr)) + return false; + + if (msr == MSR_IA32_CR_PAT) { + for (i = 0; i < 8; i++) + if (!valid_pat_type((data >> (i * 8)) & 0xff)) + return false; + return true; + } else if (msr == MSR_MTRRdefType) { + if (data & ~0xcff) + return false; + return valid_mtrr_type(data & 0xff); + } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { + for (i = 0; i < 8 ; i++) + if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) + return false; + return true; + } + + /* variable MTRRs */ + return valid_mtrr_type(data & 0xff); +} + static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; - if (!msr_mtrr_valid(msr)) + if (!mtrr_valid(vcpu, msr, data)) return 1; if (msr == MSR_MTRRdefType) { @@ -893,6 +933,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_LASTINTFROMIP: case MSR_IA32_LASTINTTOIP: case MSR_VM_HSAVE_PA: + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + case MSR_K7_EVNTSEL0: data = 0; break; case MSR_MTRRcap: @@ -1024,6 +1067,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_SYNC_MMU: case KVM_CAP_REINJECT_CONTROL: case KVM_CAP_IRQ_INJECT_STATUS: + case KVM_CAP_ASSIGN_DEV_IRQ: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1072,14 +1116,13 @@ long kvm_arch_dev_ioctl(struct file *filp, if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) goto out; r = -E2BIG; - if (n < num_msrs_to_save) + if (n < msr_list.nmsrs) goto out; r = -EFAULT; if (copy_to_user(user_msr_list->indices, &msrs_to_save, num_msrs_to_save * sizeof(u32))) goto out; - if (copy_to_user(user_msr_list->indices - + num_msrs_to_save * sizeof(u32), + if (copy_to_user(user_msr_list->indices + num_msrs_to_save, &emulated_msrs, ARRAY_SIZE(emulated_msrs) * sizeof(u32))) goto out; @@ -1241,41 +1284,53 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags = 0; } +#define F(x) bit(X86_FEATURE_##x) + static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, u32 index, int *nent, int maxnent) { - const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) | - bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | - bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | - bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | - bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | - bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) | - bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | - bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) | - bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) | - bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP); - const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) | - bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | - bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | - bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | - bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | - bit(X86_FEATURE_PGE) | - bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | - bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) | - bit(X86_FEATURE_SYSCALL) | - (is_efer_nx() ? bit(X86_FEATURE_NX) : 0) | + unsigned f_nx = is_efer_nx() ? F(NX) : 0; #ifdef CONFIG_X86_64 - bit(X86_FEATURE_LM) | + unsigned f_lm = F(LM); +#else + unsigned f_lm = 0; #endif - bit(X86_FEATURE_FXSR_OPT) | - bit(X86_FEATURE_MMXEXT) | - bit(X86_FEATURE_3DNOWEXT) | - bit(X86_FEATURE_3DNOW); - const u32 kvm_supported_word3_x86_features = - bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); + + /* cpuid 1.edx */ + const u32 kvm_supported_word0_x86_features = + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | + 0 /* Reserved, DS, ACPI */ | F(MMX) | + F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | + 0 /* HTT, TM, Reserved, PBE */; + /* cpuid 0x80000001.edx */ + const u32 kvm_supported_word1_x86_features = + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* Reserved */ | + f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | + F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ | + 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); + /* cpuid 1.ecx */ + const u32 kvm_supported_word4_x86_features = + F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | + 0 /* DS-CPL, VMX, SMX, EST */ | + 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | + 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | + 0 /* Reserved, DCA */ | F(XMM4_1) | + F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) | + 0 /* Reserved, XSAVE, OSXSAVE */; + /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = - bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) | - bit(X86_FEATURE_SVM); + F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | + F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | + F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | + 0 /* SKINIT */ | 0 /* WDT */; /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -1288,7 +1343,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, break; case 1: entry->edx &= kvm_supported_word0_x86_features; - entry->ecx &= kvm_supported_word3_x86_features; + entry->ecx &= kvm_supported_word4_x86_features; break; /* function 2 entries are STATEFUL. That is, repeated cpuid commands * may return different values. This forces us to get_cpu() before @@ -1350,6 +1405,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, put_cpu(); } +#undef F + static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries) { @@ -1421,8 +1478,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, return -ENXIO; vcpu_load(vcpu); - set_bit(irq->irq, vcpu->arch.irq_pending); - set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary); + kvm_queue_interrupt(vcpu, irq->irq, false); vcpu_put(vcpu); @@ -1584,8 +1640,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; } out: - if (lapic) - kfree(lapic); + kfree(lapic); return r; } @@ -1606,10 +1661,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, return -EINVAL; down_write(&kvm->slots_lock); + spin_lock(&kvm->mmu_lock); kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; + spin_unlock(&kvm->mmu_lock); up_write(&kvm->slots_lock); return 0; } @@ -1785,7 +1842,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { + spin_lock(&kvm->mmu_lock); kvm_mmu_slot_remove_write_access(kvm, log->slot); + spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); memslot = &kvm->memslots[log->slot]; n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; @@ -2360,7 +2419,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, u16 error_code, int emulation_type) { - int r; + int r, shadow_mask; struct decode_cache *c; kvm_clear_exception_queue(vcpu); @@ -2408,7 +2467,16 @@ int emulate_instruction(struct kvm_vcpu *vcpu, } } + if (emulation_type & EMULTYPE_SKIP) { + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); + return EMULATE_DONE; + } + r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; + + if (r == 0) + kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); if (vcpu->arch.pio.string) return EMULATE_DO_MMIO; @@ -2761,7 +2829,7 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_nonpresent_ptes(0ull, 0ull); kvm_mmu_set_base_ptes(PT_PRESENT_MASK); kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, - PT_DIRTY_MASK, PT64_NX_MASK, 0, 0); + PT_DIRTY_MASK, PT64_NX_MASK, 0); for_each_possible_cpu(cpu) per_cpu(cpu_tsc_khz, cpu) = tsc_khz; @@ -3012,6 +3080,16 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, return best; } +int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + if (best) + return best->eax & 0xff; + return 36; +} + void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { u32 function, index; @@ -3048,10 +3126,9 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - return (!vcpu->arch.irq_summary && + return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && kvm_run->request_interrupt_window && - vcpu->arch.interrupt_window_open && - (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); + kvm_arch_interrupt_allowed(vcpu)); } static void post_kvm_run_save(struct kvm_vcpu *vcpu, @@ -3064,8 +3141,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu, kvm_run->ready_for_interrupt_injection = 1; else kvm_run->ready_for_interrupt_injection = - (vcpu->arch.interrupt_window_open && - vcpu->arch.irq_summary == 0); + kvm_arch_interrupt_allowed(vcpu) && + !kvm_cpu_has_interrupt(vcpu) && + !kvm_event_needs_reinjection(vcpu); } static void vapic_enter(struct kvm_vcpu *vcpu) @@ -3094,9 +3172,63 @@ static void vapic_exit(struct kvm_vcpu *vcpu) up_read(&vcpu->kvm->slots_lock); } +static void update_cr8_intercept(struct kvm_vcpu *vcpu) +{ + int max_irr, tpr; + + if (!kvm_x86_ops->update_cr8_intercept) + return; + + if (!vcpu->arch.apic->vapic_addr) + max_irr = kvm_lapic_find_highest_irr(vcpu); + else + max_irr = -1; + + if (max_irr != -1) + max_irr >>= 4; + + tpr = kvm_lapic_get_cr8(vcpu); + + kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); +} + +static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + kvm_x86_ops->set_interrupt_shadow(vcpu, 0); + + /* try to reinject previous events if any */ + if (vcpu->arch.nmi_injected) { + kvm_x86_ops->set_nmi(vcpu); + return; + } + + if (vcpu->arch.interrupt.pending) { + kvm_x86_ops->set_irq(vcpu); + return; + } + + /* try to inject new event if pending */ + if (vcpu->arch.nmi_pending) { + if (kvm_x86_ops->nmi_allowed(vcpu)) { + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = true; + kvm_x86_ops->set_nmi(vcpu); + } + } else if (kvm_cpu_has_interrupt(vcpu)) { + if (kvm_x86_ops->interrupt_allowed(vcpu)) { + kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), + false); + kvm_x86_ops->set_irq(vcpu); + } + } +} + static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; + bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && + kvm_run->request_interrupt_window; if (vcpu->requests) if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) @@ -3128,9 +3260,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } } - clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); - kvm_inject_pending_timer_irqs(vcpu); - preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); @@ -3138,6 +3267,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) local_irq_disable(); + clear_bit(KVM_REQ_KICK, &vcpu->requests); + smp_mb__after_clear_bit(); + if (vcpu->requests || need_resched() || signal_pending(current)) { local_irq_enable(); preempt_enable(); @@ -3145,21 +3277,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) goto out; } - vcpu->guest_mode = 1; - /* - * Make sure that guest_mode assignment won't happen after - * testing the pending IRQ vector bitmap. - */ - smp_wmb(); - if (vcpu->arch.exception.pending) __queue_exception(vcpu); - else if (irqchip_in_kernel(vcpu->kvm)) - kvm_x86_ops->inject_pending_irq(vcpu); else - kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); + inject_pending_irq(vcpu, kvm_run); - kvm_lapic_sync_to_vapic(vcpu); + /* enable NMI/IRQ window open exits if needed */ + if (vcpu->arch.nmi_pending) + kvm_x86_ops->enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + kvm_x86_ops->enable_irq_window(vcpu); + + if (kvm_lapic_enabled(vcpu)) { + update_cr8_intercept(vcpu); + kvm_lapic_sync_to_vapic(vcpu); + } up_read(&vcpu->kvm->slots_lock); @@ -3193,7 +3325,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) set_debugreg(vcpu->arch.host_dr6, 6); set_debugreg(vcpu->arch.host_dr7, 7); - vcpu->guest_mode = 0; + set_bit(KVM_REQ_KICK, &vcpu->requests); local_irq_enable(); ++vcpu->stat.exits; @@ -3220,8 +3352,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) profile_hit(KVM_PROFILING, (void *)rip); } - if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) - vcpu->arch.exception.pending = false; kvm_lapic_sync_from_vapic(vcpu); @@ -3230,6 +3360,7 @@ out: return r; } + static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; @@ -3256,29 +3387,42 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_vcpu_block(vcpu); down_read(&vcpu->kvm->slots_lock); if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) - if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) + { + switch(vcpu->arch.mp_state) { + case KVM_MP_STATE_HALTED: vcpu->arch.mp_state = - KVM_MP_STATE_RUNNABLE; - if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) - r = -EINTR; + KVM_MP_STATE_RUNNABLE; + case KVM_MP_STATE_RUNNABLE: + break; + case KVM_MP_STATE_SIPI_RECEIVED: + default: + r = -EINTR; + break; + } + } } - if (r > 0) { - if (dm_request_for_irq_injection(vcpu, kvm_run)) { - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.request_irq_exits; - } - if (signal_pending(current)) { - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.signal_exits; - } - if (need_resched()) { - up_read(&vcpu->kvm->slots_lock); - kvm_resched(vcpu); - down_read(&vcpu->kvm->slots_lock); - } + if (r <= 0) + break; + + clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + if (kvm_cpu_has_pending_timer(vcpu)) + kvm_inject_pending_timer_irqs(vcpu); + + if (dm_request_for_irq_injection(vcpu, kvm_run)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.request_irq_exits; + } + if (signal_pending(current)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; + } + if (need_resched()) { + up_read(&vcpu->kvm->slots_lock); + kvm_resched(vcpu); + down_read(&vcpu->kvm->slots_lock); } } @@ -3442,7 +3586,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { struct descriptor_table dt; - int pending_vec; vcpu_load(vcpu); @@ -3472,16 +3615,11 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs->efer = vcpu->arch.shadow_efer; sregs->apic_base = kvm_get_apic_base(vcpu); - if (irqchip_in_kernel(vcpu->kvm)) { - memset(sregs->interrupt_bitmap, 0, - sizeof sregs->interrupt_bitmap); - pending_vec = kvm_x86_ops->get_irq(vcpu); - if (pending_vec >= 0) - set_bit(pending_vec, - (unsigned long *)sregs->interrupt_bitmap); - } else - memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending, - sizeof sregs->interrupt_bitmap); + memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); + + if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) + set_bit(vcpu->arch.interrupt.nr, + (unsigned long *)sregs->interrupt_bitmap); vcpu_put(vcpu); @@ -3688,7 +3826,6 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu, tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); - tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); } static int load_state_from_tss32(struct kvm_vcpu *vcpu, @@ -3785,8 +3922,8 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu, } static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, - u32 old_tss_base, - struct desc_struct *nseg_desc) + u16 old_tss_sel, u32 old_tss_base, + struct desc_struct *nseg_desc) { struct tss_segment_16 tss_segment_16; int ret = 0; @@ -3805,6 +3942,16 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, &tss_segment_16, sizeof tss_segment_16)) goto out; + if (old_tss_sel != 0xffff) { + tss_segment_16.prev_task_link = old_tss_sel; + + if (kvm_write_guest(vcpu->kvm, + get_tss_base_addr(vcpu, nseg_desc), + &tss_segment_16.prev_task_link, + sizeof tss_segment_16.prev_task_link)) + goto out; + } + if (load_state_from_tss16(vcpu, &tss_segment_16)) goto out; @@ -3814,7 +3961,7 @@ out: } static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, - u32 old_tss_base, + u16 old_tss_sel, u32 old_tss_base, struct desc_struct *nseg_desc) { struct tss_segment_32 tss_segment_32; @@ -3834,6 +3981,16 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, &tss_segment_32, sizeof tss_segment_32)) goto out; + if (old_tss_sel != 0xffff) { + tss_segment_32.prev_task_link = old_tss_sel; + + if (kvm_write_guest(vcpu->kvm, + get_tss_base_addr(vcpu, nseg_desc), + &tss_segment_32.prev_task_link, + sizeof tss_segment_32.prev_task_link)) + goto out; + } + if (load_state_from_tss32(vcpu, &tss_segment_32)) goto out; @@ -3887,14 +4044,22 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); } - kvm_x86_ops->skip_emulated_instruction(vcpu); + /* set back link to prev task only if NT bit is set in eflags + note that old_tss_sel is not used afetr this point */ + if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) + old_tss_sel = 0xffff; + + /* set back link to prev task only if NT bit is set in eflags + note that old_tss_sel is not used afetr this point */ + if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) + old_tss_sel = 0xffff; if (nseg_desc.type & 8) - ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, - &nseg_desc); + ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, + old_tss_base, &nseg_desc); else - ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base, - &nseg_desc); + ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel, + old_tss_base, &nseg_desc); if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { u32 eflags = kvm_x86_ops->get_rflags(vcpu); @@ -3920,7 +4085,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { int mmu_reset_needed = 0; - int i, pending_vec, max_bits; + int pending_vec, max_bits; struct descriptor_table dt; vcpu_load(vcpu); @@ -3934,7 +4099,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, vcpu->arch.cr2 = sregs->cr2; mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; - vcpu->arch.cr3 = sregs->cr3; + + down_read(&vcpu->kvm->slots_lock); + if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT)) + vcpu->arch.cr3 = sregs->cr3; + else + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + up_read(&vcpu->kvm->slots_lock); kvm_set_cr8(vcpu, sregs->cr8); @@ -3956,25 +4127,14 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); - if (!irqchip_in_kernel(vcpu->kvm)) { - memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap, - sizeof vcpu->arch.irq_pending); - vcpu->arch.irq_summary = 0; - for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i) - if (vcpu->arch.irq_pending[i]) - __set_bit(i, &vcpu->arch.irq_summary); - } else { - max_bits = (sizeof sregs->interrupt_bitmap) << 3; - pending_vec = find_first_bit( - (const unsigned long *)sregs->interrupt_bitmap, - max_bits); - /* Only pending external irq is handled here */ - if (pending_vec < max_bits) { - kvm_x86_ops->set_irq(vcpu, pending_vec); - pr_debug("Set back pending irq %d\n", - pending_vec); - } - kvm_pic_clear_isr_ack(vcpu->kvm); + max_bits = (sizeof sregs->interrupt_bitmap) << 3; + pending_vec = find_first_bit( + (const unsigned long *)sregs->interrupt_bitmap, max_bits); + if (pending_vec < max_bits) { + kvm_queue_interrupt(vcpu, pending_vec, false); + pr_debug("Set back pending irq %d\n", pending_vec); + if (irqchip_in_kernel(vcpu->kvm)) + kvm_pic_clear_isr_ack(vcpu->kvm); } kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); @@ -4308,7 +4468,6 @@ struct kvm *kvm_arch_create_vm(void) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); - INIT_LIST_HEAD(&kvm->arch.oos_global_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ @@ -4411,12 +4570,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm, } } + spin_lock(&kvm->mmu_lock); if (!kvm->arch.n_requested_mmu_pages) { unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); } kvm_mmu_slot_remove_write_access(kvm, mem->slot); + spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); return 0; @@ -4425,6 +4586,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, void kvm_arch_flush_shadow(struct kvm *kvm) { kvm_mmu_zap_all(kvm); + kvm_reload_remote_mmus(kvm); } int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) @@ -4434,28 +4596,24 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) || vcpu->arch.nmi_pending; } -static void vcpu_kick_intr(void *info) -{ -#ifdef DEBUG - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info; - printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu); -#endif -} - void kvm_vcpu_kick(struct kvm_vcpu *vcpu) { - int ipi_pcpu = vcpu->cpu; - int cpu = get_cpu(); + int me; + int cpu = vcpu->cpu; if (waitqueue_active(&vcpu->wq)) { wake_up_interruptible(&vcpu->wq); ++vcpu->stat.halt_wakeup; } - /* - * We may be called synchronously with irqs disabled in guest mode, - * So need not to call smp_call_function_single() in that case. - */ - if (vcpu->guest_mode && vcpu->cpu != cpu) - smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0); + + me = get_cpu(); + if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) + if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) + smp_send_reschedule(cpu); put_cpu(); } + +int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + return kvm_x86_ops->interrupt_allowed(vcpu); +} diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 6a4be78a738..4c8e10af78e 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -8,9 +8,11 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) vcpu->arch.exception.pending = false; } -static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector) +static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector, + bool soft) { vcpu->arch.interrupt.pending = true; + vcpu->arch.interrupt.soft = soft; vcpu->arch.interrupt.nr = vector; } @@ -19,4 +21,14 @@ static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu) vcpu->arch.interrupt.pending = false; } +static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending || + vcpu->arch.nmi_injected; +} + +static inline bool kvm_exception_is_soft(unsigned int nr) +{ + return (nr == BP_VECTOR) || (nr == OF_VECTOR); +} #endif diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index ca91749d208..616de4628d6 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -59,13 +59,14 @@ #define SrcImm (5<<4) /* Immediate operand. */ #define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ #define SrcOne (7<<4) /* Implied '1' */ -#define SrcMask (7<<4) +#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ +#define SrcMask (0xf<<4) /* Generic ModRM decode. */ -#define ModRM (1<<7) +#define ModRM (1<<8) /* Destination is only written; never read. */ -#define Mov (1<<8) -#define BitOp (1<<9) -#define MemAbs (1<<10) /* Memory operand is absolute displacement */ +#define Mov (1<<9) +#define BitOp (1<<10) +#define MemAbs (1<<11) /* Memory operand is absolute displacement */ #define String (1<<12) /* String instruction (rep capable) */ #define Stack (1<<13) /* Stack instruction (push/pop) */ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ @@ -76,6 +77,7 @@ #define Src2CL (1<<29) #define Src2ImmByte (2<<29) #define Src2One (3<<29) +#define Src2Imm16 (4<<29) #define Src2Mask (7<<29) enum { @@ -135,11 +137,11 @@ static u32 opcode_table[256] = { SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ /* 0x70 - 0x77 */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, /* 0x78 - 0x7F */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, /* 0x80 - 0x87 */ Group | Group1_80, Group | Group1_81, Group | Group1_82, Group | Group1_83, @@ -153,7 +155,8 @@ static u32 opcode_table[256] = { /* 0x90 - 0x97 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, /* 0x98 - 0x9F */ - 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, + 0, 0, SrcImm | Src2Imm16, 0, + ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, /* 0xA0 - 0xA7 */ ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, @@ -178,7 +181,8 @@ static u32 opcode_table[256] = { 0, ImplicitOps | Stack, 0, 0, ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, /* 0xC8 - 0xCF */ - 0, 0, 0, ImplicitOps | Stack, 0, 0, 0, 0, + 0, 0, 0, ImplicitOps | Stack, + ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps, /* 0xD0 - 0xD7 */ ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, @@ -187,11 +191,11 @@ static u32 opcode_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, /* 0xE0 - 0xE7 */ 0, 0, 0, 0, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, + ByteOp | SrcImmUByte, SrcImmUByte, + ByteOp | SrcImmUByte, SrcImmUByte, /* 0xE8 - 0xEF */ - ImplicitOps | Stack, SrcImm | ImplicitOps, - ImplicitOps, SrcImmByte | ImplicitOps, + SrcImm | Stack, SrcImm | ImplicitOps, + SrcImm | Src2Imm16, SrcImmByte | ImplicitOps, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xF0 - 0xF7 */ @@ -230,10 +234,8 @@ static u32 twobyte_table[256] = { /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 - 0x8F */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, + SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xA7 */ @@ -1044,10 +1046,14 @@ done_prefixes: } break; case SrcImmByte: + case SrcImmUByte: c->src.type = OP_IMM; c->src.ptr = (unsigned long *)c->eip; c->src.bytes = 1; - c->src.val = insn_fetch(s8, 1, c->eip); + if ((c->d & SrcMask) == SrcImmByte) + c->src.val = insn_fetch(s8, 1, c->eip); + else + c->src.val = insn_fetch(u8, 1, c->eip); break; case SrcOne: c->src.bytes = 1; @@ -1072,6 +1078,12 @@ done_prefixes: c->src2.bytes = 1; c->src2.val = insn_fetch(u8, 1, c->eip); break; + case Src2Imm16: + c->src2.type = OP_IMM; + c->src2.ptr = (unsigned long *)c->eip; + c->src2.bytes = 2; + c->src2.val = insn_fetch(u16, 2, c->eip); + break; case Src2One: c->src2.bytes = 1; c->src2.val = 1; @@ -1349,6 +1361,20 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, return 0; } +static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) +{ + u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask); + /* + * an sti; sti; sequence only disable interrupts for the first + * instruction. So, if the last instruction, be it emulated or + * not, left the system with the INT_STI flag enabled, it + * means that the last instruction is an sti. We should not + * leave the flag on in this case. The same goes for mov ss + */ + if (!(int_shadow & mask)) + ctxt->interruptibility = mask; +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1360,6 +1386,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) int io_dir_in; int rc = 0; + ctxt->interruptibility = 0; + /* Shadow copy of register state. Committed on successful emulation. * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't * modify them. @@ -1531,13 +1559,10 @@ special_insn: return -1; } return 0; - case 0x70 ... 0x7f: /* jcc (short) */ { - int rel = insn_fetch(s8, 1, c->eip); - + case 0x70 ... 0x7f: /* jcc (short) */ if (test_cc(c->b, ctxt->eflags)) - jmp_rel(c, rel); + jmp_rel(c, c->src.val); break; - } case 0x80 ... 0x83: /* Grp1 */ switch (c->modrm_reg) { case 0: @@ -1609,6 +1634,9 @@ special_insn: int err; sel = c->src.val; + if (c->modrm_reg == VCPU_SREG_SS) + toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); + if (c->modrm_reg <= 5) { type_bits = (c->modrm_reg == 1) ? 9 : 1; err = kvm_load_segment_descriptor(ctxt->vcpu, sel, @@ -1769,59 +1797,32 @@ special_insn: break; case 0xe4: /* inb */ case 0xe5: /* in */ - port = insn_fetch(u8, 1, c->eip); + port = c->src.val; io_dir_in = 1; goto do_io; case 0xe6: /* outb */ case 0xe7: /* out */ - port = insn_fetch(u8, 1, c->eip); + port = c->src.val; io_dir_in = 0; goto do_io; case 0xe8: /* call (near) */ { - long int rel; - switch (c->op_bytes) { - case 2: - rel = insn_fetch(s16, 2, c->eip); - break; - case 4: - rel = insn_fetch(s32, 4, c->eip); - break; - default: - DPRINTF("Call: Invalid op_bytes\n"); - goto cannot_emulate; - } + long int rel = c->src.val; c->src.val = (unsigned long) c->eip; jmp_rel(c, rel); - c->op_bytes = c->ad_bytes; emulate_push(ctxt); break; } case 0xe9: /* jmp rel */ goto jmp; - case 0xea: /* jmp far */ { - uint32_t eip; - uint16_t sel; - - switch (c->op_bytes) { - case 2: - eip = insn_fetch(u16, 2, c->eip); - break; - case 4: - eip = insn_fetch(u32, 4, c->eip); - break; - default: - DPRINTF("jmp far: Invalid op_bytes\n"); - goto cannot_emulate; - } - sel = insn_fetch(u16, 2, c->eip); - if (kvm_load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0) { + case 0xea: /* jmp far */ + if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9, + VCPU_SREG_CS) < 0) { DPRINTF("jmp far: Failed to load CS descriptor\n"); goto cannot_emulate; } - c->eip = eip; + c->eip = c->src.val; break; - } case 0xeb: jmp: /* jmp rel short */ jmp_rel(c, c->src.val); @@ -1865,6 +1866,7 @@ special_insn: c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfb: /* sti */ + toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); ctxt->eflags |= X86_EFLAGS_IF; c->dst.type = OP_NONE; /* Disable writeback. */ break; @@ -2039,28 +2041,11 @@ twobyte_insn: if (!test_cc(c->b, ctxt->eflags)) c->dst.type = OP_NONE; /* no writeback */ break; - case 0x80 ... 0x8f: /* jnz rel, etc*/ { - long int rel; - - switch (c->op_bytes) { - case 2: - rel = insn_fetch(s16, 2, c->eip); - break; - case 4: - rel = insn_fetch(s32, 4, c->eip); - break; - case 8: - rel = insn_fetch(s64, 8, c->eip); - break; - default: - DPRINTF("jnz: Invalid op_bytes\n"); - goto cannot_emulate; - } + case 0x80 ... 0x8f: /* jnz rel, etc*/ if (test_cc(c->b, ctxt->eflags)) - jmp_rel(c, rel); + jmp_rel(c, c->src.val); c->dst.type = OP_NONE; break; - } case 0xa3: bt: /* bt */ c->dst.type = OP_NONE; diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig index 8dab8f7844d..38718041efc 100644 --- a/arch/x86/lguest/Kconfig +++ b/arch/x86/lguest/Kconfig @@ -2,7 +2,6 @@ config LGUEST_GUEST bool "Lguest guest support" select PARAVIRT depends on X86_32 - depends on !X86_PAE select VIRTIO select VIRTIO_RING select VIRTIO_CONSOLE diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 33a93b41739..d677fa9ca65 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -22,7 +22,8 @@ * * So how does the kernel know it's a Guest? We'll see that later, but let's * just say that we end up here where we replace the native functions various - * "paravirt" structures with our Guest versions, then boot like normal. :*/ + * "paravirt" structures with our Guest versions, then boot like normal. +:*/ /* * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. @@ -74,7 +75,8 @@ * * The Guest in our tale is a simple creature: identical to the Host but * behaving in simplified but equivalent ways. In particular, the Guest is the - * same kernel as the Host (or at least, built from the same source code). :*/ + * same kernel as the Host (or at least, built from the same source code). +:*/ struct lguest_data lguest_data = { .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, @@ -85,35 +87,41 @@ struct lguest_data lguest_data = { .syscall_vec = SYSCALL_VECTOR, }; -/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a +/*G:037 + * async_hcall() is pretty simple: I'm quite proud of it really. We have a * ring buffer of stored hypercalls which the Host will run though next time we - * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall + * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall * arguments, and a "hcall_status" word which is 0 if the call is ready to go, * and 255 once the Host has finished with it. * * If we come around to a slot which hasn't been finished, then the table is * full and we just make the hypercall directly. This has the nice side * effect of causing the Host to run all the stored calls in the ring buffer - * which empties it for next time! */ + * which empties it for next time! + */ static void async_hcall(unsigned long call, unsigned long arg1, - unsigned long arg2, unsigned long arg3) + unsigned long arg2, unsigned long arg3, + unsigned long arg4) { /* Note: This code assumes we're uniprocessor. */ static unsigned int next_call; unsigned long flags; - /* Disable interrupts if not already disabled: we don't want an + /* + * Disable interrupts if not already disabled: we don't want an * interrupt handler making a hypercall while we're already doing - * one! */ + * one! + */ local_irq_save(flags); if (lguest_data.hcall_status[next_call] != 0xFF) { /* Table full, so do normal hcall which will flush table. */ - kvm_hypercall3(call, arg1, arg2, arg3); + kvm_hypercall4(call, arg1, arg2, arg3, arg4); } else { lguest_data.hcalls[next_call].arg0 = call; lguest_data.hcalls[next_call].arg1 = arg1; lguest_data.hcalls[next_call].arg2 = arg2; lguest_data.hcalls[next_call].arg3 = arg3; + lguest_data.hcalls[next_call].arg4 = arg4; /* Arguments must all be written before we mark it to go */ wmb(); lguest_data.hcall_status[next_call] = 0; @@ -123,8 +131,9 @@ static void async_hcall(unsigned long call, unsigned long arg1, local_irq_restore(flags); } -/*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first - * real optimization trick! +/*G:035 + * Notice the lazy_hcall() above, rather than hcall(). This is our first real + * optimization trick! * * When lazy_mode is set, it means we're allowed to defer all hypercalls and do * them as a batch when lazy_mode is eventually turned off. Because hypercalls @@ -134,16 +143,18 @@ static void async_hcall(unsigned long call, unsigned long arg1, * lguest_leave_lazy_mode(). * * So, when we're in lazy mode, we call async_hcall() to store the call for - * future processing: */ + * future processing: + */ static void lazy_hcall1(unsigned long call, unsigned long arg1) { if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) kvm_hypercall1(call, arg1); else - async_hcall(call, arg1, 0, 0); + async_hcall(call, arg1, 0, 0, 0); } +/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ static void lazy_hcall2(unsigned long call, unsigned long arg1, unsigned long arg2) @@ -151,7 +162,7 @@ static void lazy_hcall2(unsigned long call, if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) kvm_hypercall2(call, arg1, arg2); else - async_hcall(call, arg1, arg2, 0); + async_hcall(call, arg1, arg2, 0, 0); } static void lazy_hcall3(unsigned long call, @@ -162,18 +173,40 @@ static void lazy_hcall3(unsigned long call, if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) kvm_hypercall3(call, arg1, arg2, arg3); else - async_hcall(call, arg1, arg2, arg3); + async_hcall(call, arg1, arg2, arg3, 0); } -/* When lazy mode is turned off reset the per-cpu lazy mode variable and then - * issue the do-nothing hypercall to flush any stored calls. */ -static void lguest_leave_lazy_mode(void) +#ifdef CONFIG_X86_PAE +static void lazy_hcall4(unsigned long call, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4) +{ + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) + kvm_hypercall4(call, arg1, arg2, arg3, arg4); + else + async_hcall(call, arg1, arg2, arg3, arg4); +} +#endif + +/*G:036 + * When lazy mode is turned off reset the per-cpu lazy mode variable and then + * issue the do-nothing hypercall to flush any stored calls. +:*/ +static void lguest_leave_lazy_mmu_mode(void) +{ + kvm_hypercall0(LHCALL_FLUSH_ASYNC); + paravirt_leave_lazy_mmu(); +} + +static void lguest_end_context_switch(struct task_struct *next) { - paravirt_leave_lazy(paravirt_get_lazy_mode()); kvm_hypercall0(LHCALL_FLUSH_ASYNC); + paravirt_end_context_switch(next); } -/*G:033 +/*G:032 * After that diversion we return to our first native-instruction * replacements: four functions for interrupt control. * @@ -186,49 +219,51 @@ static void lguest_leave_lazy_mode(void) * check there before it tries to deliver an interrupt. */ -/* save_flags() is expected to return the processor state (ie. "flags"). The +/* + * save_flags() is expected to return the processor state (ie. "flags"). The * flags word contains all kind of stuff, but in practice Linux only cares - * about the interrupt flag. Our "save_flags()" just returns that. */ + * about the interrupt flag. Our "save_flags()" just returns that. + */ static unsigned long save_fl(void) { return lguest_data.irq_enabled; } -PV_CALLEE_SAVE_REGS_THUNK(save_fl); - -/* restore_flags() just sets the flags back to the value given. */ -static void restore_fl(unsigned long flags) -{ - lguest_data.irq_enabled = flags; -} -PV_CALLEE_SAVE_REGS_THUNK(restore_fl); /* Interrupts go off... */ static void irq_disable(void) { lguest_data.irq_enabled = 0; } + +/* + * Let's pause a moment. Remember how I said these are called so often? + * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to + * break some rules. In particular, these functions are assumed to save their + * own registers if they need to: normal C functions assume they can trash the + * eax register. To use normal C functions, we use + * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the + * C function, then restores it. + */ +PV_CALLEE_SAVE_REGS_THUNK(save_fl); PV_CALLEE_SAVE_REGS_THUNK(irq_disable); +/*:*/ -/* Interrupts go on... */ -static void irq_enable(void) -{ - lguest_data.irq_enabled = X86_EFLAGS_IF; -} -PV_CALLEE_SAVE_REGS_THUNK(irq_enable); +/* These are in i386_head.S */ +extern void lg_irq_enable(void); +extern void lg_restore_fl(unsigned long flags); -/*:*/ -/*M:003 Note that we don't check for outstanding interrupts when we re-enable - * them (or when we unmask an interrupt). This seems to work for the moment, - * since interrupts are rare and we'll just get the interrupt on the next timer - * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way - * would be to put the "irq_enabled" field in a page by itself, and have the - * Host write-protect it when an interrupt comes in when irqs are disabled. - * There will then be a page fault as soon as interrupts are re-enabled. +/*M:003 + * We could be more efficient in our checking of outstanding interrupts, rather + * than using a branch. One way would be to put the "irq_enabled" field in a + * page by itself, and have the Host write-protect it when an interrupt comes + * in when irqs are disabled. There will then be a page fault as soon as + * interrupts are re-enabled. * * A better method is to implement soft interrupt disable generally for x86: * instead of disabling interrupts, we set a flag. If an interrupt does come * in, we then disable them for real. This is uncommon, so we could simply use - * a hypercall for interrupt control and not worry about efficiency. :*/ + * a hypercall for interrupt control and not worry about efficiency. +:*/ /*G:034 * The Interrupt Descriptor Table (IDT). @@ -241,10 +276,12 @@ PV_CALLEE_SAVE_REGS_THUNK(irq_enable); static void lguest_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) { - /* The gate_desc structure is 8 bytes long: we hand it to the Host in + /* + * The gate_desc structure is 8 bytes long: we hand it to the Host in * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors * around like this; typesafety wasn't a big concern in Linux's early - * years. */ + * years. + */ u32 *desc = (u32 *)g; /* Keep the local copy up to date. */ native_write_idt_entry(dt, entrynum, g); @@ -252,9 +289,11 @@ static void lguest_write_idt_entry(gate_desc *dt, kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); } -/* Changing to a different IDT is very rare: we keep the IDT up-to-date every +/* + * Changing to a different IDT is very rare: we keep the IDT up-to-date every * time it is written, so we can simply loop through all entries and tell the - * Host about them. */ + * Host about them. + */ static void lguest_load_idt(const struct desc_ptr *desc) { unsigned int i; @@ -285,9 +324,11 @@ static void lguest_load_gdt(const struct desc_ptr *desc) kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); } -/* For a single GDT entry which changes, we do the lazy thing: alter our GDT, +/* + * For a single GDT entry which changes, we do the lazy thing: alter our GDT, * then tell the Host to reload the entire thing. This operation is so rare - * that this naive implementation is reasonable. */ + * that this naive implementation is reasonable. + */ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, const void *desc, int type) { @@ -297,29 +338,36 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, dt[entrynum].a, dt[entrynum].b); } -/* OK, I lied. There are three "thread local storage" GDT entries which change +/* + * OK, I lied. There are three "thread local storage" GDT entries which change * on every context switch (these three entries are how glibc implements - * __thread variables). So we have a hypercall specifically for this case. */ + * __thread variables). So we have a hypercall specifically for this case. + */ static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) { - /* There's one problem which normal hardware doesn't have: the Host + /* + * There's one problem which normal hardware doesn't have: the Host * can't handle us removing entries we're currently using. So we clear - * the GS register here: if it's needed it'll be reloaded anyway. */ + * the GS register here: if it's needed it'll be reloaded anyway. + */ lazy_load_gs(0); lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); } -/*G:038 That's enough excitement for now, back to ploughing through each of - * the different pv_ops structures (we're about 1/3 of the way through). +/*G:038 + * That's enough excitement for now, back to ploughing through each of the + * different pv_ops structures (we're about 1/3 of the way through). * * This is the Local Descriptor Table, another weird Intel thingy. Linux only * uses this for some strange applications like Wine. We don't do anything - * here, so they'll get an informative and friendly Segmentation Fault. */ + * here, so they'll get an informative and friendly Segmentation Fault. + */ static void lguest_set_ldt(const void *addr, unsigned entries) { } -/* This loads a GDT entry into the "Task Register": that entry points to a +/* + * This loads a GDT entry into the "Task Register": that entry points to a * structure called the Task State Segment. Some comments scattered though the * kernel code indicate that this used for task switching in ages past, along * with blood sacrifice and astrology. @@ -327,19 +375,21 @@ static void lguest_set_ldt(const void *addr, unsigned entries) * Now there's nothing interesting in here that we don't get told elsewhere. * But the native version uses the "ltr" instruction, which makes the Host * complain to the Guest about a Segmentation Fault and it'll oops. So we - * override the native version with a do-nothing version. */ + * override the native version with a do-nothing version. + */ static void lguest_load_tr_desc(void) { } -/* The "cpuid" instruction is a way of querying both the CPU identity +/* + * The "cpuid" instruction is a way of querying both the CPU identity * (manufacturer, model, etc) and its features. It was introduced before the * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. * As you might imagine, after a decade and a half this treatment, it is now a * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. * * This instruction even it has its own Wikipedia entry. The Wikipedia entry - * has been translated into 4 languages. I am not making this up! + * has been translated into 5 languages. I am not making this up! * * We could get funky here and identify ourselves as "GenuineLguest", but * instead we just use the real "cpuid" instruction. Then I pretty much turned @@ -351,7 +401,8 @@ static void lguest_load_tr_desc(void) * Replacing the cpuid so we can turn features off is great for the kernel, but * anyone (including userspace) can just use the raw "cpuid" instruction and * the Host won't even notice since it isn't privileged. So we try not to get - * too worked up about it. */ + * too worked up about it. + */ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, unsigned int *cx, unsigned int *dx) { @@ -359,33 +410,63 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, native_cpuid(ax, bx, cx, dx); switch (function) { - case 1: /* Basic feature request. */ - /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ + /* + * CPUID 0 gives the highest legal CPUID number (and the ID string). + * We futureproof our code a little by sticking to known CPUID values. + */ + case 0: + if (*ax > 5) + *ax = 5; + break; + + /* + * CPUID 1 is a basic feature request. + * + * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3 + * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE. + */ + case 1: *cx &= 0x00002201; - /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */ - *dx &= 0x07808111; - /* The Host can do a nice optimization if it knows that the + *dx &= 0x07808151; + /* + * The Host can do a nice optimization if it knows that the * kernel mappings (addresses above 0xC0000000 or whatever * PAGE_OFFSET is set to) haven't changed. But Linux calls * flush_tlb_user() for both user and kernel mappings unless - * the Page Global Enable (PGE) feature bit is set. */ + * the Page Global Enable (PGE) feature bit is set. + */ *dx |= 0x00002000; - /* We also lie, and say we're family id 5. 6 or greater + /* + * We also lie, and say we're family id 5. 6 or greater * leads to a rdmsr in early_init_intel which we can't handle. - * Family ID is returned as bits 8-12 in ax. */ + * Family ID is returned as bits 8-12 in ax. + */ *ax &= 0xFFFFF0FF; *ax |= 0x00000500; break; + /* + * 0x80000000 returns the highest Extended Function, so we futureproof + * like we do above by limiting it to known fields. + */ case 0x80000000: - /* Futureproof this a little: if they ask how much extended - * processor information there is, limit it to known fields. */ if (*ax > 0x80000008) *ax = 0x80000008; break; + + /* + * PAE systems can mark pages as non-executable. Linux calls this the + * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced + * Virus Protection). We just switch turn if off here, since we don't + * support it. + */ + case 0x80000001: + *dx &= ~(1 << 20); + break; } } -/* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. +/* + * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother * it. The Host needs to know when the Guest wants to change them, so we have * a whole series of functions like read_cr0() and write_cr0(). @@ -400,7 +481,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, * name like "FPUTRAP bit" be a little less cryptic? * * We store cr0 locally because the Host never changes it. The Guest sometimes - * wants to read it and we'd prefer not to bother the Host unnecessarily. */ + * wants to read it and we'd prefer not to bother the Host unnecessarily. + */ static unsigned long current_cr0; static void lguest_write_cr0(unsigned long val) { @@ -413,18 +495,22 @@ static unsigned long lguest_read_cr0(void) return current_cr0; } -/* Intel provided a special instruction to clear the TS bit for people too cool +/* + * Intel provided a special instruction to clear the TS bit for people too cool * to use write_cr0() to do it. This "clts" instruction is faster, because all - * the vowels have been optimized out. */ + * the vowels have been optimized out. + */ static void lguest_clts(void) { lazy_hcall1(LHCALL_TS, 0); current_cr0 &= ~X86_CR0_TS; } -/* cr2 is the virtual address of the last page fault, which the Guest only ever +/* + * cr2 is the virtual address of the last page fault, which the Guest only ever * reads. The Host kindly writes this into our "struct lguest_data", so we - * just read it out of there. */ + * just read it out of there. + */ static unsigned long lguest_read_cr2(void) { return lguest_data.cr2; @@ -433,10 +519,12 @@ static unsigned long lguest_read_cr2(void) /* See lguest_set_pte() below. */ static bool cr3_changed = false; -/* cr3 is the current toplevel pagetable page: the principle is the same as +/* + * cr3 is the current toplevel pagetable page: the principle is the same as * cr0. Keep a local copy, and tell the Host when it changes. The only * difference is that our local copy is in lguest_data because the Host needs - * to set it upon our initial hypercall. */ + * to set it upon our initial hypercall. + */ static void lguest_write_cr3(unsigned long cr3) { lguest_data.pgdir = cr3; @@ -481,7 +569,7 @@ static void lguest_write_cr4(unsigned long val) * cr3 ---> +---------+ * | --------->+---------+ * | | | PADDR1 | - * Top-level | | PADDR2 | + * Mid-level | | PADDR2 | * (PMD) page | | | * | | Lower-level | * | | (PTE) page | @@ -501,41 +589,113 @@ static void lguest_write_cr4(unsigned long val) * Index into top Index into second Offset within page * page directory page pagetable page * - * The kernel spends a lot of time changing both the top-level page directory - * and lower-level pagetable pages. The Guest doesn't know physical addresses, - * so while it maintains these page tables exactly like normal, it also needs - * to keep the Host informed whenever it makes a change: the Host will create - * the real page tables based on the Guests'. + * Now, unfortunately, this isn't the whole story: Intel added Physical Address + * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits). + * These are held in 64-bit page table entries, so we can now only fit 512 + * entries in a page, and the neat three-level tree breaks down. + * + * The result is a four level page table: + * + * cr3 --> [ 4 Upper ] + * [ Level ] + * [ Entries ] + * [(PUD Page)]---> +---------+ + * | --------->+---------+ + * | | | PADDR1 | + * Mid-level | | PADDR2 | + * (PMD) page | | | + * | | Lower-level | + * | | (PTE) page | + * | | | | + * .... .... + * + * + * And the virtual address is decoded as: + * + * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>| + * Index into Index into mid Index into lower Offset within page + * top entries directory page pagetable page + * + * It's too hard to switch between these two formats at runtime, so Linux only + * supports one or the other depending on whether CONFIG_X86_PAE is set. Many + * distributions turn it on, and not just for people with silly amounts of + * memory: the larger PTE entries allow room for the NX bit, which lets the + * kernel disable execution of pages and increase security. + * + * This was a problem for lguest, which couldn't run on these distributions; + * then Matias Zabaljauregui figured it all out and implemented it, and only a + * handful of puppies were crushed in the process! + * + * Back to our point: the kernel spends a lot of time changing both the + * top-level page directory and lower-level pagetable pages. The Guest doesn't + * know physical addresses, so while it maintains these page tables exactly + * like normal, it also needs to keep the Host informed whenever it makes a + * change: the Host will create the real page tables based on the Guests'. */ -/* The Guest calls this to set a second-level entry (pte), ie. to map a page - * into a process' address space. We set the entry then tell the Host the - * toplevel and address this corresponds to. The Guest uses one pagetable per - * process, so we need to tell the Host which one we're changing (mm->pgd). */ +/* + * The Guest calls this after it has set a second-level entry (pte), ie. to map + * a page into a process' address space. Wetell the Host the toplevel and + * address this corresponds to. The Guest uses one pagetable per process, so + * we need to tell the Host which one we're changing (mm->pgd). + */ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { +#ifdef CONFIG_X86_PAE + /* PAE needs to hand a 64 bit page table entry, so it uses two args. */ + lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, + ptep->pte_low, ptep->pte_high); +#else lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); +#endif } +/* This is the "set and update" combo-meal-deal version. */ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { - *ptep = pteval; + native_set_pte(ptep, pteval); lguest_pte_update(mm, addr, ptep); } -/* The Guest calls this to set a top-level entry. Again, we set the entry then - * tell the Host which top-level page we changed, and the index of the entry we - * changed. */ +/* + * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd + * to set a middle-level entry when PAE is activated. + * + * Again, we set the entry then tell the Host which page we changed, + * and the index of the entry we changed. + */ +#ifdef CONFIG_X86_PAE +static void lguest_set_pud(pud_t *pudp, pud_t pudval) +{ + native_set_pud(pudp, pudval); + + /* 32 bytes aligned pdpt address and the index. */ + lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0, + (__pa(pudp) & 0x1F) / sizeof(pud_t)); +} + static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { - *pmdp = pmdval; + native_set_pmd(pmdp, pmdval); lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, - (__pa(pmdp) & (PAGE_SIZE - 1)) / 4); + (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); } +#else + +/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */ +static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) +{ + native_set_pmd(pmdp, pmdval); + lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK, + (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); +} +#endif -/* There are a couple of legacy places where the kernel sets a PTE, but we +/* + * There are a couple of legacy places where the kernel sets a PTE, but we * don't know the top level any more. This is useless for us, since we don't * know which pagetable is changing or what address, so we just tell the Host * to forget all of them. Fortunately, this is very rare. @@ -543,15 +703,43 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) * ... except in early boot when the kernel sets up the initial pagetables, * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell * the Host anything changed until we've done the first page table switch, - * which brings boot back to 0.25 seconds. */ + * which brings boot back to 0.25 seconds. + */ static void lguest_set_pte(pte_t *ptep, pte_t pteval) { - *ptep = pteval; + native_set_pte(ptep, pteval); if (cr3_changed) lazy_hcall1(LHCALL_FLUSH_TLB, 1); } -/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on +#ifdef CONFIG_X86_PAE +/* + * With 64-bit PTE values, we need to be careful setting them: if we set 32 + * bits at a time, the hardware could see a weird half-set entry. These + * versions ensure we update all 64 bits at once. + */ +static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + native_set_pte_atomic(ptep, pte); + if (cr3_changed) + lazy_hcall1(LHCALL_FLUSH_TLB, 1); +} + +static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + native_pte_clear(mm, addr, ptep); + lguest_pte_update(mm, addr, ptep); +} + +static void lguest_pmd_clear(pmd_t *pmdp) +{ + lguest_set_pmd(pmdp, __pmd(0)); +} +#endif + +/* + * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on * native page table operations. On native hardware you can set a new page * table entry whenever you want, but if you want to remove one you have to do * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). @@ -560,24 +748,29 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval) * called when a valid entry is written, not when it's removed (ie. marked not * present). Instead, this is where we come when the Guest wants to remove a * page table entry: we tell the Host to set that entry to 0 (ie. the present - * bit is zero). */ + * bit is zero). + */ static void lguest_flush_tlb_single(unsigned long addr) { /* Simply set it to zero: if it was not, it will fault back in. */ lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); } -/* This is what happens after the Guest has removed a large number of entries. +/* + * This is what happens after the Guest has removed a large number of entries. * This tells the Host that any of the page table entries for userspace might - * have changed, ie. virtual addresses below PAGE_OFFSET. */ + * have changed, ie. virtual addresses below PAGE_OFFSET. + */ static void lguest_flush_tlb_user(void) { lazy_hcall1(LHCALL_FLUSH_TLB, 0); } -/* This is called when the kernel page tables have changed. That's not very +/* + * This is called when the kernel page tables have changed. That's not very * common (unless the Guest is using highmem, which makes the Guest extremely - * slow), so it's worth separating this from the user flushing above. */ + * slow), so it's worth separating this from the user flushing above. + */ static void lguest_flush_tlb_kernel(void) { lazy_hcall1(LHCALL_FLUSH_TLB, 1); @@ -614,30 +807,41 @@ static struct irq_chip lguest_irq_controller = { .unmask = enable_lguest_irq, }; -/* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware +/* + * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware * interrupt (except 128, which is used for system calls), and then tells the * Linux infrastructure that each interrupt is controlled by our level-based - * lguest interrupt controller. */ + * lguest interrupt controller. + */ static void __init lguest_init_IRQ(void) { unsigned int i; - for (i = 0; i < LGUEST_IRQS; i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - /* Some systems map "vectors" to interrupts weirdly. Lguest has - * a straightforward 1 to 1 mapping, so force that here. */ - __get_cpu_var(vector_irq)[vector] = i; - if (vector != SYSCALL_VECTOR) - set_intr_gate(vector, interrupt[i]); + for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { + /* Some systems map "vectors" to interrupts weirdly. Not us! */ + __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; + if (i != SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); } - /* This call is required to set up for 4k stacks, where we have - * separate stacks for hard and soft interrupts. */ + + /* + * This call is required to set up for 4k stacks, where we have + * separate stacks for hard and soft interrupts. + */ irq_ctx_init(smp_processor_id()); } +/* + * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so + * rather than set them in lguest_init_IRQ we are called here every time an + * lguest device needs an interrupt. + * + * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should + * pass that up! + */ void lguest_setup_irq(unsigned int irq) { - irq_to_desc_alloc_cpu(irq, 0); + irq_to_desc_alloc_node(irq, 0); set_irq_chip_and_handler_name(irq, &lguest_irq_controller, handle_level_irq, "level"); } @@ -653,31 +857,39 @@ static unsigned long lguest_get_wallclock(void) return lguest_data.time.tv_sec; } -/* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us +/* + * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us * what speed it runs at, or 0 if it's unusable as a reliable clock source. * This matches what we want here: if we return 0 from this function, the x86 - * TSC clock will give up and not register itself. */ + * TSC clock will give up and not register itself. + */ static unsigned long lguest_tsc_khz(void) { return lguest_data.tsc_khz; } -/* If we can't use the TSC, the kernel falls back to our lower-priority - * "lguest_clock", where we read the time value given to us by the Host. */ +/* + * If we can't use the TSC, the kernel falls back to our lower-priority + * "lguest_clock", where we read the time value given to us by the Host. + */ static cycle_t lguest_clock_read(struct clocksource *cs) { unsigned long sec, nsec; - /* Since the time is in two parts (seconds and nanoseconds), we risk + /* + * Since the time is in two parts (seconds and nanoseconds), we risk * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, * and getting 99 and 0. As Linux tends to come apart under the stress - * of time travel, we must be careful: */ + * of time travel, we must be careful: + */ do { /* First we read the seconds part. */ sec = lguest_data.time.tv_sec; - /* This read memory barrier tells the compiler and the CPU that + /* + * This read memory barrier tells the compiler and the CPU that * this can't be reordered: we have to complete the above - * before going on. */ + * before going on. + */ rmb(); /* Now we read the nanoseconds part. */ nsec = lguest_data.time.tv_nsec; @@ -701,9 +913,11 @@ static struct clocksource lguest_clock = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -/* We also need a "struct clock_event_device": Linux asks us to set it to go +/* + * We also need a "struct clock_event_device": Linux asks us to set it to go * off some time in the future. Actually, James Morris figured all this out, I - * just applied the patch. */ + * just applied the patch. + */ static int lguest_clockevent_set_next_event(unsigned long delta, struct clock_event_device *evt) { @@ -753,8 +967,10 @@ static struct clock_event_device lguest_clockevent = { .max_delta_ns = LG_CLOCK_MAX_DELTA, }; -/* This is the Guest timer interrupt handler (hardware interrupt 0). We just - * call the clockevent infrastructure and it does whatever needs doing. */ +/* + * This is the Guest timer interrupt handler (hardware interrupt 0). We just + * call the clockevent infrastructure and it does whatever needs doing. + */ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) { unsigned long flags; @@ -765,10 +981,12 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) local_irq_restore(flags); } -/* At some point in the boot process, we get asked to set up our timing +/* + * At some point in the boot process, we get asked to set up our timing * infrastructure. The kernel doesn't expect timer interrupts before this, but * we cleverly initialized the "blocked_interrupts" field of "struct - * lguest_data" so that timer interrupts were blocked until now. */ + * lguest_data" so that timer interrupts were blocked until now. + */ static void lguest_time_init(void) { /* Set up the timer interrupt (0) to go to our simple timer routine */ @@ -792,14 +1010,16 @@ static void lguest_time_init(void) * to work. They're pretty simple. */ -/* The Guest needs to tell the Host what stack it expects traps to use. For +/* + * The Guest needs to tell the Host what stack it expects traps to use. For * native hardware, this is part of the Task State Segment mentioned above in * lguest_load_tr_desc(), but to help hypervisors there's this special call. * * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data * segment), the privilege level (we're privilege level 1, the Host is 0 and * will not tolerate us trying to use that), the stack pointer, and the number - * of pages in the stack. */ + * of pages in the stack. + */ static void lguest_load_sp0(struct tss_struct *tss, struct thread_struct *thread) { @@ -813,7 +1033,8 @@ static void lguest_set_debugreg(int regno, unsigned long value) /* FIXME: Implement */ } -/* There are times when the kernel wants to make sure that no memory writes are +/* + * There are times when the kernel wants to make sure that no memory writes are * caught in the cache (that they've all reached real hardware devices). This * doesn't matter for the Guest which has virtual hardware. * @@ -827,11 +1048,13 @@ static void lguest_wbinvd(void) { } -/* If the Guest expects to have an Advanced Programmable Interrupt Controller, +/* + * If the Guest expects to have an Advanced Programmable Interrupt Controller, * we play dumb by ignoring writes and returning 0 for reads. So it's no * longer Programmable nor Controlling anything, and I don't think 8 lines of * code qualifies for Advanced. It will also never interrupt anything. It - * does, however, allow us to get through the Linux boot code. */ + * does, however, allow us to get through the Linux boot code. + */ #ifdef CONFIG_X86_LOCAL_APIC static void lguest_apic_write(u32 reg, u32 v) { @@ -880,11 +1103,13 @@ static void lguest_safe_halt(void) kvm_hypercall0(LHCALL_HALT); } -/* The SHUTDOWN hypercall takes a string to describe what's happening, and +/* + * The SHUTDOWN hypercall takes a string to describe what's happening, and * an argument which says whether this to restart (reboot) the Guest or not. * * Note that the Host always prefers that the Guest speak in physical addresses - * rather than virtual addresses, so we use __pa() here. */ + * rather than virtual addresses, so we use __pa() here. + */ static void lguest_power_off(void) { kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), @@ -915,8 +1140,10 @@ static __init char *lguest_memory_setup(void) * nice to move it back to lguest_init. Patch welcome... */ atomic_notifier_chain_register(&panic_notifier_list, &paniced); - /* The Linux bootloader header contains an "e820" memory map: the - * Launcher populated the first entry with our memory limit. */ + /* + *The Linux bootloader header contains an "e820" memory map: the + * Launcher populated the first entry with our memory limit. + */ e820_add_region(boot_params.e820_map[0].addr, boot_params.e820_map[0].size, boot_params.e820_map[0].type); @@ -925,16 +1152,17 @@ static __init char *lguest_memory_setup(void) return "LGUEST"; } -/* We will eventually use the virtio console device to produce console output, +/* + * We will eventually use the virtio console device to produce console output, * but before that is set up we use LHCALL_NOTIFY on normal memory to produce - * console output. */ + * console output. + */ static __init int early_put_chars(u32 vtermno, const char *buf, int count) { char scratch[17]; unsigned int len = count; - /* We use a nul-terminated string, so we have to make a copy. Icky, - * huh? */ + /* We use a nul-terminated string, so we make a copy. Icky, huh? */ if (len > sizeof(scratch) - 1) len = sizeof(scratch) - 1; scratch[len] = '\0'; @@ -945,8 +1173,10 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) return len; } -/* Rebooting also tells the Host we're finished, but the RESTART flag tells the - * Launcher to reboot us. */ +/* + * Rebooting also tells the Host we're finished, but the RESTART flag tells the + * Launcher to reboot us. + */ static void lguest_restart(char *reason) { kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); @@ -967,13 +1197,14 @@ static void lguest_restart(char *reason) * * Our current solution is to allow the paravirt back end to optionally patch * over the indirect calls to replace them with something more efficient. We - * patch the four most commonly called functions: disable interrupts, enable - * interrupts, restore interrupts and save interrupts. We usually have 6 or 10 - * bytes to patch into: the Guest versions of these operations are small enough - * that we can fit comfortably. + * patch two of the simplest of the most commonly called functions: disable + * interrupts and save interrupts. We usually have 6 or 10 bytes to patch + * into: the Guest versions of these operations are small enough that we can + * fit comfortably. * * First we need assembly templates of each of the patchable Guest operations, - * and these are in i386_head.S. */ + * and these are in i386_head.S. + */ /*G:060 We construct a table from the assembler templates: */ static const struct lguest_insns @@ -981,14 +1212,14 @@ static const struct lguest_insns const char *start, *end; } lguest_insns[] = { [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, - [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti }, - [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf }, [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, }; -/* Now our patch routine is fairly simple (based on the native one in +/* + * Now our patch routine is fairly simple (based on the native one in * paravirt.c). If we have a replacement, we copy it in and return how much of - * the available space we used. */ + * the available space we used. + */ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, unsigned long addr, unsigned len) { @@ -1000,8 +1231,7 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, insn_len = lguest_insns[type].end - lguest_insns[type].start; - /* Similarly if we can't fit replacement (shouldn't happen, but let's - * be thorough). */ + /* Similarly if it can't fit (doesn't happen, but let's be thorough). */ if (len < insn_len) return paravirt_patch_default(type, clobber, ibuf, addr, len); @@ -1010,33 +1240,40 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, return insn_len; } -/*G:030 Once we get to lguest_init(), we know we're a Guest. The various +/*G:029 + * Once we get to lguest_init(), we know we're a Guest. The various * pv_ops structures in the kernel provide points for (almost) every routine we - * have to override to avoid privileged instructions. */ + * have to override to avoid privileged instructions. + */ __init void lguest_init(void) { - /* We're under lguest, paravirt is enabled, and we're running at - * privilege level 1, not 0 as normal. */ + /* We're under lguest. */ pv_info.name = "lguest"; + /* Paravirt is enabled. */ pv_info.paravirt_enabled = 1; + /* We're running at privilege level 1, not 0 as normal. */ pv_info.kernel_rpl = 1; + /* Everyone except Xen runs with this set. */ + pv_info.shared_kernel_pmd = 1; - /* We set up all the lguest overrides for sensitive operations. These - * are detailed with the operations themselves. */ + /* + * We set up all the lguest overrides for sensitive operations. These + * are detailed with the operations themselves. + */ - /* interrupt-related operations */ + /* Interrupt-related operations */ pv_irq_ops.init_IRQ = lguest_init_IRQ; pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); - pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl); + pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); - pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable); + pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); pv_irq_ops.safe_halt = lguest_safe_halt; - /* init-time operations */ + /* Setup operations */ pv_init_ops.memory_setup = lguest_memory_setup; pv_init_ops.patch = lguest_patch; - /* Intercepts of various cpu instructions */ + /* Intercepts of various CPU instructions */ pv_cpu_ops.load_gdt = lguest_load_gdt; pv_cpu_ops.cpuid = lguest_cpuid; pv_cpu_ops.load_idt = lguest_load_idt; @@ -1054,10 +1291,10 @@ __init void lguest_init(void) pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry; pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; pv_cpu_ops.wbinvd = lguest_wbinvd; - pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu; - pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode; + pv_cpu_ops.start_context_switch = paravirt_start_context_switch; + pv_cpu_ops.end_context_switch = lguest_end_context_switch; - /* pagetable management */ + /* Pagetable management */ pv_mmu_ops.write_cr3 = lguest_write_cr3; pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; @@ -1065,62 +1302,85 @@ __init void lguest_init(void) pv_mmu_ops.set_pte = lguest_set_pte; pv_mmu_ops.set_pte_at = lguest_set_pte_at; pv_mmu_ops.set_pmd = lguest_set_pmd; +#ifdef CONFIG_X86_PAE + pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic; + pv_mmu_ops.pte_clear = lguest_pte_clear; + pv_mmu_ops.pmd_clear = lguest_pmd_clear; + pv_mmu_ops.set_pud = lguest_set_pud; +#endif pv_mmu_ops.read_cr2 = lguest_read_cr2; pv_mmu_ops.read_cr3 = lguest_read_cr3; pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; - pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode; + pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode; pv_mmu_ops.pte_update = lguest_pte_update; pv_mmu_ops.pte_update_defer = lguest_pte_update; #ifdef CONFIG_X86_LOCAL_APIC - /* apic read/write intercepts */ + /* APIC read/write intercepts */ set_lguest_basic_apic_ops(); #endif - /* time operations */ + /* Time operations */ pv_time_ops.get_wallclock = lguest_get_wallclock; pv_time_ops.time_init = lguest_time_init; pv_time_ops.get_tsc_khz = lguest_tsc_khz; - /* Now is a good time to look at the implementations of these functions - * before returning to the rest of lguest_init(). */ + /* + * Now is a good time to look at the implementations of these functions + * before returning to the rest of lguest_init(). + */ - /*G:070 Now we've seen all the paravirt_ops, we return to + /*G:070 + * Now we've seen all the paravirt_ops, we return to * lguest_init() where the rest of the fairly chaotic boot setup - * occurs. */ + * occurs. + */ - /* The stack protector is a weird thing where gcc places a canary + /* + * The stack protector is a weird thing where gcc places a canary * value on the stack and then checks it on return. This file is * compiled with -fno-stack-protector it, so we got this far without * problems. The value of the canary is kept at offset 20 from the * %gs register, so we need to set that up before calling C functions - * in other files. */ + * in other files. + */ setup_stack_canary_segment(0); - /* We could just call load_stack_canary_segment(), but we might as - * call switch_to_new_gdt() which loads the whole table and sets up - * the per-cpu segment descriptor register %fs as well. */ + + /* + * We could just call load_stack_canary_segment(), but we might as well + * call switch_to_new_gdt() which loads the whole table and sets up the + * per-cpu segment descriptor register %fs as well. + */ switch_to_new_gdt(0); - /* As described in head_32.S, we map the first 128M of memory. */ + /* We actually boot with all memory mapped, but let's say 128MB. */ max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; - /* The Host<->Guest Switcher lives at the top of our address space, and + /* + * The Host<->Guest Switcher lives at the top of our address space, and * the Host told us how big it is when we made LGUEST_INIT hypercall: - * it put the answer in lguest_data.reserve_mem */ + * it put the answer in lguest_data.reserve_mem + */ reserve_top_address(lguest_data.reserve_mem); - /* If we don't initialize the lock dependency checker now, it crashes - * paravirt_disable_iospace. */ + /* + * If we don't initialize the lock dependency checker now, it crashes + * paravirt_disable_iospace. + */ lockdep_init(); - /* The IDE code spends about 3 seconds probing for disks: if we reserve + /* + * The IDE code spends about 3 seconds probing for disks: if we reserve * all the I/O ports up front it can't get them and so doesn't probe. * Other device drivers are similar (but less severe). This cuts the - * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */ + * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. + */ paravirt_disable_iospace(); - /* This is messy CPU setup stuff which the native boot code does before - * start_kernel, so we have to do, too: */ + /* + * This is messy CPU setup stuff which the native boot code does before + * start_kernel, so we have to do, too: + */ cpu_detect(&new_cpu_data); /* head.S usually sets up the first capability word, so do it here. */ new_cpu_data.x86_capability[0] = cpuid_edx(1); @@ -1137,22 +1397,28 @@ __init void lguest_init(void) acpi_ht = 0; #endif - /* We set the preferred console to "hvc". This is the "hypervisor + /* + * We set the preferred console to "hvc". This is the "hypervisor * virtual console" driver written by the PowerPC people, which we also - * adapted for lguest's use. */ + * adapted for lguest's use. + */ add_preferred_console("hvc", 0, NULL); /* Register our very early console. */ virtio_cons_early_init(early_put_chars); - /* Last of all, we set the power management poweroff hook to point to + /* + * Last of all, we set the power management poweroff hook to point to * the Guest routine to power off, and the reboot hook to our restart - * routine. */ + * routine. + */ pm_power_off = lguest_power_off; machine_ops.restart = lguest_restart; - /* Now we're set up, call i386_start_kernel() in head32.c and we proceed - * to boot as normal. It never returns. */ + /* + * Now we're set up, call i386_start_kernel() in head32.c and we proceed + * to boot as normal. It never returns. + */ i386_start_kernel(); } /* diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index f7954198947..27eac0faee4 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -5,7 +5,8 @@ #include <asm/thread_info.h> #include <asm/processor-flags.h> -/*G:020 Our story starts with the kernel booting into startup_32 in +/*G:020 + * Our story starts with the kernel booting into startup_32 in * arch/x86/kernel/head_32.S. It expects a boot header, which is created by * the bootloader (the Launcher in our case). * @@ -21,11 +22,14 @@ * data without remembering to subtract __PAGE_OFFSET! * * The .section line puts this code in .init.text so it will be discarded after - * boot. */ + * boot. + */ .section .init.text, "ax", @progbits ENTRY(lguest_entry) - /* We make the "initialization" hypercall now to tell the Host about - * us, and also find out where it put our page tables. */ + /* + * We make the "initialization" hypercall now to tell the Host about + * us, and also find out where it put our page tables. + */ movl $LHCALL_LGUEST_INIT, %eax movl $lguest_data - __PAGE_OFFSET, %ebx .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ @@ -33,44 +37,119 @@ ENTRY(lguest_entry) /* Set up the initial stack so we can run C code. */ movl $(init_thread_union+THREAD_SIZE),%esp - /* Jumps are relative, and we're running __PAGE_OFFSET too low at the - * moment. */ + /* Jumps are relative: we're running __PAGE_OFFSET too low. */ jmp lguest_init+__PAGE_OFFSET -/*G:055 We create a macro which puts the assembler code between lgstart_ and - * lgend_ markers. These templates are put in the .text section: they can't be - * discarded after boot as we may need to patch modules, too. */ +/*G:055 + * We create a macro which puts the assembler code between lgstart_ and lgend_ + * markers. These templates are put in the .text section: they can't be + * discarded after boot as we may need to patch modules, too. + */ .text #define LGUEST_PATCH(name, insns...) \ lgstart_##name: insns; lgend_##name:; \ .globl lgstart_##name; .globl lgend_##name LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) -LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled) -LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled) LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) + +/*G:033 + * But using those wrappers is inefficient (we'll see why that doesn't matter + * for save_fl and irq_disable later). If we write our routines carefully in + * assembler, we can avoid clobbering any registers and avoid jumping through + * the wrapper functions. + * + * I skipped over our first piece of assembler, but this one is worth studying + * in a bit more detail so I'll describe in easy stages. First, the routine to + * enable interrupts: + */ +ENTRY(lg_irq_enable) + /* + * The reverse of irq_disable, this sets lguest_data.irq_enabled to + * X86_EFLAGS_IF (ie. "Interrupts enabled"). + */ + movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled + /* + * But now we need to check if the Host wants to know: there might have + * been interrupts waiting to be delivered, in which case it will have + * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we + * jump to send_interrupts, otherwise we're done. + */ + testl $0, lguest_data+LGUEST_DATA_irq_pending + jnz send_interrupts + /* + * One cool thing about x86 is that you can do many things without using + * a register. In this case, the normal path hasn't needed to save or + * restore any registers at all! + */ + ret +send_interrupts: + /* + * OK, now we need a register: eax is used for the hypercall number, + * which is LHCALL_SEND_INTERRUPTS. + * + * We used not to bother with this pending detection at all, which was + * much simpler. Sooner or later the Host would realize it had to + * send us an interrupt. But that turns out to make performance 7 + * times worse on a simple tcp benchmark. So now we do this the hard + * way. + */ + pushl %eax + movl $LHCALL_SEND_INTERRUPTS, %eax + /* + * This is a vmcall instruction (same thing that KVM uses). Older + * assembler versions might not know the "vmcall" instruction, so we + * create one manually here. + */ + .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ + /* Put eax back the way we found it. */ + popl %eax + ret + +/* + * Finally, the "popf" or "restore flags" routine. The %eax register holds the + * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're + * enabling interrupts again, if it's 0 we're leaving them off. + */ +ENTRY(lg_restore_fl) + /* This is just "lguest_data.irq_enabled = flags;" */ + movl %eax, lguest_data+LGUEST_DATA_irq_enabled + /* + * Now, if the %eax value has enabled interrupts and + * lguest_data.irq_pending is set, we want to tell the Host so it can + * deliver any outstanding interrupts. Fortunately, both values will + * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" + * instruction will AND them together for us. If both are set, we + * jump to send_interrupts. + */ + testl lguest_data+LGUEST_DATA_irq_pending, %eax + jnz send_interrupts + /* Again, the normal path has used no extra registers. Clever, huh? */ + ret /*:*/ /* These demark the EIP range where host should never deliver interrupts. */ .global lguest_noirq_start .global lguest_noirq_end -/*M:004 When the Host reflects a trap or injects an interrupt into the Guest, - * it sets the eflags interrupt bit on the stack based on - * lguest_data.irq_enabled, so the Guest iret logic does the right thing when - * restoring it. However, when the Host sets the Guest up for direct traps, - * such as system calls, the processor is the one to push eflags onto the - * stack, and the interrupt bit will be 1 (in reality, interrupts are always - * enabled in the Guest). +/*M:004 + * When the Host reflects a trap or injects an interrupt into the Guest, it + * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled, + * so the Guest iret logic does the right thing when restoring it. However, + * when the Host sets the Guest up for direct traps, such as system calls, the + * processor is the one to push eflags onto the stack, and the interrupt bit + * will be 1 (in reality, interrupts are always enabled in the Guest). * * This turns out to be harmless: the only trap which should happen under Linux * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc * regions), which has to be reflected through the Host anyway. If another * trap *does* go off when interrupts are disabled, the Guest will panic, and - * we'll never get to this iret! :*/ + * we'll never get to this iret! +:*/ -/*G:045 There is one final paravirt_op that the Guest implements, and glancing - * at it you can see why I left it to last. It's *cool*! It's in *assembler*! +/*G:045 + * There is one final paravirt_op that the Guest implements, and glancing at it + * you can see why I left it to last. It's *cool*! It's in *assembler*! * * The "iret" instruction is used to return from an interrupt or trap. The * stack looks like this: @@ -94,15 +173,18 @@ LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) * return to userspace or wherever. Our solution to this is to surround the * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the * Host that it is *never* to interrupt us there, even if interrupts seem to be - * enabled. */ + * enabled. + */ ENTRY(lguest_iret) pushl %eax movl 12(%esp), %eax lguest_noirq_start: - /* Note the %ss: segment prefix here. Normal data accesses use the + /* + * Note the %ss: segment prefix here. Normal data accesses use the * "ds" segment, but that will have already been restored for whatever * we're returning to (such as userspace): we can't trust it. The %ss: - * prefix makes sure we use the stack segment, which is still valid. */ + * prefix makes sure we use the stack segment, which is still valid. + */ movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled popl %eax iret diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 55e11aa6d66..07c31899c9c 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -2,7 +2,7 @@ # Makefile for x86 specific library files. # -obj-$(CONFIG_SMP) := msr-on-cpu.o +obj-$(CONFIG_SMP) := msr.o lib-y := delay.o lib-y += thunk_$(BITS).o @@ -10,6 +10,7 @@ lib-y += usercopy_$(BITS).o getuser.o putuser.o lib-y += memcpy_$(BITS).o ifeq ($(CONFIG_X86_32),y) + obj-y += atomic64_32.o lib-y += checksum_32.o lib-y += strstr_32.o lib-y += semaphore_32.o string_32.o diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c new file mode 100644 index 00000000000..824fa0be55a --- /dev/null +++ b/arch/x86/lib/atomic64_32.c @@ -0,0 +1,230 @@ +#include <linux/compiler.h> +#include <linux/module.h> +#include <linux/types.h> + +#include <asm/processor.h> +#include <asm/cmpxchg.h> +#include <asm/atomic.h> + +static noinline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new) +{ + u32 low = new; + u32 high = new >> 32; + + asm volatile( + LOCK_PREFIX "cmpxchg8b %1\n" + : "+A" (old), "+m" (*ptr) + : "b" (low), "c" (high) + ); + return old; +} + +u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val) +{ + return cmpxchg8b(&ptr->counter, old_val, new_val); +} +EXPORT_SYMBOL(atomic64_cmpxchg); + +/** + * atomic64_xchg - xchg atomic64 variable + * @ptr: pointer to type atomic64_t + * @new_val: value to assign + * + * Atomically xchgs the value of @ptr to @new_val and returns + * the old value. + */ +u64 atomic64_xchg(atomic64_t *ptr, u64 new_val) +{ + /* + * Try first with a (possibly incorrect) assumption about + * what we have there. We'll do two loops most likely, + * but we'll get an ownership MESI transaction straight away + * instead of a read transaction followed by a + * flush-for-ownership transaction: + */ + u64 old_val, real_val = 0; + + do { + old_val = real_val; + + real_val = atomic64_cmpxchg(ptr, old_val, new_val); + + } while (real_val != old_val); + + return old_val; +} +EXPORT_SYMBOL(atomic64_xchg); + +/** + * atomic64_set - set atomic64 variable + * @ptr: pointer to type atomic64_t + * @new_val: value to assign + * + * Atomically sets the value of @ptr to @new_val. + */ +void atomic64_set(atomic64_t *ptr, u64 new_val) +{ + atomic64_xchg(ptr, new_val); +} +EXPORT_SYMBOL(atomic64_set); + +/** +EXPORT_SYMBOL(atomic64_read); + * atomic64_add_return - add and return + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr and returns @delta + *@ptr + */ +noinline u64 atomic64_add_return(u64 delta, atomic64_t *ptr) +{ + /* + * Try first with a (possibly incorrect) assumption about + * what we have there. We'll do two loops most likely, + * but we'll get an ownership MESI transaction straight away + * instead of a read transaction followed by a + * flush-for-ownership transaction: + */ + u64 old_val, new_val, real_val = 0; + + do { + old_val = real_val; + new_val = old_val + delta; + + real_val = atomic64_cmpxchg(ptr, old_val, new_val); + + } while (real_val != old_val); + + return new_val; +} +EXPORT_SYMBOL(atomic64_add_return); + +u64 atomic64_sub_return(u64 delta, atomic64_t *ptr) +{ + return atomic64_add_return(-delta, ptr); +} +EXPORT_SYMBOL(atomic64_sub_return); + +u64 atomic64_inc_return(atomic64_t *ptr) +{ + return atomic64_add_return(1, ptr); +} +EXPORT_SYMBOL(atomic64_inc_return); + +u64 atomic64_dec_return(atomic64_t *ptr) +{ + return atomic64_sub_return(1, ptr); +} +EXPORT_SYMBOL(atomic64_dec_return); + +/** + * atomic64_add - add integer to atomic64 variable + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr. + */ +void atomic64_add(u64 delta, atomic64_t *ptr) +{ + atomic64_add_return(delta, ptr); +} +EXPORT_SYMBOL(atomic64_add); + +/** + * atomic64_sub - subtract the atomic64 variable + * @delta: integer value to subtract + * @ptr: pointer to type atomic64_t + * + * Atomically subtracts @delta from @ptr. + */ +void atomic64_sub(u64 delta, atomic64_t *ptr) +{ + atomic64_add(-delta, ptr); +} +EXPORT_SYMBOL(atomic64_sub); + +/** + * atomic64_sub_and_test - subtract value from variable and test result + * @delta: integer value to subtract + * @ptr: pointer to type atomic64_t + * + * Atomically subtracts @delta from @ptr and returns + * true if the result is zero, or false for all + * other cases. + */ +int atomic64_sub_and_test(u64 delta, atomic64_t *ptr) +{ + u64 new_val = atomic64_sub_return(delta, ptr); + + return new_val == 0; +} +EXPORT_SYMBOL(atomic64_sub_and_test); + +/** + * atomic64_inc - increment atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically increments @ptr by 1. + */ +void atomic64_inc(atomic64_t *ptr) +{ + atomic64_add(1, ptr); +} +EXPORT_SYMBOL(atomic64_inc); + +/** + * atomic64_dec - decrement atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically decrements @ptr by 1. + */ +void atomic64_dec(atomic64_t *ptr) +{ + atomic64_sub(1, ptr); +} +EXPORT_SYMBOL(atomic64_dec); + +/** + * atomic64_dec_and_test - decrement and test + * @ptr: pointer to type atomic64_t + * + * Atomically decrements @ptr by 1 and + * returns true if the result is 0, or false for all other + * cases. + */ +int atomic64_dec_and_test(atomic64_t *ptr) +{ + return atomic64_sub_and_test(1, ptr); +} +EXPORT_SYMBOL(atomic64_dec_and_test); + +/** + * atomic64_inc_and_test - increment and test + * @ptr: pointer to type atomic64_t + * + * Atomically increments @ptr by 1 + * and returns true if the result is zero, or false for all + * other cases. + */ +int atomic64_inc_and_test(atomic64_t *ptr) +{ + return atomic64_sub_and_test(-1, ptr); +} +EXPORT_SYMBOL(atomic64_inc_and_test); + +/** + * atomic64_add_negative - add and test if negative + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. + */ +int atomic64_add_negative(u64 delta, atomic64_t *ptr) +{ + s64 new_val = atomic64_add_return(delta, ptr); + + return new_val < 0; +} +EXPORT_SYMBOL(atomic64_add_negative); diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index 9a10a78bb4a..ebeafcce04a 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -5,15 +5,14 @@ * Zero a page. * rdi page */ - ALIGN -clear_page_c: +ENTRY(clear_page_c) CFI_STARTPROC movl $4096/8,%ecx xorl %eax,%eax rep stosq ret CFI_ENDPROC -ENDPROC(clear_page) +ENDPROC(clear_page_c) ENTRY(clear_page) CFI_STARTPROC diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index f118c110af3..6ba0f7bb85e 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -75,6 +75,7 @@ ENTRY(copy_to_user) jae bad_to_user ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string CFI_ENDPROC +ENDPROC(copy_to_user) /* Standard copy_from_user with segment limit checking */ ENTRY(copy_from_user) diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index f4568605d7d..ff485d36118 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -55,8 +55,10 @@ static void delay_tsc(unsigned long loops) preempt_disable(); cpu = smp_processor_id(); + rdtsc_barrier(); rdtscl(bclock); for (;;) { + rdtsc_barrier(); rdtscl(now); if ((now - bclock) >= loops) break; @@ -78,6 +80,7 @@ static void delay_tsc(unsigned long loops) if (unlikely(cpu != smp_processor_id())) { loops -= (now - bclock); cpu = smp_processor_id(); + rdtsc_barrier(); rdtscl(bclock); } } diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c deleted file mode 100644 index 321cf720dbb..00000000000 --- a/arch/x86/lib/msr-on-cpu.c +++ /dev/null @@ -1,97 +0,0 @@ -#include <linux/module.h> -#include <linux/preempt.h> -#include <linux/smp.h> -#include <asm/msr.h> - -struct msr_info { - u32 msr_no; - u32 l, h; - int err; -}; - -static void __rdmsr_on_cpu(void *info) -{ - struct msr_info *rv = info; - - rdmsr(rv->msr_no, rv->l, rv->h); -} - -static void __wrmsr_on_cpu(void *info) -{ - struct msr_info *rv = info; - - wrmsr(rv->msr_no, rv->l, rv->h); -} - -int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) -{ - int err; - struct msr_info rv; - - rv.msr_no = msr_no; - err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); - *l = rv.l; - *h = rv.h; - - return err; -} - -int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) -{ - int err; - struct msr_info rv; - - rv.msr_no = msr_no; - rv.l = l; - rv.h = h; - err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); - - return err; -} - -/* These "safe" variants are slower and should be used when the target MSR - may not actually exist. */ -static void __rdmsr_safe_on_cpu(void *info) -{ - struct msr_info *rv = info; - - rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h); -} - -static void __wrmsr_safe_on_cpu(void *info) -{ - struct msr_info *rv = info; - - rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h); -} - -int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) -{ - int err; - struct msr_info rv; - - rv.msr_no = msr_no; - err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); - *l = rv.l; - *h = rv.h; - - return err ? err : rv.err; -} - -int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) -{ - int err; - struct msr_info rv; - - rv.msr_no = msr_no; - rv.l = l; - rv.h = h; - err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); - - return err ? err : rv.err; -} - -EXPORT_SYMBOL(rdmsr_on_cpu); -EXPORT_SYMBOL(wrmsr_on_cpu); -EXPORT_SYMBOL(rdmsr_safe_on_cpu); -EXPORT_SYMBOL(wrmsr_safe_on_cpu); diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c new file mode 100644 index 00000000000..caa24aca811 --- /dev/null +++ b/arch/x86/lib/msr.c @@ -0,0 +1,177 @@ +#include <linux/module.h> +#include <linux/preempt.h> +#include <linux/smp.h> +#include <asm/msr.h> + +struct msr_info { + u32 msr_no; + struct msr reg; + struct msr *msrs; + int off; + int err; +}; + +static void __rdmsr_on_cpu(void *info) +{ + struct msr_info *rv = info; + struct msr *reg; + int this_cpu = raw_smp_processor_id(); + + if (rv->msrs) + reg = &rv->msrs[this_cpu - rv->off]; + else + reg = &rv->reg; + + rdmsr(rv->msr_no, reg->l, reg->h); +} + +static void __wrmsr_on_cpu(void *info) +{ + struct msr_info *rv = info; + struct msr *reg; + int this_cpu = raw_smp_processor_id(); + + if (rv->msrs) + reg = &rv->msrs[this_cpu - rv->off]; + else + reg = &rv->reg; + + wrmsr(rv->msr_no, reg->l, reg->h); +} + +int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); + *l = rv.reg.l; + *h = rv.reg.h; + + return err; +} +EXPORT_SYMBOL(rdmsr_on_cpu); + +int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + rv.reg.l = l; + rv.reg.h = h; + err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); + + return err; +} +EXPORT_SYMBOL(wrmsr_on_cpu); + +/* rdmsr on a bunch of CPUs + * + * @mask: which CPUs + * @msr_no: which MSR + * @msrs: array of MSR values + * + */ +void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs) +{ + struct msr_info rv; + int this_cpu; + + memset(&rv, 0, sizeof(rv)); + + rv.off = cpumask_first(mask); + rv.msrs = msrs; + rv.msr_no = msr_no; + + this_cpu = get_cpu(); + + if (cpumask_test_cpu(this_cpu, mask)) + __rdmsr_on_cpu(&rv); + + smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1); + put_cpu(); +} +EXPORT_SYMBOL(rdmsr_on_cpus); + +/* + * wrmsr on a bunch of CPUs + * + * @mask: which CPUs + * @msr_no: which MSR + * @msrs: array of MSR values + * + */ +void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs) +{ + struct msr_info rv; + int this_cpu; + + memset(&rv, 0, sizeof(rv)); + + rv.off = cpumask_first(mask); + rv.msrs = msrs; + rv.msr_no = msr_no; + + this_cpu = get_cpu(); + + if (cpumask_test_cpu(this_cpu, mask)) + __wrmsr_on_cpu(&rv); + + smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1); + put_cpu(); +} +EXPORT_SYMBOL(wrmsr_on_cpus); + +/* These "safe" variants are slower and should be used when the target MSR + may not actually exist. */ +static void __rdmsr_safe_on_cpu(void *info) +{ + struct msr_info *rv = info; + + rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h); +} + +static void __wrmsr_safe_on_cpu(void *info) +{ + struct msr_info *rv = info; + + rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h); +} + +int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); + *l = rv.reg.l; + *h = rv.reg.h; + + return err ? err : rv.err; +} +EXPORT_SYMBOL(rdmsr_safe_on_cpu); + +int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + rv.reg.l = l; + rv.reg.h = h; + err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); + + return err ? err : rv.err; +} +EXPORT_SYMBOL(wrmsr_safe_on_cpu); diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 7c8ca91bb9e..1f118d462ac 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -751,7 +751,7 @@ survive: if (retval == -ENOMEM && is_global_init(current)) { up_read(¤t->mm->mmap_sem); - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); goto survive; } diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index ec13cb5f17e..b7c2849ffb6 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -127,7 +127,7 @@ EXPORT_SYMBOL(__strnlen_user); long strnlen_user(const char __user *s, long n) { - if (!access_ok(VERIFY_READ, s, n)) + if (!access_ok(VERIFY_READ, s, 1)) return 0; return __strnlen_user(s, n); } diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index fdd30d08ab5..eefdeee8a87 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -10,6 +10,8 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o +obj-$(CONFIG_KMEMCHECK) += kmemcheck/ + obj-$(CONFIG_MMIOTRACE) += mmiotrace.o mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index e7277cbcfb4..a725b7f760a 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -161,13 +161,14 @@ static void note_page(struct seq_file *m, struct pg_state *st, st->current_address >= st->marker[1].start_address) { const char *unit = units; unsigned long delta; + int width = sizeof(unsigned long) * 2; /* * Now print the actual finished series */ - seq_printf(m, "0x%p-0x%p ", - (void *)st->start_address, - (void *)st->current_address); + seq_printf(m, "0x%0*lx-0x%0*lx ", + width, st->start_address, + width, st->current_address); delta = (st->current_address - st->start_address) >> 10; while (!(delta & 1023) && unit[1]) { diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a03b7279efa..bfae139182f 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -3,40 +3,18 @@ * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ -#include <linux/interrupt.h> -#include <linux/mmiotrace.h> -#include <linux/bootmem.h> -#include <linux/compiler.h> -#include <linux/highmem.h> -#include <linux/kprobes.h> -#include <linux/uaccess.h> -#include <linux/vmalloc.h> -#include <linux/vt_kern.h> -#include <linux/signal.h> -#include <linux/kernel.h> -#include <linux/ptrace.h> -#include <linux/string.h> -#include <linux/module.h> -#include <linux/kdebug.h> -#include <linux/errno.h> -#include <linux/magic.h> -#include <linux/sched.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/mman.h> -#include <linux/tty.h> -#include <linux/smp.h> -#include <linux/mm.h> - -#include <asm-generic/sections.h> - -#include <asm/tlbflush.h> -#include <asm/pgalloc.h> -#include <asm/segment.h> -#include <asm/system.h> -#include <asm/proto.h> -#include <asm/traps.h> -#include <asm/desc.h> +#include <linux/magic.h> /* STACK_END_MAGIC */ +#include <linux/sched.h> /* test_thread_flag(), ... */ +#include <linux/kdebug.h> /* oops_begin/end, ... */ +#include <linux/module.h> /* search_exception_table */ +#include <linux/bootmem.h> /* max_low_pfn */ +#include <linux/kprobes.h> /* __kprobes, ... */ +#include <linux/mmiotrace.h> /* kmmio_handler, ... */ +#include <linux/perf_counter.h> /* perf_swcounter_event */ + +#include <asm/traps.h> /* dotraplinkage, ... */ +#include <asm/pgalloc.h> /* pgd_*(), ... */ +#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ /* * Page fault error code bits: @@ -225,12 +203,10 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) if (!pmd_present(*pmd_k)) return NULL; - if (!pmd_present(*pmd)) { + if (!pmd_present(*pmd)) set_pmd(pmd, *pmd_k); - arch_flush_lazy_mmu_mode(); - } else { + else BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); - } return pmd_k; } @@ -450,10 +426,11 @@ static noinline int vmalloc_fault(unsigned long address) } static const char errata93_warning[] = -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" -KERN_ERR "******* Please consider a BIOS update.\n" -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; +KERN_ERR +"******* Your BIOS seems to not contain a fix for K8 errata #93\n" +"******* Working around it, but it may cause SEGVs or burn power.\n" +"******* Please consider a BIOS update.\n" +"******* Disabling USB legacy in the BIOS may also help.\n"; /* * No vm86 mode in 64-bit mode: @@ -538,8 +515,6 @@ bad: static int is_errata93(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 - static int once; - if (address != regs->ip) return 0; @@ -549,10 +524,7 @@ static int is_errata93(struct pt_regs *regs, unsigned long address) address |= 0xffffffffUL << 32; if ((address >= (u64)_stext && address <= (u64)_etext) || (address >= MODULES_VADDR && address <= MODULES_END)) { - if (!once) { - printk(errata93_warning); - once = 1; - } + printk_once(errata93_warning); regs->ip = address; return 1; } @@ -725,7 +697,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, if (!printk_ratelimit()) return; - printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), address, (void *)regs->ip, (void *)regs->sp, error_code); @@ -981,11 +953,17 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) tsk = current; mm = tsk->mm; - prefetchw(&mm->mmap_sem); - /* Get the faulting address: */ address = read_cr2(); + /* + * Detect and handle instructions that would cause a page fault for + * both a tracked kernel page and a userspace page. + */ + if (kmemcheck_active(regs)) + kmemcheck_hide(regs); + prefetchw(&mm->mmap_sem); + if (unlikely(kmmio_fault(regs, address))) return; @@ -1003,9 +981,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) * protection error (error_code & 9) == 0. */ if (unlikely(fault_in_kernel_space(address))) { - if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && - vmalloc_fault(address) >= 0) - return; + if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) { + if (vmalloc_fault(address) >= 0) + return; + + if (kmemcheck_fault(regs, address, error_code)) + return; + } /* Can handle a stale RO->RW TLB: */ if (spurious_fault(error_code, address)) @@ -1044,6 +1026,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + /* * If we're in an interrupt, have no user context or are running * in an atomic region then we must not take the fault: @@ -1130,17 +1114,22 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault: */ - fault = handle_mm_fault(mm, vma, address, write); + fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); return; } - if (fault & VM_FAULT_MAJOR) + if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - else + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + regs, address); + } else { tsk->min_flt++; + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + regs, address); + } check_v8086_mode(regs, address, tsk); diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 6340cef6798..71da1bca13c 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -14,7 +14,7 @@ static inline pte_t gup_get_pte(pte_t *ptep) { #ifndef CONFIG_X86_PAE - return *ptep; + return ACCESS_ONCE(*ptep); #else /* * With get_user_pages_fast, we walk down the pagetables without taking @@ -219,6 +219,62 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, return 1; } +/* + * Like get_user_pages_fast() except its IRQ-safe in that it won't fall + * back to the regular GUP. + */ +int __get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + unsigned long addr, len, end; + unsigned long next; + unsigned long flags; + pgd_t *pgdp; + int nr = 0; + + start &= PAGE_MASK; + addr = start; + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, + (void __user *)start, len))) + return 0; + + /* + * XXX: batch / limit 'nr', to avoid large irq off latency + * needs some instrumenting to determine the common sizes used by + * important workloads (eg. DB2), and whether limiting the batch size + * will decrease performance. + * + * It seems like we're in the clear for the moment. Direct-IO is + * the main guy that batches up lots of get_user_pages, and even + * they are limited to 64-at-a-time which is not so many. + */ + /* + * This doesn't prevent pagetable teardown, but does prevent + * the pagetables and pages from being freed on x86. + * + * So long as we atomically load page table pointers versus teardown + * (which we do on x86, with the above PAE exception), we can follow the + * address down to the the page and take a ref on it. + */ + local_irq_save(flags); + pgdp = pgd_offset(mm, addr); + do { + pgd_t pgd = *pgdp; + + next = pgd_addr_end(addr, end); + if (pgd_none(pgd)) + break; + if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + break; + } while (pgdp++, addr = next, addr != end); + local_irq_restore(flags); + + return nr; +} + /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address @@ -247,11 +303,16 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, start &= PAGE_MASK; addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; - if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, - (void __user *)start, len))) + if (end < start) goto slow_irqon; +#ifdef CONFIG_X86_64 + if (end >> __VIRTUAL_MASK_SHIFT) + goto slow_irqon; +#endif + /* * XXX: batch / limit 'nr', to avoid large irq off latency * needs some instrumenting to determine the common sizes used by diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 8126e8d1a2a..2112ed55e7e 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -44,7 +44,6 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte-idx))); set_pte(kmap_pte-idx, mk_pte(page, prot)); - arch_flush_lazy_mmu_mode(); return (void *)vaddr; } @@ -74,7 +73,6 @@ void kunmap_atomic(void *kvaddr, enum km_type type) #endif } - arch_flush_lazy_mmu_mode(); pagefault_enable(); } @@ -105,6 +103,7 @@ EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); EXPORT_SYMBOL(kmap_atomic); EXPORT_SYMBOL(kunmap_atomic); +EXPORT_SYMBOL(kmap_atomic_prot); void __init set_highmem_pages_init(void) { diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index ae4f7b5d710..0607119cef9 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1,3 +1,4 @@ +#include <linux/initrd.h> #include <linux/ioport.h> #include <linux/swap.h> @@ -10,6 +11,10 @@ #include <asm/setup.h> #include <asm/system.h> #include <asm/tlbflush.h> +#include <asm/tlb.h> +#include <asm/proto.h> + +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long __initdata e820_table_start; unsigned long __meminitdata e820_table_end; @@ -23,6 +28,69 @@ int direct_gbpages #endif ; +int nx_enabled; + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +static int disable_nx __cpuinitdata; + +/* + * noexec = on|off + * + * Control non-executable mappings for processes. + * + * on Enable + * off Disable + */ +static int __init noexec_setup(char *str) +{ + if (!str) + return -EINVAL; + if (!strncmp(str, "on", 2)) { + __supported_pte_mask |= _PAGE_NX; + disable_nx = 0; + } else if (!strncmp(str, "off", 3)) { + disable_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } + return 0; +} +early_param("noexec", noexec_setup); +#endif + +#ifdef CONFIG_X86_PAE +static void __init set_nx(void) +{ + unsigned int v[4], l, h; + + if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { + cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); + + if ((v[3] & (1 << 20)) && !disable_nx) { + rdmsr(MSR_EFER, l, h); + l |= EFER_NX; + wrmsr(MSR_EFER, l, h); + nx_enabled = 1; + __supported_pte_mask |= _PAGE_NX; + } + } +} +#else +static inline void set_nx(void) +{ +} +#endif + +#ifdef CONFIG_X86_64 +void __cpuinit check_efer(void) +{ + unsigned long efer; + + rdmsrl(MSR_EFER, efer); + if (!(efer & EFER_NX) || disable_nx) + __supported_pte_mask &= ~_PAGE_NX; +} +#endif + static void __init find_early_table_space(unsigned long end, int use_pse, int use_gbpages) { @@ -66,12 +134,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse, */ #ifdef CONFIG_X86_32 start = 0x7000; - e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT, - tables, PAGE_SIZE); -#else /* CONFIG_X86_64 */ +#else start = 0x8000; - e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE); #endif + e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT, + tables, PAGE_SIZE); if (e820_table_start == -1UL) panic("Cannot find space for the kernel page tables"); @@ -111,20 +178,6 @@ static int __meminit save_mr(struct map_range *mr, int nr_range, return nr_range; } -#ifdef CONFIG_X86_64 -static void __init init_gbpages(void) -{ - if (direct_gbpages && cpu_has_gbpages) - printk(KERN_INFO "Using GB pages for direct mapping\n"); - else - direct_gbpages = 0; -} -#else -static inline void init_gbpages(void) -{ -} -#endif - /* * Setup the direct mapping of the physical memory at PAGE_OFFSET. * This runs before bootmem is initialized and gets pages directly from @@ -144,10 +197,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); - if (!after_bootmem) - init_gbpages(); - -#ifdef CONFIG_DEBUG_PAGEALLOC +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) /* * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. * This will simplify cpa(), which otherwise needs to support splitting @@ -159,12 +209,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, use_gbpages = direct_gbpages; #endif -#ifdef CONFIG_X86_32 -#ifdef CONFIG_X86_PAE set_nx(); if (nx_enabled) printk(KERN_INFO "NX (Execute Disable) protection: active\n"); -#endif /* Enable PSE if available */ if (cpu_has_pse) @@ -175,7 +222,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, set_in_cr4(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } -#endif if (use_gbpages) page_size_mask |= 1 << PG_LEVEL_1G; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 749559ed80f..3cd7711bb94 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -49,12 +49,9 @@ #include <asm/paravirt.h> #include <asm/setup.h> #include <asm/cacheflush.h> +#include <asm/page_types.h> #include <asm/init.h> -unsigned long max_low_pfn_mapped; -unsigned long max_pfn_mapped; - -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; static noinline int do_test_wp_bit(void); @@ -114,7 +111,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) pte_t *page_table = NULL; if (after_bootmem) { -#ifdef CONFIG_DEBUG_PAGEALLOC +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); #endif if (!page_table) @@ -567,7 +564,7 @@ static inline void save_pg_dir(void) } #endif /* !CONFIG_ACPI_SLEEP */ -void zap_low_mappings(void) +void zap_low_mappings(bool early) { int i; @@ -584,64 +581,16 @@ void zap_low_mappings(void) set_pgd(swapper_pg_dir+i, __pgd(0)); #endif } - flush_tlb_all(); -} -int nx_enabled; + if (early) + __flush_tlb(); + else + flush_tlb_all(); +} pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); EXPORT_SYMBOL_GPL(__supported_pte_mask); -#ifdef CONFIG_X86_PAE - -static int disable_nx __initdata; - -/* - * noexec = on|off - * - * Control non executable mappings. - * - * on Enable - * off Disable - */ -static int __init noexec_setup(char *str) -{ - if (!str || !strcmp(str, "on")) { - if (cpu_has_nx) { - __supported_pte_mask |= _PAGE_NX; - disable_nx = 0; - } - } else { - if (!strcmp(str, "off")) { - disable_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } else { - return -EINVAL; - } - } - - return 0; -} -early_param("noexec", noexec_setup); - -void __init set_nx(void) -{ - unsigned int v[4], l, h; - - if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { - cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); - - if ((v[3] & (1 << 20)) && !disable_nx) { - rdmsr(MSR_EFER, l, h); - l |= EFER_NX; - wrmsr(MSR_EFER, l, h); - nx_enabled = 1; - __supported_pte_mask |= _PAGE_NX; - } - } -} -#endif - /* user-defined highmem size */ static unsigned int highmem_pages = -1; @@ -761,15 +710,15 @@ void __init initmem_init(unsigned long start_pfn, highstart_pfn = highend_pfn = max_pfn; if (max_pfn > max_low_pfn) highstart_pfn = max_low_pfn; - memory_present(0, 0, highend_pfn); e820_register_active_regions(0, 0, highend_pfn); + sparse_memory_present_with_active_regions(0); printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else - memory_present(0, 0, max_low_pfn); e820_register_active_regions(0, 0, max_low_pfn); + sparse_memory_present_with_active_regions(0); num_physpages = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif @@ -1011,7 +960,7 @@ void __init mem_init(void) test_wp_bit(); save_pg_dir(); - zap_low_mappings(); + zap_low_mappings(true); } #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1753e8020df..ea56b8cbb6a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -50,18 +50,8 @@ #include <asm/cacheflush.h> #include <asm/init.h> -/* - * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. - * The direct mapping extends to max_pfn_mapped, so that we can directly access - * apertures, ACPI and other tables without having to play with fixmaps. - */ -unsigned long max_low_pfn_mapped; -unsigned long max_pfn_mapped; - static unsigned long dma_reserve __initdata; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - static int __init parse_direct_gbpages_off(char *arg) { direct_gbpages = 0; @@ -85,39 +75,6 @@ early_param("gbpages", parse_direct_gbpages_on); pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; EXPORT_SYMBOL_GPL(__supported_pte_mask); -static int disable_nx __cpuinitdata; - -/* - * noexec=on|off - * Control non-executable mappings for 64-bit processes. - * - * on Enable (default) - * off Disable - */ -static int __init nonx_setup(char *str) -{ - if (!str) - return -EINVAL; - if (!strncmp(str, "on", 2)) { - __supported_pte_mask |= _PAGE_NX; - disable_nx = 0; - } else if (!strncmp(str, "off", 3)) { - disable_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } - return 0; -} -early_param("noexec", nonx_setup); - -void __cpuinit check_efer(void) -{ - unsigned long efer; - - rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || disable_nx) - __supported_pte_mask &= ~_PAGE_NX; -} - int force_personality32; /* @@ -147,7 +104,7 @@ static __ref void *spp_getpage(void) void *ptr; if (after_bootmem) - ptr = (void *) get_zeroed_page(GFP_ATOMIC); + ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); else ptr = alloc_bootmem_pages(PAGE_SIZE); @@ -324,7 +281,7 @@ static __ref void *alloc_low_page(unsigned long *phys) void *adr; if (after_bootmem) { - adr = (void *)get_zeroed_page(GFP_ATOMIC); + adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); *phys = __pa(adr); return adr; @@ -570,7 +527,7 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, return phys_pud_init(pud, addr, end, page_size_mask); } -unsigned long __init +unsigned long __meminit kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask) @@ -628,6 +585,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT); reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); } +#endif void __init paging_init(void) { @@ -638,11 +596,19 @@ void __init paging_init(void) max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; max_zone_pfns[ZONE_NORMAL] = max_pfn; - memory_present(0, 0, max_pfn); + sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); + + /* + * clear the default setting with node 0 + * note: don't use nodes_clear here, that is really clearing when + * numa support is not compiled in, and later node_set_state + * will not set it back. + */ + node_clear_state(0, N_NORMAL_MEMORY); + free_area_init_nodes(max_zone_pfns); } -#endif /* * Memory hotplug specific functions @@ -830,7 +796,7 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, return ret; #else - reserve_bootmem(phys, len, BOOTMEM_DEFAULT); + reserve_bootmem(phys, len, flags); #endif if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 8056545e2d3..fe6f84ca121 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -82,7 +82,6 @@ iounmap_atomic(void *kvaddr, enum km_type type) if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) kpte_clear_flush(kmap_pte-idx, vaddr); - arch_flush_lazy_mmu_mode(); pagefault_enable(); } EXPORT_SYMBOL_GPL(iounmap_atomic); diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile new file mode 100644 index 00000000000..520b3bce409 --- /dev/null +++ b/arch/x86/mm/kmemcheck/Makefile @@ -0,0 +1 @@ +obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c new file mode 100644 index 00000000000..4901d0dafda --- /dev/null +++ b/arch/x86/mm/kmemcheck/error.c @@ -0,0 +1,228 @@ +#include <linux/interrupt.h> +#include <linux/kdebug.h> +#include <linux/kmemcheck.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/stacktrace.h> +#include <linux/string.h> + +#include "error.h" +#include "shadow.h" + +enum kmemcheck_error_type { + KMEMCHECK_ERROR_INVALID_ACCESS, + KMEMCHECK_ERROR_BUG, +}; + +#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT) + +struct kmemcheck_error { + enum kmemcheck_error_type type; + + union { + /* KMEMCHECK_ERROR_INVALID_ACCESS */ + struct { + /* Kind of access that caused the error */ + enum kmemcheck_shadow state; + /* Address and size of the erroneous read */ + unsigned long address; + unsigned int size; + }; + }; + + struct pt_regs regs; + struct stack_trace trace; + unsigned long trace_entries[32]; + + /* We compress it to a char. */ + unsigned char shadow_copy[SHADOW_COPY_SIZE]; + unsigned char memory_copy[SHADOW_COPY_SIZE]; +}; + +/* + * Create a ring queue of errors to output. We can't call printk() directly + * from the kmemcheck traps, since this may call the console drivers and + * result in a recursive fault. + */ +static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE]; +static unsigned int error_count; +static unsigned int error_rd; +static unsigned int error_wr; +static unsigned int error_missed_count; + +static struct kmemcheck_error *error_next_wr(void) +{ + struct kmemcheck_error *e; + + if (error_count == ARRAY_SIZE(error_fifo)) { + ++error_missed_count; + return NULL; + } + + e = &error_fifo[error_wr]; + if (++error_wr == ARRAY_SIZE(error_fifo)) + error_wr = 0; + ++error_count; + return e; +} + +static struct kmemcheck_error *error_next_rd(void) +{ + struct kmemcheck_error *e; + + if (error_count == 0) + return NULL; + + e = &error_fifo[error_rd]; + if (++error_rd == ARRAY_SIZE(error_fifo)) + error_rd = 0; + --error_count; + return e; +} + +void kmemcheck_error_recall(void) +{ + static const char *desc[] = { + [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated", + [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized", + [KMEMCHECK_SHADOW_INITIALIZED] = "initialized", + [KMEMCHECK_SHADOW_FREED] = "freed", + }; + + static const char short_desc[] = { + [KMEMCHECK_SHADOW_UNALLOCATED] = 'a', + [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u', + [KMEMCHECK_SHADOW_INITIALIZED] = 'i', + [KMEMCHECK_SHADOW_FREED] = 'f', + }; + + struct kmemcheck_error *e; + unsigned int i; + + e = error_next_rd(); + if (!e) + return; + + switch (e->type) { + case KMEMCHECK_ERROR_INVALID_ACCESS: + printk(KERN_ERR "WARNING: kmemcheck: Caught %d-bit read " + "from %s memory (%p)\n", + 8 * e->size, e->state < ARRAY_SIZE(desc) ? + desc[e->state] : "(invalid shadow state)", + (void *) e->address); + + printk(KERN_INFO); + for (i = 0; i < SHADOW_COPY_SIZE; ++i) + printk("%02x", e->memory_copy[i]); + printk("\n"); + + printk(KERN_INFO); + for (i = 0; i < SHADOW_COPY_SIZE; ++i) { + if (e->shadow_copy[i] < ARRAY_SIZE(short_desc)) + printk(" %c", short_desc[e->shadow_copy[i]]); + else + printk(" ?"); + } + printk("\n"); + printk(KERN_INFO "%*c\n", 2 + 2 + * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^'); + break; + case KMEMCHECK_ERROR_BUG: + printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n"); + break; + } + + __show_regs(&e->regs, 1); + print_stack_trace(&e->trace, 0); +} + +static void do_wakeup(unsigned long data) +{ + while (error_count > 0) + kmemcheck_error_recall(); + + if (error_missed_count > 0) { + printk(KERN_WARNING "kmemcheck: Lost %d error reports because " + "the queue was too small\n", error_missed_count); + error_missed_count = 0; + } +} + +static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0); + +/* + * Save the context of an error report. + */ +void kmemcheck_error_save(enum kmemcheck_shadow state, + unsigned long address, unsigned int size, struct pt_regs *regs) +{ + static unsigned long prev_ip; + + struct kmemcheck_error *e; + void *shadow_copy; + void *memory_copy; + + /* Don't report several adjacent errors from the same EIP. */ + if (regs->ip == prev_ip) + return; + prev_ip = regs->ip; + + e = error_next_wr(); + if (!e) + return; + + e->type = KMEMCHECK_ERROR_INVALID_ACCESS; + + e->state = state; + e->address = address; + e->size = size; + + /* Save regs */ + memcpy(&e->regs, regs, sizeof(*regs)); + + /* Save stack trace */ + e->trace.nr_entries = 0; + e->trace.entries = e->trace_entries; + e->trace.max_entries = ARRAY_SIZE(e->trace_entries); + e->trace.skip = 0; + save_stack_trace_bp(&e->trace, regs->bp); + + /* Round address down to nearest 16 bytes */ + shadow_copy = kmemcheck_shadow_lookup(address + & ~(SHADOW_COPY_SIZE - 1)); + BUG_ON(!shadow_copy); + + memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE); + + kmemcheck_show_addr(address); + memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1)); + memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE); + kmemcheck_hide_addr(address); + + tasklet_hi_schedule_first(&kmemcheck_tasklet); +} + +/* + * Save the context of a kmemcheck bug. + */ +void kmemcheck_error_save_bug(struct pt_regs *regs) +{ + struct kmemcheck_error *e; + + e = error_next_wr(); + if (!e) + return; + + e->type = KMEMCHECK_ERROR_BUG; + + memcpy(&e->regs, regs, sizeof(*regs)); + + e->trace.nr_entries = 0; + e->trace.entries = e->trace_entries; + e->trace.max_entries = ARRAY_SIZE(e->trace_entries); + e->trace.skip = 1; + save_stack_trace(&e->trace); + + tasklet_hi_schedule_first(&kmemcheck_tasklet); +} diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h new file mode 100644 index 00000000000..0efc2e8d0a2 --- /dev/null +++ b/arch/x86/mm/kmemcheck/error.h @@ -0,0 +1,15 @@ +#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H +#define ARCH__X86__MM__KMEMCHECK__ERROR_H + +#include <linux/ptrace.h> + +#include "shadow.h" + +void kmemcheck_error_save(enum kmemcheck_shadow state, + unsigned long address, unsigned int size, struct pt_regs *regs); + +void kmemcheck_error_save_bug(struct pt_regs *regs); + +void kmemcheck_error_recall(void); + +#endif diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c new file mode 100644 index 00000000000..2c55ed09865 --- /dev/null +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -0,0 +1,640 @@ +/** + * kmemcheck - a heavyweight memory checker for the linux kernel + * Copyright (C) 2007, 2008 Vegard Nossum <vegardno@ifi.uio.no> + * (With a lot of help from Ingo Molnar and Pekka Enberg.) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2) as + * published by the Free Software Foundation. + */ + +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/kallsyms.h> +#include <linux/kernel.h> +#include <linux/kmemcheck.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/page-flags.h> +#include <linux/percpu.h> +#include <linux/ptrace.h> +#include <linux/string.h> +#include <linux/types.h> + +#include <asm/cacheflush.h> +#include <asm/kmemcheck.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> + +#include "error.h" +#include "opcode.h" +#include "pte.h" +#include "selftest.h" +#include "shadow.h" + + +#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT +# define KMEMCHECK_ENABLED 0 +#endif + +#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT +# define KMEMCHECK_ENABLED 1 +#endif + +#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT +# define KMEMCHECK_ENABLED 2 +#endif + +int kmemcheck_enabled = KMEMCHECK_ENABLED; + +int __init kmemcheck_init(void) +{ +#ifdef CONFIG_SMP + /* + * Limit SMP to use a single CPU. We rely on the fact that this code + * runs before SMP is set up. + */ + if (setup_max_cpus > 1) { + printk(KERN_INFO + "kmemcheck: Limiting number of CPUs to 1.\n"); + setup_max_cpus = 1; + } +#endif + + if (!kmemcheck_selftest()) { + printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n"); + kmemcheck_enabled = 0; + return -EINVAL; + } + + printk(KERN_INFO "kmemcheck: Initialized\n"); + return 0; +} + +early_initcall(kmemcheck_init); + +/* + * We need to parse the kmemcheck= option before any memory is allocated. + */ +static int __init param_kmemcheck(char *str) +{ + if (!str) + return -EINVAL; + + sscanf(str, "%d", &kmemcheck_enabled); + return 0; +} + +early_param("kmemcheck", param_kmemcheck); + +int kmemcheck_show_addr(unsigned long address) +{ + pte_t *pte; + + pte = kmemcheck_pte_lookup(address); + if (!pte) + return 0; + + set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); + __flush_tlb_one(address); + return 1; +} + +int kmemcheck_hide_addr(unsigned long address) +{ + pte_t *pte; + + pte = kmemcheck_pte_lookup(address); + if (!pte) + return 0; + + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); + __flush_tlb_one(address); + return 1; +} + +struct kmemcheck_context { + bool busy; + int balance; + + /* + * There can be at most two memory operands to an instruction, but + * each address can cross a page boundary -- so we may need up to + * four addresses that must be hidden/revealed for each fault. + */ + unsigned long addr[4]; + unsigned long n_addrs; + unsigned long flags; + + /* Data size of the instruction that caused a fault. */ + unsigned int size; +}; + +static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context); + +bool kmemcheck_active(struct pt_regs *regs) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + + return data->balance > 0; +} + +/* Save an address that needs to be shown/hidden */ +static void kmemcheck_save_addr(unsigned long addr) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + + BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr)); + data->addr[data->n_addrs++] = addr; +} + +static unsigned int kmemcheck_show_all(void) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + unsigned int i; + unsigned int n; + + n = 0; + for (i = 0; i < data->n_addrs; ++i) + n += kmemcheck_show_addr(data->addr[i]); + + return n; +} + +static unsigned int kmemcheck_hide_all(void) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + unsigned int i; + unsigned int n; + + n = 0; + for (i = 0; i < data->n_addrs; ++i) + n += kmemcheck_hide_addr(data->addr[i]); + + return n; +} + +/* + * Called from the #PF handler. + */ +void kmemcheck_show(struct pt_regs *regs) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + + BUG_ON(!irqs_disabled()); + + if (unlikely(data->balance != 0)) { + kmemcheck_show_all(); + kmemcheck_error_save_bug(regs); + data->balance = 0; + return; + } + + /* + * None of the addresses actually belonged to kmemcheck. Note that + * this is not an error. + */ + if (kmemcheck_show_all() == 0) + return; + + ++data->balance; + + /* + * The IF needs to be cleared as well, so that the faulting + * instruction can run "uninterrupted". Otherwise, we might take + * an interrupt and start executing that before we've had a chance + * to hide the page again. + * + * NOTE: In the rare case of multiple faults, we must not override + * the original flags: + */ + if (!(regs->flags & X86_EFLAGS_TF)) + data->flags = regs->flags; + + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; +} + +/* + * Called from the #DB handler. + */ +void kmemcheck_hide(struct pt_regs *regs) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + int n; + + BUG_ON(!irqs_disabled()); + + if (data->balance == 0) + return; + + if (unlikely(data->balance != 1)) { + kmemcheck_show_all(); + kmemcheck_error_save_bug(regs); + data->n_addrs = 0; + data->balance = 0; + + if (!(data->flags & X86_EFLAGS_TF)) + regs->flags &= ~X86_EFLAGS_TF; + if (data->flags & X86_EFLAGS_IF) + regs->flags |= X86_EFLAGS_IF; + return; + } + + if (kmemcheck_enabled) + n = kmemcheck_hide_all(); + else + n = kmemcheck_show_all(); + + if (n == 0) + return; + + --data->balance; + + data->n_addrs = 0; + + if (!(data->flags & X86_EFLAGS_TF)) + regs->flags &= ~X86_EFLAGS_TF; + if (data->flags & X86_EFLAGS_IF) + regs->flags |= X86_EFLAGS_IF; +} + +void kmemcheck_show_pages(struct page *p, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; ++i) { + unsigned long address; + pte_t *pte; + unsigned int level; + + address = (unsigned long) page_address(&p[i]); + pte = lookup_address(address, &level); + BUG_ON(!pte); + BUG_ON(level != PG_LEVEL_4K); + + set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN)); + __flush_tlb_one(address); + } +} + +bool kmemcheck_page_is_tracked(struct page *p) +{ + /* This will also check the "hidden" flag of the PTE. */ + return kmemcheck_pte_lookup((unsigned long) page_address(p)); +} + +void kmemcheck_hide_pages(struct page *p, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; ++i) { + unsigned long address; + pte_t *pte; + unsigned int level; + + address = (unsigned long) page_address(&p[i]); + pte = lookup_address(address, &level); + BUG_ON(!pte); + BUG_ON(level != PG_LEVEL_4K); + + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); + set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN)); + __flush_tlb_one(address); + } +} + +/* Access may NOT cross page boundary */ +static void kmemcheck_read_strict(struct pt_regs *regs, + unsigned long addr, unsigned int size) +{ + void *shadow; + enum kmemcheck_shadow status; + + shadow = kmemcheck_shadow_lookup(addr); + if (!shadow) + return; + + kmemcheck_save_addr(addr); + status = kmemcheck_shadow_test(shadow, size); + if (status == KMEMCHECK_SHADOW_INITIALIZED) + return; + + if (kmemcheck_enabled) + kmemcheck_error_save(status, addr, size, regs); + + if (kmemcheck_enabled == 2) + kmemcheck_enabled = 0; + + /* Don't warn about it again. */ + kmemcheck_shadow_set(shadow, size); +} + +/* Access may cross page boundary */ +static void kmemcheck_read(struct pt_regs *regs, + unsigned long addr, unsigned int size) +{ + unsigned long page = addr & PAGE_MASK; + unsigned long next_addr = addr + size - 1; + unsigned long next_page = next_addr & PAGE_MASK; + + if (likely(page == next_page)) { + kmemcheck_read_strict(regs, addr, size); + return; + } + + /* + * What we do is basically to split the access across the + * two pages and handle each part separately. Yes, this means + * that we may now see reads that are 3 + 5 bytes, for + * example (and if both are uninitialized, there will be two + * reports), but it makes the code a lot simpler. + */ + kmemcheck_read_strict(regs, addr, next_page - addr); + kmemcheck_read_strict(regs, next_page, next_addr - next_page); +} + +static void kmemcheck_write_strict(struct pt_regs *regs, + unsigned long addr, unsigned int size) +{ + void *shadow; + + shadow = kmemcheck_shadow_lookup(addr); + if (!shadow) + return; + + kmemcheck_save_addr(addr); + kmemcheck_shadow_set(shadow, size); +} + +static void kmemcheck_write(struct pt_regs *regs, + unsigned long addr, unsigned int size) +{ + unsigned long page = addr & PAGE_MASK; + unsigned long next_addr = addr + size - 1; + unsigned long next_page = next_addr & PAGE_MASK; + + if (likely(page == next_page)) { + kmemcheck_write_strict(regs, addr, size); + return; + } + + /* See comment in kmemcheck_read(). */ + kmemcheck_write_strict(regs, addr, next_page - addr); + kmemcheck_write_strict(regs, next_page, next_addr - next_page); +} + +/* + * Copying is hard. We have two addresses, each of which may be split across + * a page (and each page will have different shadow addresses). + */ +static void kmemcheck_copy(struct pt_regs *regs, + unsigned long src_addr, unsigned long dst_addr, unsigned int size) +{ + uint8_t shadow[8]; + enum kmemcheck_shadow status; + + unsigned long page; + unsigned long next_addr; + unsigned long next_page; + + uint8_t *x; + unsigned int i; + unsigned int n; + + BUG_ON(size > sizeof(shadow)); + + page = src_addr & PAGE_MASK; + next_addr = src_addr + size - 1; + next_page = next_addr & PAGE_MASK; + + if (likely(page == next_page)) { + /* Same page */ + x = kmemcheck_shadow_lookup(src_addr); + if (x) { + kmemcheck_save_addr(src_addr); + for (i = 0; i < size; ++i) + shadow[i] = x[i]; + } else { + for (i = 0; i < size; ++i) + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + } else { + n = next_page - src_addr; + BUG_ON(n > sizeof(shadow)); + + /* First page */ + x = kmemcheck_shadow_lookup(src_addr); + if (x) { + kmemcheck_save_addr(src_addr); + for (i = 0; i < n; ++i) + shadow[i] = x[i]; + } else { + /* Not tracked */ + for (i = 0; i < n; ++i) + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + + /* Second page */ + x = kmemcheck_shadow_lookup(next_page); + if (x) { + kmemcheck_save_addr(next_page); + for (i = n; i < size; ++i) + shadow[i] = x[i - n]; + } else { + /* Not tracked */ + for (i = n; i < size; ++i) + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + } + + page = dst_addr & PAGE_MASK; + next_addr = dst_addr + size - 1; + next_page = next_addr & PAGE_MASK; + + if (likely(page == next_page)) { + /* Same page */ + x = kmemcheck_shadow_lookup(dst_addr); + if (x) { + kmemcheck_save_addr(dst_addr); + for (i = 0; i < size; ++i) { + x[i] = shadow[i]; + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + } + } else { + n = next_page - dst_addr; + BUG_ON(n > sizeof(shadow)); + + /* First page */ + x = kmemcheck_shadow_lookup(dst_addr); + if (x) { + kmemcheck_save_addr(dst_addr); + for (i = 0; i < n; ++i) { + x[i] = shadow[i]; + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + } + + /* Second page */ + x = kmemcheck_shadow_lookup(next_page); + if (x) { + kmemcheck_save_addr(next_page); + for (i = n; i < size; ++i) { + x[i - n] = shadow[i]; + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + } + } + + status = kmemcheck_shadow_test(shadow, size); + if (status == KMEMCHECK_SHADOW_INITIALIZED) + return; + + if (kmemcheck_enabled) + kmemcheck_error_save(status, src_addr, size, regs); + + if (kmemcheck_enabled == 2) + kmemcheck_enabled = 0; +} + +enum kmemcheck_method { + KMEMCHECK_READ, + KMEMCHECK_WRITE, +}; + +static void kmemcheck_access(struct pt_regs *regs, + unsigned long fallback_address, enum kmemcheck_method fallback_method) +{ + const uint8_t *insn; + const uint8_t *insn_primary; + unsigned int size; + + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + + /* Recursive fault -- ouch. */ + if (data->busy) { + kmemcheck_show_addr(fallback_address); + kmemcheck_error_save_bug(regs); + return; + } + + data->busy = true; + + insn = (const uint8_t *) regs->ip; + insn_primary = kmemcheck_opcode_get_primary(insn); + + kmemcheck_opcode_decode(insn, &size); + + switch (insn_primary[0]) { +#ifdef CONFIG_KMEMCHECK_BITOPS_OK + /* AND, OR, XOR */ + /* + * Unfortunately, these instructions have to be excluded from + * our regular checking since they access only some (and not + * all) bits. This clears out "bogus" bitfield-access warnings. + */ + case 0x80: + case 0x81: + case 0x82: + case 0x83: + switch ((insn_primary[1] >> 3) & 7) { + /* OR */ + case 1: + /* AND */ + case 4: + /* XOR */ + case 6: + kmemcheck_write(regs, fallback_address, size); + goto out; + + /* ADD */ + case 0: + /* ADC */ + case 2: + /* SBB */ + case 3: + /* SUB */ + case 5: + /* CMP */ + case 7: + break; + } + break; +#endif + + /* MOVS, MOVSB, MOVSW, MOVSD */ + case 0xa4: + case 0xa5: + /* + * These instructions are special because they take two + * addresses, but we only get one page fault. + */ + kmemcheck_copy(regs, regs->si, regs->di, size); + goto out; + + /* CMPS, CMPSB, CMPSW, CMPSD */ + case 0xa6: + case 0xa7: + kmemcheck_read(regs, regs->si, size); + kmemcheck_read(regs, regs->di, size); + goto out; + } + + /* + * If the opcode isn't special in any way, we use the data from the + * page fault handler to determine the address and type of memory + * access. + */ + switch (fallback_method) { + case KMEMCHECK_READ: + kmemcheck_read(regs, fallback_address, size); + goto out; + case KMEMCHECK_WRITE: + kmemcheck_write(regs, fallback_address, size); + goto out; + } + +out: + data->busy = false; +} + +bool kmemcheck_fault(struct pt_regs *regs, unsigned long address, + unsigned long error_code) +{ + pte_t *pte; + + /* + * XXX: Is it safe to assume that memory accesses from virtual 86 + * mode or non-kernel code segments will _never_ access kernel + * memory (e.g. tracked pages)? For now, we need this to avoid + * invoking kmemcheck for PnP BIOS calls. + */ + if (regs->flags & X86_VM_MASK) + return false; + if (regs->cs != __KERNEL_CS) + return false; + + pte = kmemcheck_pte_lookup(address); + if (!pte) + return false; + + if (error_code & 2) + kmemcheck_access(regs, address, KMEMCHECK_WRITE); + else + kmemcheck_access(regs, address, KMEMCHECK_READ); + + kmemcheck_show(regs); + return true; +} + +bool kmemcheck_trap(struct pt_regs *regs) +{ + if (!kmemcheck_active(regs)) + return false; + + /* We're done. */ + kmemcheck_hide(regs); + return true; +} diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c new file mode 100644 index 00000000000..63c19e27aa6 --- /dev/null +++ b/arch/x86/mm/kmemcheck/opcode.c @@ -0,0 +1,106 @@ +#include <linux/types.h> + +#include "opcode.h" + +static bool opcode_is_prefix(uint8_t b) +{ + return + /* Group 1 */ + b == 0xf0 || b == 0xf2 || b == 0xf3 + /* Group 2 */ + || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 + || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e + /* Group 3 */ + || b == 0x66 + /* Group 4 */ + || b == 0x67; +} + +#ifdef CONFIG_X86_64 +static bool opcode_is_rex_prefix(uint8_t b) +{ + return (b & 0xf0) == 0x40; +} +#else +static bool opcode_is_rex_prefix(uint8_t b) +{ + return false; +} +#endif + +#define REX_W (1 << 3) + +/* + * This is a VERY crude opcode decoder. We only need to find the size of the + * load/store that caused our #PF and this should work for all the opcodes + * that we care about. Moreover, the ones who invented this instruction set + * should be shot. + */ +void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size) +{ + /* Default operand size */ + int operand_size_override = 4; + + /* prefixes */ + for (; opcode_is_prefix(*op); ++op) { + if (*op == 0x66) + operand_size_override = 2; + } + + /* REX prefix */ + if (opcode_is_rex_prefix(*op)) { + uint8_t rex = *op; + + ++op; + if (rex & REX_W) { + switch (*op) { + case 0x63: + *size = 4; + return; + case 0x0f: + ++op; + + switch (*op) { + case 0xb6: + case 0xbe: + *size = 1; + return; + case 0xb7: + case 0xbf: + *size = 2; + return; + } + + break; + } + + *size = 8; + return; + } + } + + /* escape opcode */ + if (*op == 0x0f) { + ++op; + + /* + * This is move with zero-extend and sign-extend, respectively; + * we don't have to think about 0xb6/0xbe, because this is + * already handled in the conditional below. + */ + if (*op == 0xb7 || *op == 0xbf) + operand_size_override = 2; + } + + *size = (*op & 1) ? operand_size_override : 1; +} + +const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op) +{ + /* skip prefixes */ + while (opcode_is_prefix(*op)) + ++op; + if (opcode_is_rex_prefix(*op)) + ++op; + return op; +} diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h new file mode 100644 index 00000000000..6956aad66b5 --- /dev/null +++ b/arch/x86/mm/kmemcheck/opcode.h @@ -0,0 +1,9 @@ +#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H +#define ARCH__X86__MM__KMEMCHECK__OPCODE_H + +#include <linux/types.h> + +void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size); +const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op); + +#endif diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c new file mode 100644 index 00000000000..4ead26eeaf9 --- /dev/null +++ b/arch/x86/mm/kmemcheck/pte.c @@ -0,0 +1,22 @@ +#include <linux/mm.h> + +#include <asm/pgtable.h> + +#include "pte.h" + +pte_t *kmemcheck_pte_lookup(unsigned long address) +{ + pte_t *pte; + unsigned int level; + + pte = lookup_address(address, &level); + if (!pte) + return NULL; + if (level != PG_LEVEL_4K) + return NULL; + if (!pte_hidden(*pte)) + return NULL; + + return pte; +} + diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h new file mode 100644 index 00000000000..9f596645649 --- /dev/null +++ b/arch/x86/mm/kmemcheck/pte.h @@ -0,0 +1,10 @@ +#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H +#define ARCH__X86__MM__KMEMCHECK__PTE_H + +#include <linux/mm.h> + +#include <asm/pgtable.h> + +pte_t *kmemcheck_pte_lookup(unsigned long address); + +#endif diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c new file mode 100644 index 00000000000..036efbea8b2 --- /dev/null +++ b/arch/x86/mm/kmemcheck/selftest.c @@ -0,0 +1,69 @@ +#include <linux/kernel.h> + +#include "opcode.h" +#include "selftest.h" + +struct selftest_opcode { + unsigned int expected_size; + const uint8_t *insn; + const char *desc; +}; + +static const struct selftest_opcode selftest_opcodes[] = { + /* REP MOVS */ + {1, "\xf3\xa4", "rep movsb <mem8>, <mem8>"}, + {4, "\xf3\xa5", "rep movsl <mem32>, <mem32>"}, + + /* MOVZX / MOVZXD */ + {1, "\x66\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg16>"}, + {1, "\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg32>"}, + + /* MOVSX / MOVSXD */ + {1, "\x66\x0f\xbe\x51\xf8", "movswq <mem8>, <reg16>"}, + {1, "\x0f\xbe\x51\xf8", "movswq <mem8>, <reg32>"}, + +#ifdef CONFIG_X86_64 + /* MOVZX / MOVZXD */ + {1, "\x49\x0f\xb6\x51\xf8", "movzbq <mem8>, <reg64>"}, + {2, "\x49\x0f\xb7\x51\xf8", "movzbq <mem16>, <reg64>"}, + + /* MOVSX / MOVSXD */ + {1, "\x49\x0f\xbe\x51\xf8", "movsbq <mem8>, <reg64>"}, + {2, "\x49\x0f\xbf\x51\xf8", "movsbq <mem16>, <reg64>"}, + {4, "\x49\x63\x51\xf8", "movslq <mem32>, <reg64>"}, +#endif +}; + +static bool selftest_opcode_one(const struct selftest_opcode *op) +{ + unsigned size; + + kmemcheck_opcode_decode(op->insn, &size); + + if (size == op->expected_size) + return true; + + printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n", + op->desc, op->expected_size, size); + return false; +} + +static bool selftest_opcodes_all(void) +{ + bool pass = true; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i) + pass = pass && selftest_opcode_one(&selftest_opcodes[i]); + + return pass; +} + +bool kmemcheck_selftest(void) +{ + bool pass = true; + + pass = pass && selftest_opcodes_all(); + + return pass; +} diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h new file mode 100644 index 00000000000..8fed4fe11f9 --- /dev/null +++ b/arch/x86/mm/kmemcheck/selftest.h @@ -0,0 +1,6 @@ +#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H +#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H + +bool kmemcheck_selftest(void); + +#endif diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c new file mode 100644 index 00000000000..e773b6bd007 --- /dev/null +++ b/arch/x86/mm/kmemcheck/shadow.c @@ -0,0 +1,162 @@ +#include <linux/kmemcheck.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/module.h> + +#include <asm/page.h> +#include <asm/pgtable.h> + +#include "pte.h" +#include "shadow.h" + +/* + * Return the shadow address for the given address. Returns NULL if the + * address is not tracked. + * + * We need to be extremely careful not to follow any invalid pointers, + * because this function can be called for *any* possible address. + */ +void *kmemcheck_shadow_lookup(unsigned long address) +{ + pte_t *pte; + struct page *page; + + if (!virt_addr_valid(address)) + return NULL; + + pte = kmemcheck_pte_lookup(address); + if (!pte) + return NULL; + + page = virt_to_page(address); + if (!page->shadow) + return NULL; + return page->shadow + (address & (PAGE_SIZE - 1)); +} + +static void mark_shadow(void *address, unsigned int n, + enum kmemcheck_shadow status) +{ + unsigned long addr = (unsigned long) address; + unsigned long last_addr = addr + n - 1; + unsigned long page = addr & PAGE_MASK; + unsigned long last_page = last_addr & PAGE_MASK; + unsigned int first_n; + void *shadow; + + /* If the memory range crosses a page boundary, stop there. */ + if (page == last_page) + first_n = n; + else + first_n = page + PAGE_SIZE - addr; + + shadow = kmemcheck_shadow_lookup(addr); + if (shadow) + memset(shadow, status, first_n); + + addr += first_n; + n -= first_n; + + /* Do full-page memset()s. */ + while (n >= PAGE_SIZE) { + shadow = kmemcheck_shadow_lookup(addr); + if (shadow) + memset(shadow, status, PAGE_SIZE); + + addr += PAGE_SIZE; + n -= PAGE_SIZE; + } + + /* Do the remaining page, if any. */ + if (n > 0) { + shadow = kmemcheck_shadow_lookup(addr); + if (shadow) + memset(shadow, status, n); + } +} + +void kmemcheck_mark_unallocated(void *address, unsigned int n) +{ + mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED); +} + +void kmemcheck_mark_uninitialized(void *address, unsigned int n) +{ + mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED); +} + +/* + * Fill the shadow memory of the given address such that the memory at that + * address is marked as being initialized. + */ +void kmemcheck_mark_initialized(void *address, unsigned int n) +{ + mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED); +} +EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized); + +void kmemcheck_mark_freed(void *address, unsigned int n) +{ + mark_shadow(address, n, KMEMCHECK_SHADOW_FREED); +} + +void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; ++i) + kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE); +} + +void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; ++i) + kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE); +} + +void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; ++i) + kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE); +} + +enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) +{ + uint8_t *x; + unsigned int i; + + x = shadow; + +#ifdef CONFIG_KMEMCHECK_PARTIAL_OK + /* + * Make sure _some_ bytes are initialized. Gcc frequently generates + * code to access neighboring bytes. + */ + for (i = 0; i < size; ++i) { + if (x[i] == KMEMCHECK_SHADOW_INITIALIZED) + return x[i]; + } +#else + /* All bytes must be initialized. */ + for (i = 0; i < size; ++i) { + if (x[i] != KMEMCHECK_SHADOW_INITIALIZED) + return x[i]; + } +#endif + + return x[0]; +} + +void kmemcheck_shadow_set(void *shadow, unsigned int size) +{ + uint8_t *x; + unsigned int i; + + x = shadow; + for (i = 0; i < size; ++i) + x[i] = KMEMCHECK_SHADOW_INITIALIZED; +} diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h new file mode 100644 index 00000000000..af46d9ab9d8 --- /dev/null +++ b/arch/x86/mm/kmemcheck/shadow.h @@ -0,0 +1,16 @@ +#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H +#define ARCH__X86__MM__KMEMCHECK__SHADOW_H + +enum kmemcheck_shadow { + KMEMCHECK_SHADOW_UNALLOCATED, + KMEMCHECK_SHADOW_UNINITIALIZED, + KMEMCHECK_SHADOW_INITIALIZED, + KMEMCHECK_SHADOW_FREED, +}; + +void *kmemcheck_shadow_lookup(unsigned long address); + +enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size); +void kmemcheck_shadow_set(void *shadow, unsigned int size); + +#endif diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 50dc802a1c4..16ccbd77917 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -32,7 +32,7 @@ struct kmmio_fault_page { struct list_head list; struct kmmio_fault_page *release_next; unsigned long page; /* location of the fault page */ - bool old_presence; /* page presence prior to arming */ + pteval_t old_presence; /* page presence prior to arming */ bool armed; /* @@ -97,60 +97,62 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr) static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) { struct list_head *head; - struct kmmio_fault_page *p; + struct kmmio_fault_page *f; page &= PAGE_MASK; head = kmmio_page_list(page); - list_for_each_entry_rcu(p, head, list) { - if (p->page == page) - return p; + list_for_each_entry_rcu(f, head, list) { + if (f->page == page) + return f; } return NULL; } -static void set_pmd_presence(pmd_t *pmd, bool present, bool *old) +static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old) { pmdval_t v = pmd_val(*pmd); - *old = !!(v & _PAGE_PRESENT); - v &= ~_PAGE_PRESENT; - if (present) - v |= _PAGE_PRESENT; + if (clear) { + *old = v & _PAGE_PRESENT; + v &= ~_PAGE_PRESENT; + } else /* presume this has been called with clear==true previously */ + v |= *old; set_pmd(pmd, __pmd(v)); } -static void set_pte_presence(pte_t *pte, bool present, bool *old) +static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old) { pteval_t v = pte_val(*pte); - *old = !!(v & _PAGE_PRESENT); - v &= ~_PAGE_PRESENT; - if (present) - v |= _PAGE_PRESENT; + if (clear) { + *old = v & _PAGE_PRESENT; + v &= ~_PAGE_PRESENT; + } else /* presume this has been called with clear==true previously */ + v |= *old; set_pte_atomic(pte, __pte(v)); } -static int set_page_presence(unsigned long addr, bool present, bool *old) +static int clear_page_presence(struct kmmio_fault_page *f, bool clear) { unsigned int level; - pte_t *pte = lookup_address(addr, &level); + pte_t *pte = lookup_address(f->page, &level); if (!pte) { - pr_err("kmmio: no pte for page 0x%08lx\n", addr); + pr_err("kmmio: no pte for page 0x%08lx\n", f->page); return -1; } switch (level) { case PG_LEVEL_2M: - set_pmd_presence((pmd_t *)pte, present, old); + clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence); break; case PG_LEVEL_4K: - set_pte_presence(pte, present, old); + clear_pte_presence(pte, clear, &f->old_presence); break; default: pr_err("kmmio: unexpected page level 0x%x.\n", level); return -1; } - __flush_tlb_one(addr); + __flush_tlb_one(f->page); return 0; } @@ -171,9 +173,9 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f) WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n"); if (f->armed) { pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n", - f->page, f->count, f->old_presence); + f->page, f->count, !!f->old_presence); } - ret = set_page_presence(f->page, false, &f->old_presence); + ret = clear_page_presence(f, true); WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page); f->armed = true; return ret; @@ -182,8 +184,7 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f) /** Restore the given page to saved presence state. */ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) { - bool tmp; - int ret = set_page_presence(f->page, f->old_presence, &tmp); + int ret = clear_page_presence(f, false); WARN_ONCE(ret < 0, KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page); f->armed = false; @@ -310,7 +311,12 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); if (!ctx->active) { - pr_debug("kmmio: spurious debug trap on CPU %d.\n", + /* + * debug traps without an active context are due to either + * something external causing them (f.e. using a debugger while + * mmio tracing enabled), or erroneous behaviour + */ + pr_warning("kmmio: unexpected debug trap on CPU %d.\n", smp_processor_id()); goto out; } @@ -439,12 +445,12 @@ static void rcu_free_kmmio_fault_pages(struct rcu_head *head) head, struct kmmio_delayed_release, rcu); - struct kmmio_fault_page *p = dr->release_list; - while (p) { - struct kmmio_fault_page *next = p->release_next; - BUG_ON(p->count); - kfree(p); - p = next; + struct kmmio_fault_page *f = dr->release_list; + while (f) { + struct kmmio_fault_page *next = f->release_next; + BUG_ON(f->count); + kfree(f); + f = next; } kfree(dr); } @@ -453,19 +459,19 @@ static void remove_kmmio_fault_pages(struct rcu_head *head) { struct kmmio_delayed_release *dr = container_of(head, struct kmmio_delayed_release, rcu); - struct kmmio_fault_page *p = dr->release_list; + struct kmmio_fault_page *f = dr->release_list; struct kmmio_fault_page **prevp = &dr->release_list; unsigned long flags; spin_lock_irqsave(&kmmio_lock, flags); - while (p) { - if (!p->count) { - list_del_rcu(&p->list); - prevp = &p->release_next; + while (f) { + if (!f->count) { + list_del_rcu(&f->list); + prevp = &f->release_next; } else { - *prevp = p->release_next; + *prevp = f->release_next; } - p = p->release_next; + f = f->release_next; } spin_unlock_irqrestore(&kmmio_lock, flags); @@ -528,8 +534,8 @@ void unregister_kmmio_probe(struct kmmio_probe *p) } EXPORT_SYMBOL(unregister_kmmio_probe); -static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, - void *args) +static int +kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) { struct die_args *arg = args; @@ -544,11 +550,23 @@ static struct notifier_block nb_die = { .notifier_call = kmmio_die_notifier }; -static int __init init_kmmio(void) +int kmmio_init(void) { int i; + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) INIT_LIST_HEAD(&kmmio_page_table[i]); + return register_die_notifier(&nb_die); } -fs_initcall(init_kmmio); /* should be before device_initcall() */ + +void kmmio_cleanup(void) +{ + int i; + + unregister_die_notifier(&nb_die); + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) { + WARN_ONCE(!list_empty(&kmmio_page_table[i]), + KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n"); + } +} diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 605c8be0621..18d244f7020 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -40,23 +40,22 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) static void __init memtest(u64 pattern, u64 start_phys, u64 size) { - u64 i, count; - u64 *start; + u64 *p, *start, *end; u64 start_bad, last_bad; u64 start_phys_aligned; - size_t incr; + const size_t incr = sizeof(pattern); - incr = sizeof(pattern); start_phys_aligned = ALIGN(start_phys, incr); - count = (size - (start_phys_aligned - start_phys))/incr; start = __va(start_phys_aligned); + end = start + (size - (start_phys_aligned - start_phys)) / incr; start_bad = 0; last_bad = 0; - for (i = 0; i < count; i++) - start[i] = pattern; - for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { - if (*start == pattern) + for (p = start; p < end; p++) + *p = pattern; + + for (p = start; p < end; p++, start_phys_aligned += incr) { + if (*p == pattern) continue; if (start_phys_aligned == last_bad + incr) { last_bad += incr; diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index c9342ed8b40..132772a8ec5 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -451,6 +451,7 @@ void enable_mmiotrace(void) if (nommiotrace) pr_info(NAME "MMIO tracing disabled.\n"); + kmmio_init(); enter_uniprocessor(); spin_lock_irq(&trace_lock); atomic_inc(&mmiotrace_enabled); @@ -473,6 +474,7 @@ void disable_mmiotrace(void) clear_trace_list(); /* guarantees: no more kmmio callbacks */ leave_uniprocessor(); + kmmio_cleanup(); pr_info(NAME "disabled.\n"); out: mutex_unlock(&mmiotrace_mutex); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 2d05a12029d..459913beac7 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -179,18 +179,25 @@ static void * __init early_node_mem(int nodeid, unsigned long start, } /* Initialize bootmem allocator for a node */ -void __init setup_node_bootmem(int nodeid, unsigned long start, - unsigned long end) +void __init +setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) { unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; + const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); unsigned long bootmap_start, nodedata_phys; void *bootmap; - const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); int nid; if (!end) return; + /* + * Don't confuse VM with a node that doesn't have the + * minimum amount of memory: + */ + if (end && (end - start) < NODE_MIN_SIZE) + return; + start = roundup(start, ZONE_ALIGN); printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, @@ -272,9 +279,6 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT); -#ifdef CONFIG_ACPI_NUMA - srat_reserve_add_area(nodeid); -#endif node_set_online(nodeid); } @@ -578,21 +582,6 @@ unsigned long __init numa_free_all_bootmem(void) return pages; } -void __init paging_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES]; - - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; - max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; - max_zone_pfns[ZONE_NORMAL] = max_pfn; - - sparse_memory_present_with_active_regions(MAX_NUMNODES); - sparse_init(); - - free_area_init_nodes(max_zone_pfns); -} - static __init int numa_setup(char *opt) { if (!opt) @@ -606,8 +595,6 @@ static __init int numa_setup(char *opt) #ifdef CONFIG_ACPI_NUMA if (!strncmp(opt, "noacpi", 6)) acpi_numa = -1; - if (!strncmp(opt, "hotadd=", 7)) - hotadd_percent = simple_strtoul(opt+7, NULL, 10); #endif return 0; } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e17efed088c..7e600c1962d 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -11,6 +11,7 @@ #include <linux/interrupt.h> #include <linux/seq_file.h> #include <linux/debugfs.h> +#include <linux/pfn.h> #include <asm/e820.h> #include <asm/processor.h> @@ -470,7 +471,7 @@ static int split_large_page(pte_t *kpte, unsigned long address) if (!debug_pagealloc) spin_unlock(&cpa_lock); - base = alloc_pages(GFP_KERNEL, 0); + base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); if (!debug_pagealloc) spin_lock(&cpa_lock); if (!base) @@ -590,9 +591,12 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) unsigned int level; pte_t *kpte, old_pte; - if (cpa->flags & CPA_PAGES_ARRAY) - address = (unsigned long)page_address(cpa->pages[cpa->curpage]); - else if (cpa->flags & CPA_ARRAY) + if (cpa->flags & CPA_PAGES_ARRAY) { + struct page *page = cpa->pages[cpa->curpage]; + if (unlikely(PageHighMem(page))) + return 0; + address = (unsigned long)page_address(page); + } else if (cpa->flags & CPA_ARRAY) address = cpa->vaddr[cpa->curpage]; else address = *cpa->vaddr; @@ -681,8 +685,9 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias); static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; - int ret = 0; - unsigned long temp_cpa_vaddr, vaddr; + unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); + unsigned long vaddr, remapped; + int ret; if (cpa->pfn >= max_pfn_mapped) return 0; @@ -695,9 +700,12 @@ static int cpa_process_alias(struct cpa_data *cpa) * No need to redo, when the primary call touched the direct * mapping already: */ - if (cpa->flags & CPA_PAGES_ARRAY) - vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]); - else if (cpa->flags & CPA_ARRAY) + if (cpa->flags & CPA_PAGES_ARRAY) { + struct page *page = cpa->pages[cpa->curpage]; + if (unlikely(PageHighMem(page))) + return 0; + vaddr = (unsigned long)page_address(page); + } else if (cpa->flags & CPA_ARRAY) vaddr = cpa->vaddr[cpa->curpage]; else vaddr = *cpa->vaddr; @@ -706,42 +714,55 @@ static int cpa_process_alias(struct cpa_data *cpa) PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { alias_cpa = *cpa; - temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); - alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.vaddr = &laddr; alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); - ret = __change_page_attr_set_clr(&alias_cpa, 0); + if (ret) + return ret; } #ifdef CONFIG_X86_64 - if (ret) - return ret; /* - * No need to redo, when the primary call touched the high - * mapping already: - */ - if (within(vaddr, (unsigned long) _text, _brk_end)) - return 0; - - /* - * If the physical address is inside the kernel map, we need + * If the primary call didn't touch the high mapping already + * and the physical address is inside the kernel map, we need * to touch the high mapped kernel as well: */ - if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) - return 0; + if (!within(vaddr, (unsigned long)_text, _brk_end) && + within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) { + unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + + __START_KERNEL_map - phys_base; + alias_cpa = *cpa; + alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); - alias_cpa = *cpa; - temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; - alias_cpa.vaddr = &temp_cpa_vaddr; - alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + /* + * The high mapping range is imprecise, so ignore the + * return value. + */ + __change_page_attr_set_clr(&alias_cpa, 0); + } +#endif /* - * The high mapping range is imprecise, so ignore the return value. + * If the PMD page was partially used for per-cpu remapping, + * the recycled area needs to be split and modified. Because + * the area is always proper subset of a PMD page + * cpa->numpages is guaranteed to be 1 for these areas, so + * there's no need to loop over and check for further remaps. */ - __change_page_attr_set_clr(&alias_cpa, 0); -#endif - return ret; + remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr); + if (remapped) { + WARN_ON(cpa->numpages > 1); + alias_cpa = *cpa; + alias_cpa.vaddr = &remapped; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + ret = __change_page_attr_set_clr(&alias_cpa, 0); + if (ret) + return ret; + } + + return 0; } static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) @@ -839,13 +860,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, vm_unmap_aliases(); - /* - * If we're called with lazy mmu updates enabled, the - * in-memory pte state may be stale. Flush pending updates to - * bring them up to date. - */ - arch_flush_lazy_mmu_mode(); - cpa.vaddr = addr; cpa.pages = pages; cpa.numpages = numpages; @@ -890,13 +904,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, } else cpa_flush_all(cache); - /* - * If we've been called with lazy mmu updates enabled, then - * make sure that everything gets flushed out before we - * return. - */ - arch_flush_lazy_mmu_mode(); - out: return ret; } @@ -996,12 +1003,15 @@ EXPORT_SYMBOL(set_memory_array_uc); int _set_memory_wc(unsigned long addr, int numpages) { int ret; + unsigned long addr_copy = addr; + ret = change_page_attr_set(&addr, numpages, __pgprot(_PAGE_CACHE_UC_MINUS), 0); - if (!ret) { - ret = change_page_attr_set(&addr, numpages, - __pgprot(_PAGE_CACHE_WC), 0); + ret = change_page_attr_set_clr(&addr_copy, numpages, + __pgprot(_PAGE_CACHE_WC), + __pgprot(_PAGE_CACHE_MASK), + 0, 0, NULL); } return ret; } @@ -1118,7 +1128,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray) int free_idx; for (i = 0; i < addrinarray; i++) { - start = (unsigned long)page_address(pages[i]); + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) goto err_out; @@ -1131,7 +1143,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray) err_out: free_idx = i; for (i = 0; i < free_idx; i++) { - start = (unsigned long)page_address(pages[i]); + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; free_memtype(start, end); } @@ -1160,7 +1174,9 @@ int set_pages_array_wb(struct page **pages, int addrinarray) return retval; for (i = 0; i < addrinarray; i++) { - start = (unsigned long)page_address(pages[i]); + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; free_memtype(start, end); } diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index e6718bb2806..352aa9e927e 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -623,7 +623,8 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, return ret; if (flags != want_flags) { - if (strict_prot || !is_new_memtype_allowed(want_flags, flags)) { + if (strict_prot || + !is_new_memtype_allowed(paddr, size, want_flags, flags)) { free_memtype(paddr, paddr + size); printk(KERN_ERR "%s:%d map pfn expected mapping type %s" " for %Lx-%Lx, got %s\n", diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 7aa03a5389f..ed34f5e3599 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -4,9 +4,11 @@ #include <asm/tlb.h> #include <asm/fixmap.h> +#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO + pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); + return (pte_t *)__get_free_page(PGALLOC_GFP); } pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) @@ -14,16 +16,16 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) struct page *pte; #ifdef CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); + pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0); #else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); + pte = alloc_pages(PGALLOC_GFP, 0); #endif if (pte) pgtable_page_ctor(pte); return pte; } -void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) +void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pgtable_page_dtor(pte); paravirt_release_pte(page_to_pfn(pte)); @@ -31,14 +33,14 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) } #if PAGETABLE_LEVELS > 2 -void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); tlb_remove_page(tlb, virt_to_page(pmd)); } #if PAGETABLE_LEVELS > 3 -void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) +void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); tlb_remove_page(tlb, virt_to_page(pud)); @@ -161,7 +163,7 @@ static int preallocate_pmds(pmd_t *pmds[]) bool failed = false; for(i = 0; i < PREALLOCATED_PMDS; i++) { - pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); + pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP); if (pmd == NULL) failed = true; pmds[i] = pmd; @@ -228,7 +230,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pmd_t *pmds[PREALLOCATED_PMDS]; unsigned long flags; - pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); if (pgd == NULL) goto out; @@ -327,7 +329,6 @@ void __init reserve_top_address(unsigned long reserve) printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", (int)-reserve); __FIXADDR_TOP = -reserve - PAGE_SIZE; - __VMALLOC_RESERVE += reserve; #endif } diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 01765955baa..dbb5381f7b3 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -31,17 +31,11 @@ static nodemask_t nodes_parsed __initdata; static nodemask_t cpu_nodes_parsed __initdata; static struct bootnode nodes[MAX_NUMNODES] __initdata; static struct bootnode nodes_add[MAX_NUMNODES]; -static int found_add_area __initdata; -int hotadd_percent __initdata = 0; static int num_node_memblks __initdata; static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata; static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata; -/* Too small nodes confuse the VM badly. Usually they result - from BIOS bugs. */ -#define NODE_MIN_SIZE (4*1024*1024) - static __init int setup_node(int pxm) { return acpi_map_pxm_to_node(pxm); @@ -66,9 +60,6 @@ static __init void cutoff_node(int i, unsigned long start, unsigned long end) { struct bootnode *nd = &nodes[i]; - if (found_add_area) - return; - if (nd->start < start) { nd->start = start; if (nd->end < nd->start) @@ -86,11 +77,12 @@ static __init void bad_srat(void) int i; printk(KERN_ERR "SRAT: SRAT not used.\n"); acpi_numa = -1; - found_add_area = 0; for (i = 0; i < MAX_LOCAL_APIC; i++) apicid_to_node[i] = NUMA_NO_NODE; - for (i = 0; i < MAX_NUMNODES; i++) - nodes_add[i].start = nodes[i].end = 0; + for (i = 0; i < MAX_NUMNODES; i++) { + nodes[i].start = nodes[i].end = 0; + nodes_add[i].start = nodes_add[i].end = 0; + } remove_all_active_ranges(); } @@ -182,24 +174,21 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) pxm, apic_id, node); } -static int update_end_of_memory(unsigned long end) {return -1;} -static int hotadd_enough_memory(struct bootnode *nd) {return 1;} #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE static inline int save_add_info(void) {return 1;} #else static inline int save_add_info(void) {return 0;} #endif /* - * Update nodes_add and decide if to include add are in the zone. - * Both SPARSE and RESERVE need nodes_add information. - * This code supports one contiguous hot add area per node. + * Update nodes_add[] + * This code supports one contiguous hot add area per node */ -static int __init -reserve_hotadd(int node, unsigned long start, unsigned long end) +static void __init +update_nodes_add(int node, unsigned long start, unsigned long end) { unsigned long s_pfn = start >> PAGE_SHIFT; unsigned long e_pfn = end >> PAGE_SHIFT; - int ret = 0, changed = 0; + int changed = 0; struct bootnode *nd = &nodes_add[node]; /* I had some trouble with strange memory hotadd regions breaking @@ -210,7 +199,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end) mistakes */ if ((signed long)(end - start) < NODE_MIN_SIZE) { printk(KERN_ERR "SRAT: Hotplug area too small\n"); - return -1; + return; } /* This check might be a bit too strict, but I'm keeping it for now. */ @@ -218,12 +207,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end) printk(KERN_ERR "SRAT: Hotplug area %lu -> %lu has existing memory\n", s_pfn, e_pfn); - return -1; - } - - if (!hotadd_enough_memory(&nodes_add[node])) { - printk(KERN_ERR "SRAT: Hotplug area too large\n"); - return -1; + return; } /* Looks good */ @@ -245,11 +229,9 @@ reserve_hotadd(int node, unsigned long start, unsigned long end) printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); } - ret = update_end_of_memory(nd->end); - if (changed) - printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); - return ret; + printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", + nd->start, nd->end); } /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ @@ -310,13 +292,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) start, end); e820_register_active_regions(node, start >> PAGE_SHIFT, end >> PAGE_SHIFT); - push_node_boundaries(node, nd->start >> PAGE_SHIFT, - nd->end >> PAGE_SHIFT); - if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && - (reserve_hotadd(node, start, end) < 0)) { - /* Ignore hotadd region. Undo damage */ - printk(KERN_NOTICE "SRAT: Hotplug region ignored\n"); + if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { + update_nodes_add(node, start, end); + /* restore nodes[node] */ *nd = oldnode; if ((nd->start | nd->end) == 0) node_clear(node, nodes_parsed); @@ -345,9 +324,9 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) pxmram = 0; } - e820ram = max_pfn - absent_pages_in_range(0, max_pfn); - /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ - if ((long)(e820ram - pxmram) >= 1*1024*1024) { + e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT); + /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ + if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) { printk(KERN_ERR "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n", (pxmram << PAGE_SHIFT) >> 20, @@ -357,17 +336,6 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) return 1; } -static void __init unparse_node(int node) -{ - int i; - node_clear(node, nodes_parsed); - node_clear(node, cpu_nodes_parsed); - for (i = 0; i < MAX_LOCAL_APIC; i++) { - if (apicid_to_node[i] == node) - apicid_to_node[i] = NUMA_NO_NODE; - } -} - void __init acpi_numa_arch_fixup(void) {} /* Use the information discovered above to actually set up the nodes. */ @@ -379,18 +347,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) return -1; /* First clean up the node list */ - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < MAX_NUMNODES; i++) cutoff_node(i, start, end); - /* - * don't confuse VM with a node that doesn't have the - * minimum memory. - */ - if (nodes[i].end && - (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) { - unparse_node(i); - node_set_offline(i); - } - } if (!nodes_cover_memory(nodes)) { bad_srat(); @@ -423,7 +381,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) if (node == NUMA_NO_NODE) continue; - if (!node_isset(node, node_possible_map)) + if (!node_online(node)) numa_clear_node(i); } numa_init_array(); @@ -510,26 +468,6 @@ static int null_slit_node_compare(int a, int b) } #endif /* CONFIG_NUMA_EMU */ -void __init srat_reserve_add_area(int nodeid) -{ - if (found_add_area && nodes_add[nodeid].end) { - u64 total_mb; - - printk(KERN_INFO "SRAT: Reserving hot-add memory space " - "for node %d at %Lx-%Lx\n", - nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end); - total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start) - >> PAGE_SHIFT; - total_mb *= sizeof(struct page); - total_mb >>= 20; - printk(KERN_INFO "SRAT: This will cost you %Lu MB of " - "pre-allocated memory.\n", (unsigned long long)total_mb); - reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start, - nodes_add[nodeid].end - nodes_add[nodeid].start, - BOOTMEM_DEFAULT); - } -} - int __node_distance(int a, int b) { int index; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 821e97017e9..c814e144a3f 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -183,18 +183,17 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, f->flush_mm = mm; f->flush_va = va; - cpumask_andnot(to_cpumask(f->flush_cpumask), - cpumask, cpumask_of(smp_processor_id())); - - /* - * We have to send the IPI only to - * CPUs affected. - */ - apic->send_IPI_mask(to_cpumask(f->flush_cpumask), - INVALIDATE_TLB_VECTOR_START + sender); + if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) { + /* + * We have to send the IPI only to + * CPUs affected. + */ + apic->send_IPI_mask(to_cpumask(f->flush_cpumask), + INVALIDATE_TLB_VECTOR_START + sender); - while (!cpumask_empty(to_cpumask(f->flush_cpumask))) - cpu_relax(); + while (!cpumask_empty(to_cpumask(f->flush_cpumask))) + cpu_relax(); + } f->flush_mm = NULL; f->flush_va = 0; diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 202864ad49a..89b9a5cd63d 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -40,8 +40,9 @@ static int profile_exceptions_notify(struct notifier_block *self, switch (val) { case DIE_NMI: - if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu))) - ret = NOTIFY_STOP; + case DIE_NMI_IPI: + model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu)); + ret = NOTIFY_STOP; break; default: break; @@ -134,7 +135,7 @@ static void nmi_cpu_setup(void *dummy) static struct notifier_block profile_exceptions_nb = { .notifier_call = profile_exceptions_notify, .next = NULL, - .priority = 0 + .priority = 2 }; static int nmi_setup(void) @@ -356,14 +357,11 @@ static void exit_sysfs(void) #define exit_sysfs() do { } while (0) #endif /* CONFIG_PM */ -static int p4force; -module_param(p4force, int, 0); - static int __init p4_init(char **cpu_type) { __u8 cpu_model = boot_cpu_data.x86_model; - if (!p4force && (cpu_model > 6 || cpu_model == 5)) + if (cpu_model > 6 || cpu_model == 5) return 0; #ifndef CONFIG_SMP @@ -389,10 +387,25 @@ static int __init p4_init(char **cpu_type) return 0; } +static int force_arch_perfmon; +static int force_cpu_type(const char *str, struct kernel_param *kp) +{ + if (!strcmp(str, "arch_perfmon")) { + force_arch_perfmon = 1; + printk(KERN_INFO "oprofile: forcing architectural perfmon\n"); + } + + return 0; +} +module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0); + static int __init ppro_init(char **cpu_type) { __u8 cpu_model = boot_cpu_data.x86_model; + if (force_arch_perfmon && cpu_has_arch_perfmon) + return 0; + switch (cpu_model) { case 0 ... 2: *cpu_type = "i386/ppro"; @@ -414,6 +427,13 @@ static int __init ppro_init(char **cpu_type) case 15: case 23: *cpu_type = "i386/core_2"; break; + case 26: + arch_perfmon_setup_counters(); + *cpu_type = "i386/core_i7"; + break; + case 28: + *cpu_type = "i386/atom"; + break; default: /* Unknown */ return 0; diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 10131fbdaad..4da7230b3d1 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -18,7 +18,7 @@ #include <asm/msr.h> #include <asm/apic.h> #include <asm/nmi.h> -#include <asm/intel_arch_perfmon.h> +#include <asm/perf_counter.h> #include "op_x86_model.h" #include "op_counter.h" @@ -136,6 +136,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs, u64 val; int i; + /* + * This can happen if perf counters are in use when + * we steal the die notifier NMI. + */ + if (unlikely(!reset_value)) + goto out; + for (i = 0 ; i < num_counters; ++i) { if (!reset_value[i]) continue; @@ -146,6 +153,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs, } } +out: /* Only P6 based Pentium M need to re-unmask the apic vector but it * doesn't hurt other P6 variant */ apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index c0ecf250fe5..1014eb4bfc3 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -38,15 +38,26 @@ count_resource(struct acpi_resource *acpi_res, void *data) struct acpi_resource_address64 addr; acpi_status status; - if (info->res_num >= PCI_BUS_NUM_RESOURCES) - return AE_OK; - status = resource_to_addr(acpi_res, &addr); if (ACPI_SUCCESS(status)) info->res_num++; return AE_OK; } +static int +bus_has_transparent_bridge(struct pci_bus *bus) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, &bus->devices, bus_list) { + u16 class = dev->class >> 8; + + if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent) + return true; + } + return false; +} + static acpi_status setup_resource(struct acpi_resource *acpi_res, void *data) { @@ -56,9 +67,11 @@ setup_resource(struct acpi_resource *acpi_res, void *data) acpi_status status; unsigned long flags; struct resource *root; + int max_root_bus_resources = PCI_BUS_NUM_RESOURCES; + u64 start, end; - if (info->res_num >= PCI_BUS_NUM_RESOURCES) - return AE_OK; + if (bus_has_transparent_bridge(info->bus)) + max_root_bus_resources -= 3; status = resource_to_addr(acpi_res, &addr); if (!ACPI_SUCCESS(status)) @@ -75,11 +88,22 @@ setup_resource(struct acpi_resource *acpi_res, void *data) } else return AE_OK; + start = addr.minimum + addr.translation_offset; + end = start + addr.address_length - 1; + if (info->res_num >= max_root_bus_resources) { + printk(KERN_WARNING "PCI: Failed to allocate 0x%lx-0x%lx " + "from %s for %s due to _CRS returning more than " + "%d resource descriptors\n", (unsigned long) start, + (unsigned long) end, root->name, info->name, + max_root_bus_resources); + return AE_OK; + } + res = &info->res[info->res_num]; res->name = info->name; res->flags = flags; - res->start = addr.minimum + addr.translation_offset; - res->end = res->start + addr.address_length - 1; + res->start = start; + res->end = end; res->child = NULL; if (insert_resource(root, res)) { @@ -94,23 +118,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data) } static void -adjust_transparent_bridge_resources(struct pci_bus *bus) -{ - struct pci_dev *dev; - - list_for_each_entry(dev, &bus->devices, bus_list) { - int i; - u16 class = dev->class >> 8; - - if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent) { - for(i = 3; i < PCI_BUS_NUM_RESOURCES; i++) - dev->subordinate->resource[i] = - dev->bus->resource[i - 3]; - } - } -} - -static void get_current_resources(struct acpi_device *device, int busnum, int domain, struct pci_bus *bus) { @@ -137,8 +144,6 @@ get_current_resources(struct acpi_device *device, int busnum, info.res_num = 0; acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, &info); - if (info.res_num) - adjust_transparent_bridge_resources(bus); return; @@ -201,8 +206,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do */ memcpy(bus->sysdata, sd, sizeof(*sd)); kfree(sd); - } else - bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); + } else { + bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd); + if (bus) { + if (pci_probe & PCI_USE__CRS) + get_current_resources(device, busnum, domain, + bus); + bus->subordinate = pci_scan_child_bus(bus); + } + } if (!bus) kfree(sd); @@ -217,8 +229,6 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do #endif } - if (bus && (pci_probe & PCI_USE__CRS)) - get_current_resources(device, busnum, domain, bus); return bus; } diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index f893d6a6e80..3ffa10df20b 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -100,8 +100,9 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b) int j; struct pci_root_info *info; - /* don't go for it if _CRS is used */ - if (pci_probe & PCI_USE__CRS) + /* don't go for it if _CRS is used already */ + if (b->resource[0] != &ioport_resource || + b->resource[1] != &iomem_resource) return; /* if only one root bus, don't need to anything */ @@ -116,6 +117,9 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b) if (i == pci_root_num) return; + printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n", + b->number); + info = &pci_root_info[i]; for (j = 0; j < info->res_num; j++) { struct resource *res; diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index a85bef20a3b..52e62e57fed 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -35,6 +35,7 @@ #include <asm/pat.h> #include <asm/e820.h> #include <asm/pci_x86.h> +#include <asm/io_apic.h> static int @@ -116,7 +117,7 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) struct pci_bus *bus; struct pci_dev *dev; int idx; - struct resource *r, *pr; + struct resource *r; /* Depth-First Search on bus tree */ list_for_each_entry(bus, bus_list, node) { @@ -126,9 +127,8 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) r = &dev->resource[idx]; if (!r->flags) continue; - pr = pci_find_parent_resource(dev, r); - if (!r->start || !pr || - request_resource(pr, r) < 0) { + if (!r->start || + pci_claim_resource(dev, idx) < 0) { dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); /* * Something is wrong with the region. @@ -149,7 +149,7 @@ static void __init pcibios_allocate_resources(int pass) struct pci_dev *dev = NULL; int idx, disabled; u16 command; - struct resource *r, *pr; + struct resource *r; for_each_pci_dev(dev) { pci_read_config_word(dev, PCI_COMMAND, &command); @@ -168,8 +168,7 @@ static void __init pcibios_allocate_resources(int pass) (unsigned long long) r->start, (unsigned long long) r->end, r->flags, disabled, pass); - pr = pci_find_parent_resource(dev, r); - if (!pr || request_resource(pr, r) < 0) { + if (pci_claim_resource(dev, idx) < 0) { dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); /* We'll assign a new address later */ r->end -= r->start; @@ -197,7 +196,7 @@ static void __init pcibios_allocate_resources(int pass) static int __init pcibios_assign_resources(void) { struct pci_dev *dev = NULL; - struct resource *r, *pr; + struct resource *r; if (!(pci_probe & PCI_ASSIGN_ROMS)) { /* @@ -209,8 +208,7 @@ static int __init pcibios_assign_resources(void) r = &dev->resource[PCI_ROM_RESOURCE]; if (!r->flags || !r->start) continue; - pr = pci_find_parent_resource(dev, r); - if (!pr || request_resource(pr, r) < 0) { + if (pci_claim_resource(dev, PCI_ROM_RESOURCE) < 0) { r->end -= r->start; r->start = 0; } @@ -230,6 +228,12 @@ void __init pcibios_resource_survey(void) pcibios_allocate_resources(1); e820_reserve_resources_late(); + /* + * Insert the IO APIC resources after PCI initialization has + * occured to handle IO APICS that are mapped in on a BAR in + * PCI space, but before trying to assign unassigned pci res. + */ + ioapic_insert_resources(); } /** diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index fecbce6e7d7..0696d506c4a 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -889,6 +889,9 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) return 0; } + if (io_apic_assign_pci_irqs) + return 0; + /* Find IRQ routing entry */ if (!pirq_table) @@ -1039,56 +1042,15 @@ static void __init pcibios_fixup_irqs(void) pirq_penalty[dev->irq]++; } + if (io_apic_assign_pci_irqs) + return; + dev = NULL; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); if (!pin) continue; -#ifdef CONFIG_X86_IO_APIC - /* - * Recalculate IRQ numbers if we use the I/O APIC. - */ - if (io_apic_assign_pci_irqs) { - int irq; - - /* - * interrupt pins are numbered starting from 1 - */ - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, - PCI_SLOT(dev->devfn), pin - 1); - /* - * Busses behind bridges are typically not listed in the - * MP-table. In this case we have to look up the IRQ - * based on the parent bus, parent slot, and pin number. - * The SMP code detects such bridged busses itself so we - * should get into this branch reliably. - */ - if (irq < 0 && dev->bus->parent) { - /* go back to the bridge */ - struct pci_dev *bridge = dev->bus->self; - int bus; - - pin = pci_swizzle_interrupt_pin(dev, pin); - bus = bridge->bus->number; - irq = IO_APIC_get_PCI_irq_vector(bus, - PCI_SLOT(bridge->devfn), pin - 1); - if (irq >= 0) - dev_warn(&dev->dev, - "using bridge %s INT %c to " - "get IRQ %d\n", - pci_name(bridge), - 'A' + pin - 1, irq); - } - if (irq >= 0) { - dev_info(&dev->dev, - "PCI->APIC IRQ transform: INT %c " - "-> IRQ %d\n", - 'A' + pin - 1, irq); - dev->irq = irq; - } - } -#endif /* * Still no IRQ? Try to lookup one... */ @@ -1183,6 +1145,19 @@ int __init pcibios_irq_init(void) pcibios_enable_irq = pirq_enable_irq; pcibios_fixup_irqs(); + + if (io_apic_assign_pci_irqs && pci_routeirq) { + struct pci_dev *dev = NULL; + /* + * PCI IRQ routing is set up by pci_enable_device(), but we + * also do it here in case there are still broken drivers that + * don't use pci_enable_device(). + */ + printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n"); + for_each_pci_dev(dev) + pirq_enable_irq(dev); + } + return 0; } @@ -1213,16 +1188,23 @@ void pcibios_penalize_isa_irq(int irq, int active) static int pirq_enable_irq(struct pci_dev *dev) { u8 pin; - struct pci_dev *temp_dev; pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); - if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) { + if (pin && !pcibios_lookup_irq(dev, 1)) { char *msg = ""; + if (!io_apic_assign_pci_irqs && dev->irq) + return 0; + if (io_apic_assign_pci_irqs) { +#ifdef CONFIG_X86_IO_APIC + struct pci_dev *temp_dev; int irq; + struct io_apic_irq_attr irq_attr; - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin - 1); + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, + PCI_SLOT(dev->devfn), + pin - 1, &irq_attr); /* * Busses behind bridges are typically not listed in the MP-table. * In this case we have to look up the IRQ based on the parent bus, @@ -1235,7 +1217,8 @@ static int pirq_enable_irq(struct pci_dev *dev) pin = pci_swizzle_interrupt_pin(dev, pin); irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, - PCI_SLOT(bridge->devfn), pin - 1); + PCI_SLOT(bridge->devfn), + pin - 1, &irq_attr); if (irq >= 0) dev_warn(&dev->dev, "using bridge %s " "INT %c to get IRQ %d\n", @@ -1245,12 +1228,15 @@ static int pirq_enable_irq(struct pci_dev *dev) } dev = temp_dev; if (irq >= 0) { + io_apic_set_pci_routing(&dev->dev, irq, + &irq_attr); + dev->irq = irq; dev_info(&dev->dev, "PCI->APIC IRQ transform: " "INT %c -> IRQ %d\n", 'A' + pin - 1, irq); - dev->irq = irq; return 0; } else msg = "; probably buggy MP table"; +#endif } else if (pci_probe & PCI_BIOS_IRQ_SCAN) msg = ""; else diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 8766b0e216c..712443ec6d4 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -523,6 +523,69 @@ reject: static int __initdata known_bridge; +static int acpi_mcfg_64bit_base_addr __initdata = FALSE; + +/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ +struct acpi_mcfg_allocation *pci_mmcfg_config; +int pci_mmcfg_config_num; + +static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg) +{ + if (!strcmp(mcfg->header.oem_id, "SGI")) + acpi_mcfg_64bit_base_addr = TRUE; + + return 0; +} + +static int __init pci_parse_mcfg(struct acpi_table_header *header) +{ + struct acpi_table_mcfg *mcfg; + unsigned long i; + int config_size; + + if (!header) + return -EINVAL; + + mcfg = (struct acpi_table_mcfg *)header; + + /* how many config structures do we have */ + pci_mmcfg_config_num = 0; + i = header->length - sizeof(struct acpi_table_mcfg); + while (i >= sizeof(struct acpi_mcfg_allocation)) { + ++pci_mmcfg_config_num; + i -= sizeof(struct acpi_mcfg_allocation); + }; + if (pci_mmcfg_config_num == 0) { + printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); + return -ENODEV; + } + + config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config); + pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL); + if (!pci_mmcfg_config) { + printk(KERN_WARNING PREFIX + "No memory for MCFG config tables\n"); + return -ENOMEM; + } + + memcpy(pci_mmcfg_config, &mcfg[1], config_size); + + acpi_mcfg_oem_check(mcfg); + + for (i = 0; i < pci_mmcfg_config_num; ++i) { + if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) && + !acpi_mcfg_64bit_base_addr) { + printk(KERN_ERR PREFIX + "MMCONFIG not in low 4GB of memory\n"); + kfree(pci_mmcfg_config); + pci_mmcfg_config_num = 0; + return -ENODEV; + } + } + + return 0; +} + static void __init __pci_mmcfg_init(int early) { /* MMCONFIG disabled */ @@ -543,7 +606,7 @@ static void __init __pci_mmcfg_init(int early) } if (!known_bridge) - acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); + acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); pci_mmcfg_reject_broken(early); diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index 58b32db3312..a6a198c3362 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile @@ -1,7 +1,7 @@ # __restore_processor_state() restores %gs after S3 resume and so should not # itself be stack-protected nostackp := $(call cc-option, -fno-stack-protector) -CFLAGS_cpu_$(BITS).o := $(nostackp) +CFLAGS_cpu.o := $(nostackp) -obj-$(CONFIG_PM_SLEEP) += cpu_$(BITS).o +obj-$(CONFIG_PM_SLEEP) += cpu.o obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu.c index 5343540f260..b3d20b9cac6 100644 --- a/arch/x86/power/cpu_64.c +++ b/arch/x86/power/cpu.c @@ -1,5 +1,5 @@ /* - * Suspend and hibernation support for x86-64 + * Suspend support specific for i386/x86-64. * * Distribute under GPLv2 * @@ -8,18 +8,28 @@ * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> */ -#include <linux/smp.h> #include <linux/suspend.h> -#include <asm/proto.h> -#include <asm/page.h> +#include <linux/smp.h> + #include <asm/pgtable.h> +#include <asm/proto.h> #include <asm/mtrr.h> +#include <asm/page.h> +#include <asm/mce.h> #include <asm/xcr.h> #include <asm/suspend.h> -static void fix_processor_context(void); +#ifdef CONFIG_X86_32 +static struct saved_context saved_context; +unsigned long saved_context_ebx; +unsigned long saved_context_esp, saved_context_ebp; +unsigned long saved_context_esi, saved_context_edi; +unsigned long saved_context_eflags; +#else +/* CONFIG_X86_64 */ struct saved_context saved_context; +#endif /** * __save_processor_state - save CPU registers before creating a @@ -38,19 +48,35 @@ struct saved_context saved_context; */ static void __save_processor_state(struct saved_context *ctxt) { +#ifdef CONFIG_X86_32 + mtrr_save_fixed_ranges(NULL); +#endif kernel_fpu_begin(); /* * descriptor tables */ +#ifdef CONFIG_X86_32 + store_gdt(&ctxt->gdt); + store_idt(&ctxt->idt); +#else +/* CONFIG_X86_64 */ store_gdt((struct desc_ptr *)&ctxt->gdt_limit); store_idt((struct desc_ptr *)&ctxt->idt_limit); +#endif store_tr(ctxt->tr); /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ /* * segment registers */ +#ifdef CONFIG_X86_32 + savesegment(es, ctxt->es); + savesegment(fs, ctxt->fs); + savesegment(gs, ctxt->gs); + savesegment(ss, ctxt->ss); +#else +/* CONFIG_X86_64 */ asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); @@ -62,30 +88,87 @@ static void __save_processor_state(struct saved_context *ctxt) rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); mtrr_save_fixed_ranges(NULL); + rdmsrl(MSR_EFER, ctxt->efer); +#endif + /* * control registers */ - rdmsrl(MSR_EFER, ctxt->efer); ctxt->cr0 = read_cr0(); ctxt->cr2 = read_cr2(); ctxt->cr3 = read_cr3(); +#ifdef CONFIG_X86_32 + ctxt->cr4 = read_cr4_safe(); +#else +/* CONFIG_X86_64 */ ctxt->cr4 = read_cr4(); ctxt->cr8 = read_cr8(); +#endif } +/* Needed by apm.c */ void save_processor_state(void) { __save_processor_state(&saved_context); } +#ifdef CONFIG_X86_32 +EXPORT_SYMBOL(save_processor_state); +#endif static void do_fpu_end(void) { /* - * Restore FPU regs if necessary + * Restore FPU regs if necessary. */ kernel_fpu_end(); } +static void fix_processor_context(void) +{ + int cpu = smp_processor_id(); + struct tss_struct *t = &per_cpu(init_tss, cpu); + + set_tss_desc(cpu, t); /* + * This just modifies memory; should not be + * necessary. But... This is necessary, because + * 386 hardware has concept of busy TSS or some + * similar stupidity. + */ + +#ifdef CONFIG_X86_64 + get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9; + + syscall_init(); /* This sets MSR_*STAR and related */ +#endif + load_TR_desc(); /* This does ltr */ + load_LDT(¤t->active_mm->context); /* This does lldt */ + + /* + * Now maybe reload the debug registers + */ + if (current->thread.debugreg7) { +#ifdef CONFIG_X86_32 + set_debugreg(current->thread.debugreg0, 0); + set_debugreg(current->thread.debugreg1, 1); + set_debugreg(current->thread.debugreg2, 2); + set_debugreg(current->thread.debugreg3, 3); + /* no 4 and 5 */ + set_debugreg(current->thread.debugreg6, 6); + set_debugreg(current->thread.debugreg7, 7); +#else + /* CONFIG_X86_64 */ + loaddebug(¤t->thread, 0); + loaddebug(¤t->thread, 1); + loaddebug(¤t->thread, 2); + loaddebug(¤t->thread, 3); + /* no 4 and 5 */ + loaddebug(¤t->thread, 6); + loaddebug(¤t->thread, 7); +#endif + } + +} + /** * __restore_processor_state - restore the contents of CPU registers saved * by __save_processor_state() @@ -96,9 +179,16 @@ static void __restore_processor_state(struct saved_context *ctxt) /* * control registers */ + /* cr4 was introduced in the Pentium CPU */ +#ifdef CONFIG_X86_32 + if (ctxt->cr4) + write_cr4(ctxt->cr4); +#else +/* CONFIG X86_64 */ wrmsrl(MSR_EFER, ctxt->efer); write_cr8(ctxt->cr8); write_cr4(ctxt->cr4); +#endif write_cr3(ctxt->cr3); write_cr2(ctxt->cr2); write_cr0(ctxt->cr0); @@ -107,13 +197,31 @@ static void __restore_processor_state(struct saved_context *ctxt) * now restore the descriptor tables to their proper values * ltr is done i fix_processor_context(). */ +#ifdef CONFIG_X86_32 + load_gdt(&ctxt->gdt); + load_idt(&ctxt->idt); +#else +/* CONFIG_X86_64 */ load_gdt((const struct desc_ptr *)&ctxt->gdt_limit); load_idt((const struct desc_ptr *)&ctxt->idt_limit); - +#endif /* * segment registers */ +#ifdef CONFIG_X86_32 + loadsegment(es, ctxt->es); + loadsegment(fs, ctxt->fs); + loadsegment(gs, ctxt->gs); + loadsegment(ss, ctxt->ss); + + /* + * sysenter MSRs + */ + if (boot_cpu_has(X86_FEATURE_SEP)) + enable_sep_cpu(); +#else +/* CONFIG_X86_64 */ asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); @@ -123,6 +231,7 @@ static void __restore_processor_state(struct saved_context *ctxt) wrmsrl(MSR_FS_BASE, ctxt->fs_base); wrmsrl(MSR_GS_BASE, ctxt->gs_base); wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); +#endif /* * restore XCR0 for xsave capable cpu's. @@ -134,41 +243,17 @@ static void __restore_processor_state(struct saved_context *ctxt) do_fpu_end(); mtrr_ap_init(); + +#ifdef CONFIG_X86_OLD_MCE + mcheck_init(&boot_cpu_data); +#endif } +/* Needed by apm.c */ void restore_processor_state(void) { __restore_processor_state(&saved_context); } - -static void fix_processor_context(void) -{ - int cpu = smp_processor_id(); - struct tss_struct *t = &per_cpu(init_tss, cpu); - - /* - * This just modifies memory; should not be necessary. But... This - * is necessary, because 386 hardware has concept of busy TSS or some - * similar stupidity. - */ - set_tss_desc(cpu, t); - - get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9; - - syscall_init(); /* This sets MSR_*STAR and related */ - load_TR_desc(); /* This does ltr */ - load_LDT(¤t->active_mm->context); /* This does lldt */ - - /* - * Now maybe reload the debug registers - */ - if (current->thread.debugreg7){ - loaddebug(¤t->thread, 0); - loaddebug(¤t->thread, 1); - loaddebug(¤t->thread, 2); - loaddebug(¤t->thread, 3); - /* no 4 and 5 */ - loaddebug(¤t->thread, 6); - loaddebug(¤t->thread, 7); - } -} +#ifdef CONFIG_X86_32 +EXPORT_SYMBOL(restore_processor_state); +#endif diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c deleted file mode 100644 index ce702c5b3a2..00000000000 --- a/arch/x86/power/cpu_32.c +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Suspend support specific for i386. - * - * Distribute under GPLv2 - * - * Copyright (c) 2002 Pavel Machek <pavel@suse.cz> - * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> - */ - -#include <linux/module.h> -#include <linux/suspend.h> -#include <asm/mtrr.h> -#include <asm/mce.h> -#include <asm/xcr.h> -#include <asm/suspend.h> - -static struct saved_context saved_context; - -unsigned long saved_context_ebx; -unsigned long saved_context_esp, saved_context_ebp; -unsigned long saved_context_esi, saved_context_edi; -unsigned long saved_context_eflags; - -static void __save_processor_state(struct saved_context *ctxt) -{ - mtrr_save_fixed_ranges(NULL); - kernel_fpu_begin(); - - /* - * descriptor tables - */ - store_gdt(&ctxt->gdt); - store_idt(&ctxt->idt); - store_tr(ctxt->tr); - - /* - * segment registers - */ - savesegment(es, ctxt->es); - savesegment(fs, ctxt->fs); - savesegment(gs, ctxt->gs); - savesegment(ss, ctxt->ss); - - /* - * control registers - */ - ctxt->cr0 = read_cr0(); - ctxt->cr2 = read_cr2(); - ctxt->cr3 = read_cr3(); - ctxt->cr4 = read_cr4_safe(); -} - -/* Needed by apm.c */ -void save_processor_state(void) -{ - __save_processor_state(&saved_context); -} -EXPORT_SYMBOL(save_processor_state); - -static void do_fpu_end(void) -{ - /* - * Restore FPU regs if necessary. - */ - kernel_fpu_end(); -} - -static void fix_processor_context(void) -{ - int cpu = smp_processor_id(); - struct tss_struct *t = &per_cpu(init_tss, cpu); - - set_tss_desc(cpu, t); /* - * This just modifies memory; should not be - * necessary. But... This is necessary, because - * 386 hardware has concept of busy TSS or some - * similar stupidity. - */ - - load_TR_desc(); /* This does ltr */ - load_LDT(¤t->active_mm->context); /* This does lldt */ - - /* - * Now maybe reload the debug registers - */ - if (current->thread.debugreg7) { - set_debugreg(current->thread.debugreg0, 0); - set_debugreg(current->thread.debugreg1, 1); - set_debugreg(current->thread.debugreg2, 2); - set_debugreg(current->thread.debugreg3, 3); - /* no 4 and 5 */ - set_debugreg(current->thread.debugreg6, 6); - set_debugreg(current->thread.debugreg7, 7); - } - -} - -static void __restore_processor_state(struct saved_context *ctxt) -{ - /* - * control registers - */ - /* cr4 was introduced in the Pentium CPU */ - if (ctxt->cr4) - write_cr4(ctxt->cr4); - write_cr3(ctxt->cr3); - write_cr2(ctxt->cr2); - write_cr0(ctxt->cr0); - - /* - * now restore the descriptor tables to their proper values - * ltr is done i fix_processor_context(). - */ - load_gdt(&ctxt->gdt); - load_idt(&ctxt->idt); - - /* - * segment registers - */ - loadsegment(es, ctxt->es); - loadsegment(fs, ctxt->fs); - loadsegment(gs, ctxt->gs); - loadsegment(ss, ctxt->ss); - - /* - * sysenter MSRs - */ - if (boot_cpu_has(X86_FEATURE_SEP)) - enable_sep_cpu(); - - /* - * restore XCR0 for xsave capable cpu's. - */ - if (cpu_has_xsave) - xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); - - fix_processor_context(); - do_fpu_end(); - mtrr_ap_init(); - mcheck_init(&boot_cpu_data); -} - -/* Needed by apm.c */ -void restore_processor_state(void) -{ - __restore_processor_state(&saved_context); -} -EXPORT_SYMBOL(restore_processor_state); diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 16a9020c8f1..88112b49f02 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -123,6 +123,7 @@ quiet_cmd_vdso = VDSO $@ -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) VDSO_LDFLAGS = -fPIC -shared $(call ld-option, -Wl$(comma)--hash-style=sysv) +GCOV_PROFILE := n # # Install the unstripped copy of vdso*.so listed in $(vdso-install-y). diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 1241f118ab5..58bc00f68b1 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -338,6 +338,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) } } + current->mm->context.vdso = (void *)addr; + if (compat_uses_vma || !compat) { /* * MAYWRITE to allow gdb to COW and set breakpoints @@ -358,11 +360,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) goto up_fail; } - current->mm->context.vdso = (void *)addr; current_thread_info()->sysenter_return = VDSO32_SYMBOL(addr, SYSENTER_RETURN); up_fail: + if (ret) + current->mm->context.vdso = NULL; + up_write(&mm->mmap_sem); return ret; diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 7133cdf9098..21e1aeb9f3e 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -8,6 +8,7 @@ #include <linux/sched.h> #include <linux/init.h> #include <linux/random.h> +#include <linux/elf.h> #include <asm/vsyscall.h> #include <asm/vgtod.h> #include <asm/proto.h> @@ -115,15 +116,18 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) goto up_fail; } + current->mm->context.vdso = (void *)addr; + ret = install_special_mapping(mm, addr, vdso_size, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| VM_ALWAYSDUMP, vdso_pages); - if (ret) + if (ret) { + current->mm->context.vdso = NULL; goto up_fail; + } - current->mm->context.vdso = (void *)addr; up_fail: up_write(&mm->mmap_sem); return ret; diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 172438f86a0..7410640db17 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -5,6 +5,10 @@ CFLAGS_REMOVE_time.o = -pg CFLAGS_REMOVE_irq.o = -pg endif +# Make sure early boot has no stackprotector +nostackp := $(call cc-option, -fno-stack-protector) +CFLAGS_enlighten.o := $(nostackp) + obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ grant-table.o suspend.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index f09e8c36ee8..e90540a46a0 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -20,6 +20,7 @@ #include <linux/delay.h> #include <linux/start_kernel.h> #include <linux/sched.h> +#include <linux/kprobes.h> #include <linux/bootmem.h> #include <linux/module.h> #include <linux/mm.h> @@ -44,6 +45,7 @@ #include <asm/processor.h> #include <asm/proto.h> #include <asm/msr-index.h> +#include <asm/traps.h> #include <asm/setup.h> #include <asm/desc.h> #include <asm/pgtable.h> @@ -240,10 +242,10 @@ static unsigned long xen_get_debugreg(int reg) return HYPERVISOR_get_debugreg(reg); } -void xen_leave_lazy(void) +static void xen_end_context_switch(struct task_struct *next) { - paravirt_leave_lazy(paravirt_get_lazy_mode()); xen_mc_flush(); + paravirt_end_context_switch(next); } static unsigned long xen_store_tr(void) @@ -428,11 +430,44 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, static int cvt_gate_to_trap(int vector, const gate_desc *val, struct trap_info *info) { + unsigned long addr; + if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) return 0; info->vector = vector; - info->address = gate_offset(*val); + + addr = gate_offset(*val); +#ifdef CONFIG_X86_64 + /* + * Look for known traps using IST, and substitute them + * appropriately. The debugger ones are the only ones we care + * about. Xen will handle faults like double_fault and + * machine_check, so we should never see them. Warn if + * there's an unexpected IST-using fault handler. + */ + if (addr == (unsigned long)debug) + addr = (unsigned long)xen_debug; + else if (addr == (unsigned long)int3) + addr = (unsigned long)xen_int3; + else if (addr == (unsigned long)stack_segment) + addr = (unsigned long)xen_stack_segment; + else if (addr == (unsigned long)double_fault || + addr == (unsigned long)nmi) { + /* Don't need to handle these */ + return 0; +#ifdef CONFIG_X86_MCE + } else if (addr == (unsigned long)machine_check) { + return 0; +#endif + } else { + /* Some other trap using IST? */ + if (WARN_ON(val->ist != 0)) + return 0; + } +#endif /* CONFIG_X86_64 */ + info->address = addr; + info->cs = gate_segment(*val); info->flags = val->dpl; /* interrupt gates clear IF */ @@ -623,10 +658,26 @@ static void xen_clts(void) xen_mc_issue(PARAVIRT_LAZY_CPU); } +static DEFINE_PER_CPU(unsigned long, xen_cr0_value); + +static unsigned long xen_read_cr0(void) +{ + unsigned long cr0 = percpu_read(xen_cr0_value); + + if (unlikely(cr0 == 0)) { + cr0 = native_read_cr0(); + percpu_write(xen_cr0_value, cr0); + } + + return cr0; +} + static void xen_write_cr0(unsigned long cr0) { struct multicall_space mcs; + percpu_write(xen_cr0_value, cr0); + /* Only pay attention to cr0.TS; everything else is ignored. */ mcs = xen_mc_entry(0); @@ -812,7 +863,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .clts = xen_clts, - .read_cr0 = native_read_cr0, + .read_cr0 = xen_read_cr0, .write_cr0 = xen_write_cr0, .read_cr4 = native_read_cr4, @@ -860,10 +911,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { /* Xen takes care of %gs when switching to usermode for us */ .swapgs = paravirt_nop, - .lazy_mode = { - .enter = paravirt_enter_lazy_cpu, - .leave = xen_leave_lazy, - }, + .start_context_switch = paravirt_start_context_switch, + .end_context_switch = xen_end_context_switch, }; static const struct pv_apic_ops xen_apic_ops __initdata = { @@ -925,10 +974,6 @@ asmlinkage void __init xen_start_kernel(void) xen_domain_type = XEN_PV_DOMAIN; - BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); - - xen_setup_features(); - /* Install Xen paravirt ops */ pv_info = xen_info; pv_init_ops = xen_init_ops; @@ -937,8 +982,15 @@ asmlinkage void __init xen_start_kernel(void) pv_apic_ops = xen_apic_ops; pv_mmu_ops = xen_mmu_ops; - xen_init_irq_ops(); +#ifdef CONFIG_X86_64 + /* + * Setup percpu state. We only need to do this for 64-bit + * because 32-bit already has %fs set properly. + */ + load_percpu_segment(0); +#endif + xen_init_irq_ops(); xen_init_cpuid_mask(); #ifdef CONFIG_X86_LOCAL_APIC @@ -948,6 +1000,8 @@ asmlinkage void __init xen_start_kernel(void) set_xen_basic_apic_ops(); #endif + xen_setup_features(); + if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; @@ -955,13 +1009,6 @@ asmlinkage void __init xen_start_kernel(void) machine_ops = xen_machine_ops; -#ifdef CONFIG_X86_64 - /* - * Setup percpu state. We only need to do this for 64-bit - * because 32-bit already has %fs set properly. - */ - load_percpu_segment(0); -#endif /* * The only reliable way to retain the initial address of the * percpu gdt_page is to remember it here, so we can go and diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index fba55b1a402..4ceb2858165 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -452,10 +452,6 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { - /* updates to init_mm may be done without lock */ - if (mm == &init_mm) - preempt_disable(); - ADD_STATS(set_pte_at, 1); // ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); ADD_STATS(set_pte_at_current, mm == current->mm); @@ -476,9 +472,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, } xen_set_pte(ptep, pteval); -out: - if (mm == &init_mm) - preempt_enable(); +out: return; } pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, @@ -1152,10 +1146,8 @@ static void drop_other_mm_ref(void *info) /* If this cpu still has a stale cr3 reference, then make sure it has been flushed. */ - if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) { + if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) load_cr3(swapper_pg_dir); - arch_flush_lazy_cpu_mode(); - } } static void xen_drop_mm_ref(struct mm_struct *mm) @@ -1168,7 +1160,6 @@ static void xen_drop_mm_ref(struct mm_struct *mm) load_cr3(swapper_pg_dir); else leave_mm(smp_processor_id()); - arch_flush_lazy_cpu_mode(); } /* Get the "official" set of cpus referring to our pagetable. */ @@ -1876,6 +1867,14 @@ __init void xen_post_allocator_init(void) xen_mark_init_mm_pinned(); } +static void xen_leave_lazy_mmu(void) +{ + preempt_disable(); + xen_mc_flush(); + paravirt_leave_lazy_mmu(); + preempt_enable(); +} + const struct pv_mmu_ops xen_mmu_ops __initdata = { .pagetable_setup_start = xen_pagetable_setup_start, .pagetable_setup_done = xen_pagetable_setup_done, @@ -1949,7 +1948,7 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = { .lazy_mode = { .enter = paravirt_enter_lazy_mmu, - .leave = xen_leave_lazy, + .leave = xen_leave_lazy_mmu, }, .set_fixmap = xen_set_fixmap, diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 15c6c68db6a..ad0047f47cd 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -61,9 +61,9 @@ char * __init xen_memory_setup(void) * - xen_start_info * See comment above "struct start_info" in <xen/interface/xen.h> */ - e820_add_region(__pa(xen_start_info->mfn_list), - xen_start_info->pt_base - xen_start_info->mfn_list, - E820_RESERVED); + reserve_early(__pa(xen_start_info->mfn_list), + __pa(xen_start_info->pt_base), + "XEN START INFO"); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index ca6596b05d5..22494fd4c9b 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -30,7 +30,6 @@ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_ident_map_ISA(void); void xen_reserve_top(void); -void xen_leave_lazy(void); void xen_post_allocator_init(void); char * __init xen_memory_setup(void); |