diff options
author | H. Peter Anvin <hpa@zytor.com> | 2010-04-29 16:53:17 -0700 |
---|---|---|
committer | H. Peter Anvin <hpa@zytor.com> | 2010-04-29 16:53:17 -0700 |
commit | d9c5841e22231e4e49fd0a1004164e6fce59b7a6 (patch) | |
tree | e1f589c46b3ff79bbe7b1b2469f6362f94576da6 /arch/x86 | |
parent | b701a47ba48b698976fb2fe05fb285b0edc1d26a (diff) | |
parent | 5967ed87ade85a421ef814296c3c7f182b08c225 (diff) |
Merge branch 'x86/asm' into x86/atomic
Merge reason:
Conflict between LOCK_PREFIX_HERE and relative alternatives
pointers
Resolved Conflicts:
arch/x86/include/asm/alternative.h
arch/x86/kernel/alternative.c
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Diffstat (limited to 'arch/x86')
298 files changed, 10290 insertions, 6602 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 55298e89157..9458685902b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -31,6 +31,7 @@ config X86 select ARCH_WANT_FRAME_POINTERS select HAVE_DMA_ATTRS select HAVE_KRETPROBES + select HAVE_OPTPROBES select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER @@ -45,10 +46,12 @@ config X86 select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS select USER_STACKTRACE_SUPPORT + select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_DMA_API_DEBUG select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA + select HAVE_KERNEL_LZO select HAVE_HW_BREAKPOINT select PERF_EVENTS select ANON_INODES @@ -99,6 +102,9 @@ config ZONE_DMA config SBUS bool +config NEED_DMA_MAP_STATE + def_bool (X86_64 || DMAR || DMA_API_DEBUG) + config GENERIC_ISA_DMA def_bool y @@ -182,6 +188,9 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y +config HAVE_EARLY_RES + def_bool y + config HAVE_INTEL_TXT def_bool y depends on EXPERIMENTAL && DMAR && ACPI @@ -387,8 +396,12 @@ config X86_ELAN config X86_MRST bool "Moorestown MID platform" + depends on PCI + depends on PCI_GOANY depends on X86_32 depends on X86_EXTENDED_PLATFORM + depends on X86_IO_APIC + select APB_TIMER ---help--- Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin Internet Device(MID) platform. Moorestown consists of two chips: @@ -423,6 +436,7 @@ config X86_32_NON_STANDARD config X86_NUMAQ bool "NUMAQ (IBM/Sequent)" depends on X86_32_NON_STANDARD + depends on PCI select NUMA select X86_MPPARSE ---help--- @@ -567,6 +581,18 @@ config PARAVIRT_DEBUG Enable to debug paravirt_ops internals. Specifically, BUG if a paravirt_op is missing when it is called. +config NO_BOOTMEM + default y + bool "Disable Bootmem code" + ---help--- + Use early_res directly instead of bootmem before slab is ready. + - allocator (buddy) [generic] + - early allocator (bootmem) [generic] + - very early allocator (reserve_early*()) [x86] + - very very early allocator (early brk model) [x86] + So reduce one layer between early allocator to final allocator + + config MEMTEST bool "Memtest" ---help--- @@ -611,6 +637,16 @@ config HPET_EMULATE_RTC def_bool y depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) +config APB_TIMER + def_bool y if MRST + prompt "Langwell APB Timer Support" if X86_MRST + help + APB timer is the replacement for 8254, HPET on X86 MID platforms. + The APBT provides a stable time base on SMP + systems, unlike the TSC, but it is more expensive to access, + as it is off-chip. APB timers are always running regardless of CPU + C states, they are used as per CPU clockevent device when possible. + # Mark as embedded because too many people got it wrong. # The code disables itself when not needed. config DMI @@ -626,7 +662,7 @@ config GART_IOMMU bool "GART IOMMU support" if EMBEDDED default y select SWIOTLB - depends on X86_64 && PCI + depends on X86_64 && PCI && K8_NB ---help--- Support for full DMA access of devices with 32bit memory access only on systems with more than 3GB. This is usually needed for USB, @@ -988,12 +1024,6 @@ config X86_CPUID with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to /dev/cpu/31/cpuid. -config X86_CPU_DEBUG - tristate "/sys/kernel/debug/x86/cpu/* - CPU Debug support" - ---help--- - If you select this option, this will provide various x86 CPUs - information through debugfs. - choice prompt "High Memory Support" default HIGHMEM4G if !X86_NUMAQ @@ -1186,8 +1216,8 @@ config NUMA_EMU config NODES_SHIFT int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP - range 1 9 - default "9" if MAXSMP + range 1 10 + default "10" if MAXSMP default "6" if X86_64 default "4" if X86_NUMAQ default "3" @@ -1246,6 +1276,11 @@ config ARCH_MEMORY_PROBE def_bool X86_64 depends on MEMORY_HOTPLUG +config ILLEGAL_POINTER_VALUE + hex + default 0 if X86_32 + default 0xdead000000000000 if X86_64 + source "mm/Kconfig" config HIGHPTE @@ -2026,7 +2061,7 @@ endif # X86_32 config K8_NB def_bool y - depends on AGP_AMD64 || (X86_64 && (GART_IOMMU || (PCI && NUMA))) + depends on CPU_SUP_AMD && PCI source "drivers/pcmcia/Kconfig" diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 08e442bc3ab..a19829374e6 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -319,7 +319,7 @@ config X86_L1_CACHE_SHIFT config X86_XADD def_bool y - depends on X86_32 && !M386 + depends on X86_64 || !M386 config X86_PPRO_FENCE bool "PentiumPro memory ordering errata workaround" @@ -396,7 +396,7 @@ config X86_TSC config X86_CMPXCHG64 def_bool y - depends on !M386 && !M486 + depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM # this should be set for all -march=.. options where the compiler # generates cmov. diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 78b32be55e9..0a43dc515e4 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -135,9 +135,7 @@ drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/ # suspend and hibernation support drivers-$(CONFIG_PM) += arch/x86/power/ -ifeq ($(CONFIG_X86_32),y) drivers-$(CONFIG_FB) += arch/x86/video/ -endif #### # boot loader support. Several targets are kept for legacy purposes diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index f8ed0658404..fbb47daf245 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -4,11 +4,12 @@ # create a compressed vmlinux image from the original vmlinux # -targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o +targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o piggy.o KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 KBUILD_CFLAGS += -fno-strict-aliasing -fPIC KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING +cflags-$(CONFIG_X86_32) := -march=i386 cflags-$(CONFIG_X86_64) := -mcmodel=small KBUILD_CFLAGS += $(cflags-y) KBUILD_CFLAGS += $(call cc-option,-ffreestanding) @@ -48,10 +49,13 @@ $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE $(call if_changed,bzip2) $(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE $(call if_changed,lzma) +$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE + $(call if_changed,lzo) suffix-$(CONFIG_KERNEL_GZIP) := gz suffix-$(CONFIG_KERNEL_BZIP2) := bz2 suffix-$(CONFIG_KERNEL_LZMA) := lzma +suffix-$(CONFIG_KERNEL_LZO) := lzo quiet_cmd_mkpiggy = MKPIGGY $@ cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 842b2a36174..51e240779a4 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -19,11 +19,6 @@ #define _ASM_X86_DESC_H 1 #endif -#ifdef CONFIG_X86_64 -#define _LINUX_STRING_H_ 1 -#define __LINUX_BITMAP_H 1 -#endif - #include <linux/linkage.h> #include <linux/screen_info.h> #include <linux/elf.h> @@ -131,8 +126,8 @@ static void error(char *m); static struct boot_params *real_mode; /* Pointer to real-mode data */ static int quiet; -static void *memset(void *s, int c, unsigned n); -void *memcpy(void *dest, const void *src, unsigned n); +void *memset(void *s, int c, size_t n); +void *memcpy(void *dest, const void *src, size_t n); static void __putstr(int, const char *); #define putstr(__x) __putstr(0, __x) @@ -162,6 +157,10 @@ static int lines, cols; #include "../../../../lib/decompress_unlzma.c" #endif +#ifdef CONFIG_KERNEL_LZO +#include "../../../../lib/decompress_unlzo.c" +#endif + static void scroll(void) { int i; @@ -181,11 +180,9 @@ static void __putstr(int error, const char *s) return; #endif -#ifdef CONFIG_X86_32 if (real_mode->screen_info.orig_video_mode == 0 && lines == 0 && cols == 0) return; -#endif x = real_mode->screen_info.orig_x; y = real_mode->screen_info.orig_y; @@ -219,7 +216,7 @@ static void __putstr(int error, const char *s) outb(0xff & (pos >> 1), vidport+1); } -static void *memset(void *s, int c, unsigned n) +void *memset(void *s, int c, size_t n) { int i; char *ss = s; @@ -229,7 +226,7 @@ static void *memset(void *s, int c, unsigned n) return s; } -void *memcpy(void *dest, const void *src, unsigned n) +void *memcpy(void *dest, const void *src, size_t n) { int i; const char *s = src; diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c index 8ef60f20b37..919257f526f 100644 --- a/arch/x86/boot/mkcpustr.c +++ b/arch/x86/boot/mkcpustr.c @@ -22,7 +22,7 @@ int main(void) int i, j; const char *str; - printf("static const char x86_cap_strs[] = \n"); + printf("static const char x86_cap_strs[] =\n"); for (i = 0; i < NCAPINTS; i++) { for (j = 0; j < 32; j++) { diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c index 819caa1f200..ed7aeff786b 100644 --- a/arch/x86/boot/video-vga.c +++ b/arch/x86/boot/video-vga.c @@ -42,22 +42,15 @@ static u8 vga_set_basic_mode(void) { struct biosregs ireg, oreg; u16 ax; - u8 rows; u8 mode; initregs(&ireg); + /* Query current mode */ ax = 0x0f00; intcall(0x10, &ireg, &oreg); mode = oreg.al; - set_fs(0); - rows = rdfs8(0x484); /* rows minus one */ - - if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) && - (rows == 0 || rows == 24)) - return mode; - if (mode != 3 && mode != 7) mode = 3; diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index f767164cd5d..43eda284d27 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -298,11 +298,18 @@ static void restore_screen(void) } /* Restore cursor position */ + if (saved.curx >= xs) + saved.curx = xs-1; + if (saved.cury >= ys) + saved.cury = ys-1; + initregs(&ireg); ireg.ah = 0x02; /* Set cursor position */ ireg.dh = saved.cury; ireg.dl = saved.curx; intcall(0x10, &ireg, NULL); + + store_cursor_position(); } void set_video(void) diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c index daef6cd2b45..1a8f8649c03 100644 --- a/arch/x86/crypto/fpu.c +++ b/arch/x86/crypto/fpu.c @@ -16,6 +16,7 @@ #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/slab.h> #include <asm/i387.h> struct crypto_fpu_ctx { diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S index 39b98ed2c1b..575331cb2a8 100644 --- a/arch/x86/crypto/twofish-i586-asm_32.S +++ b/arch/x86/crypto/twofish-i586-asm_32.S @@ -22,7 +22,7 @@ #include <asm/asm-offsets.h> -/* return adress at 0 */ +/* return address at 0 */ #define in_blk 12 /* input byte array address parameter*/ #define out_blk 8 /* output byte array address parameter*/ @@ -230,8 +230,8 @@ twofish_enc_blk: push %edi mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */ - add $crypto_tfm_ctx_offset, %ebp /* ctx adress */ - mov in_blk+16(%esp),%edi /* input adress in edi */ + add $crypto_tfm_ctx_offset, %ebp /* ctx address */ + mov in_blk+16(%esp),%edi /* input address in edi */ mov (%edi), %eax mov b_offset(%edi), %ebx @@ -286,8 +286,8 @@ twofish_dec_blk: mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */ - add $crypto_tfm_ctx_offset, %ebp /* ctx adress */ - mov in_blk+16(%esp),%edi /* input adress in edi */ + add $crypto_tfm_ctx_offset, %ebp /* ctx address */ + mov in_blk+16(%esp),%edi /* input address in edi */ mov (%edi), %eax mov b_offset(%edi), %ebx diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S index 35974a58661..573aa102542 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64.S @@ -221,11 +221,11 @@ twofish_enc_blk: pushq R1 - /* %rdi contains the crypto tfm adress */ - /* %rsi contains the output adress */ - /* %rdx contains the input adress */ - add $crypto_tfm_ctx_offset, %rdi /* set ctx adress */ - /* ctx adress is moved to free one non-rex register + /* %rdi contains the crypto tfm address */ + /* %rsi contains the output address */ + /* %rdx contains the input address */ + add $crypto_tfm_ctx_offset, %rdi /* set ctx address */ + /* ctx address is moved to free one non-rex register as target for the 8bit high operations */ mov %rdi, %r11 @@ -274,11 +274,11 @@ twofish_enc_blk: twofish_dec_blk: pushq R1 - /* %rdi contains the crypto tfm adress */ - /* %rsi contains the output adress */ - /* %rdx contains the input adress */ - add $crypto_tfm_ctx_offset, %rdi /* set ctx adress */ - /* ctx adress is moved to free one non-rex register + /* %rdi contains the crypto tfm address */ + /* %rsi contains the output address */ + /* %rdx contains the input address */ + add $crypto_tfm_ctx_offset, %rdi /* set ctx address */ + /* ctx address is moved to free one non-rex register as target for the 8bit high operations */ mov %rdi, %r11 diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 2a4d073d2cf..0350311906a 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -21,7 +21,6 @@ #include <linux/fcntl.h> #include <linux/ptrace.h> #include <linux/user.h> -#include <linux/slab.h> #include <linux/binfmts.h> #include <linux/personality.h> #include <linux/init.h> @@ -297,7 +296,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) * size limits imposed on them by creating programs with large * arrays in the data or bss. */ - rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; + rlim = rlimit(RLIMIT_DATA); if (rlim >= RLIM_INFINITY) rlim = ~0; if (ex.a_data + ex.a_bss > rlim) @@ -308,14 +307,15 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (retval) return retval; - regs->cs = __USER32_CS; - regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = - regs->r13 = regs->r14 = regs->r15 = 0; - /* OK, This is the point of no return */ set_personality(PER_LINUX); set_thread_flag(TIF_IA32); - clear_thread_flag(TIF_ABI_PENDING); + + setup_new_exec(bprm); + + regs->cs = __USER32_CS; + regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = + regs->r13 = regs->r14 = regs->r15 = 0; current->mm->end_code = ex.a_text + (current->mm->start_code = N_TXTADDR(ex)); @@ -326,7 +326,6 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) current->mm->free_area_cache = TASK_UNMAPPED_BASE; current->mm->cached_hole_size = 0; - current->mm->mmap = NULL; install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 53147ad85b9..e790bc1fbfa 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -563,7 +563,7 @@ ia32_sys_call_table: .quad quiet_ni_syscall /* old mpx syscall holder */ .quad sys_setpgid .quad quiet_ni_syscall /* old ulimit syscall holder */ - .quad sys32_olduname + .quad sys_olduname .quad sys_umask /* 60 */ .quad sys_chroot .quad compat_sys_ustat @@ -586,7 +586,7 @@ ia32_sys_call_table: .quad compat_sys_settimeofday .quad sys_getgroups16 /* 80 */ .quad sys_setgroups16 - .quad sys32_old_select + .quad compat_sys_old_select .quad sys_symlink .quad sys_lstat .quad sys_readlink /* 85 */ @@ -613,7 +613,7 @@ ia32_sys_call_table: .quad compat_sys_newstat .quad compat_sys_newlstat .quad compat_sys_newfstat - .quad sys32_uname + .quad sys_uname .quad stub32_iopl /* 110 */ .quad sys_vhangup .quad quiet_ni_syscall /* old "idle" system call */ @@ -626,7 +626,7 @@ ia32_sys_call_table: .quad stub32_sigreturn .quad stub32_clone /* 120 */ .quad sys_setdomainname - .quad sys_uname + .quad sys_newuname .quad sys_modify_ldt .quad compat_sys_adjtimex .quad sys32_mprotect /* 125 */ diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 422572c7792..626be156d88 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -40,6 +40,7 @@ #include <linux/ptrace.h> #include <linux/highuid.h> #include <linux/sysctl.h> +#include <linux/slab.h> #include <asm/mman.h> #include <asm/types.h> #include <asm/uaccess.h> @@ -143,7 +144,7 @@ asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename, * block for parameter passing.. */ -struct mmap_arg_struct { +struct mmap_arg_struct32 { unsigned int addr; unsigned int len; unsigned int prot; @@ -152,9 +153,9 @@ struct mmap_arg_struct { unsigned int offset; }; -asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg) +asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *arg) { - struct mmap_arg_struct a; + struct mmap_arg_struct32 a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; @@ -332,24 +333,6 @@ asmlinkage long sys32_alarm(unsigned int seconds) return alarm_setitimer(seconds); } -struct sel_arg_struct { - unsigned int n; - unsigned int inp; - unsigned int outp; - unsigned int exp; - unsigned int tvp; -}; - -asmlinkage long sys32_old_select(struct sel_arg_struct __user *arg) -{ - struct sel_arg_struct a; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), - compat_ptr(a.exp), compat_ptr(a.tvp)); -} - asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, int options) { @@ -466,58 +449,6 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd, return ret; } -asmlinkage long sys32_olduname(struct oldold_utsname __user *name) -{ - char *arch = "x86_64"; - int err; - - if (!name) - return -EFAULT; - if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) - return -EFAULT; - - down_read(&uts_sem); - - err = __copy_to_user(&name->sysname, &utsname()->sysname, - __OLD_UTS_LEN); - err |= __put_user(0, name->sysname+__OLD_UTS_LEN); - err |= __copy_to_user(&name->nodename, &utsname()->nodename, - __OLD_UTS_LEN); - err |= __put_user(0, name->nodename+__OLD_UTS_LEN); - err |= __copy_to_user(&name->release, &utsname()->release, - __OLD_UTS_LEN); - err |= __put_user(0, name->release+__OLD_UTS_LEN); - err |= __copy_to_user(&name->version, &utsname()->version, - __OLD_UTS_LEN); - err |= __put_user(0, name->version+__OLD_UTS_LEN); - - if (personality(current->personality) == PER_LINUX32) - arch = "i686"; - - err |= __copy_to_user(&name->machine, arch, strlen(arch) + 1); - - up_read(&uts_sem); - - err = err ? -EFAULT : 0; - - return err; -} - -long sys32_uname(struct old_utsname __user *name) -{ - int err; - - if (!name) - return -EFAULT; - down_read(&uts_sem); - err = copy_to_user(name, utsname(), sizeof(*name)); - up_read(&uts_sem); - if (personality(current->personality) == PER_LINUX32) - err |= copy_to_user(&name->machine, "i686", 5); - - return err ? -EFAULT : 0; -} - asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv, compat_uptr_t __user *envp, struct pt_regs *regs) { diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 9f828f87ca3..493092efaa3 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -11,6 +11,7 @@ header-y += sigcontext32.h header-y += ucontext.h header-y += processor-flags.h header-y += hw_breakpoint.h +header-y += hyperv.h unifdef-y += e820.h unifdef-y += ist.h diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index b97f786a48d..a63a68be1cc 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -6,8 +6,8 @@ .macro LOCK_PREFIX 1: lock .section .smp_locks,"a" - _ASM_ALIGN - _ASM_PTR 1b + .balign 4 + .long 1b - . .previous .endm #else diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index e29a6c9bba0..92a9033c14d 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -30,8 +30,8 @@ #ifdef CONFIG_SMP #define LOCK_PREFIX_HERE \ ".section .smp_locks,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR "671f\n" /* address */ \ + ".balign 4\n" \ + ".long 671f - .\n" /* offset */ \ ".previous\n" \ "671:" @@ -68,12 +68,17 @@ extern void alternatives_smp_module_add(struct module *mod, char *name, void *text, void *text_end); extern void alternatives_smp_module_del(struct module *mod); extern void alternatives_smp_switch(int smp); +extern int alternatives_text_reserved(void *start, void *end); #else static inline void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, void *text, void *text_end) {} static inline void alternatives_smp_module_del(struct module *mod) {} static inline void alternatives_smp_switch(int smp) {} +static inline int alternatives_text_reserved(void *start, void *end) +{ + return 0; +} #endif /* CONFIG_SMP */ /* alternative assembly primitive: */ @@ -163,10 +168,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, * invalid instruction possible) or if the instructions are changed from a * consistent state to another consistent state atomically. * More care must be taken when modifying code in the SMP case because of - * Intel's errata. + * Intel's errata. text_poke_smp() takes care that errata, but still + * doesn't support NMI/MCE handler code modifying. * On the local CPU you need to be protected again NMI or MCE handlers seeing an * inconsistent instruction while you patch. */ extern void *text_poke(void *addr, const void *opcode, size_t len); +extern void *text_poke_smp(void *addr, const void *opcode, size_t len); #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h index 4d817f9e6e7..d2544f1d705 100644 --- a/arch/x86/include/asm/amd_iommu_proto.h +++ b/arch/x86/include/asm/amd_iommu_proto.h @@ -31,6 +31,7 @@ extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); extern int amd_iommu_init_devices(void); extern void amd_iommu_uninit_devices(void); extern void amd_iommu_init_notifier(void); +extern void amd_iommu_init_api(void); #ifndef CONFIG_AMD_IOMMU_STATS static inline void amd_iommu_stats_init(void) { } diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index ba19ad4c47d..86a0ff0aeac 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -21,6 +21,7 @@ #define _ASM_X86_AMD_IOMMU_TYPES_H #include <linux/types.h> +#include <linux/mutex.h> #include <linux/list.h> #include <linux/spinlock.h> @@ -140,6 +141,7 @@ /* constants to configure the command buffer */ #define CMD_BUFFER_SIZE 8192 +#define CMD_BUFFER_UNINITIALIZED 1 #define CMD_BUFFER_ENTRIES 512 #define MMIO_CMD_SIZE_SHIFT 56 #define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT) @@ -237,6 +239,7 @@ struct protection_domain { struct list_head list; /* for list of all protection domains */ struct list_head dev_list; /* List of all devices in this domain */ spinlock_t lock; /* mostly used to lock the page table*/ + struct mutex api_lock; /* protect page tables in the iommu-api path */ u16 id; /* the domain id written to the device table */ int mode; /* paging mode (0-6 levels) */ u64 *pt_root; /* page table root pointer */ diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h new file mode 100644 index 00000000000..c74a2eebe57 --- /dev/null +++ b/arch/x86/include/asm/apb_timer.h @@ -0,0 +1,70 @@ +/* + * apb_timer.h: Driver for Langwell APB timer based on Synopsis DesignWare + * + * (C) Copyright 2009 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Note: + */ + +#ifndef ASM_X86_APBT_H +#define ASM_X86_APBT_H +#include <linux/sfi.h> + +#ifdef CONFIG_APB_TIMER + +/* Langwell DW APB timer registers */ +#define APBTMR_N_LOAD_COUNT 0x00 +#define APBTMR_N_CURRENT_VALUE 0x04 +#define APBTMR_N_CONTROL 0x08 +#define APBTMR_N_EOI 0x0c +#define APBTMR_N_INT_STATUS 0x10 + +#define APBTMRS_INT_STATUS 0xa0 +#define APBTMRS_EOI 0xa4 +#define APBTMRS_RAW_INT_STATUS 0xa8 +#define APBTMRS_COMP_VERSION 0xac +#define APBTMRS_REG_SIZE 0x14 + +/* register bits */ +#define APBTMR_CONTROL_ENABLE (1<<0) +#define APBTMR_CONTROL_MODE_PERIODIC (1<<1) /*1: periodic 0:free running */ +#define APBTMR_CONTROL_INT (1<<2) + +/* default memory mapped register base */ +#define LNW_SCU_ADDR 0xFF100000 +#define LNW_EXT_TIMER_OFFSET 0x1B800 +#define APBT_DEFAULT_BASE (LNW_SCU_ADDR+LNW_EXT_TIMER_OFFSET) +#define LNW_EXT_TIMER_PGOFFSET 0x800 + +/* APBT clock speed range from PCLK to fabric base, 25-100MHz */ +#define APBT_MAX_FREQ 50 +#define APBT_MIN_FREQ 1 +#define APBT_MMAP_SIZE 1024 + +#define APBT_DEV_USED 1 + +extern void apbt_time_init(void); +extern struct clock_event_device *global_clock_event; +extern unsigned long apbt_quick_calibrate(void); +extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); +extern void apbt_setup_secondary_clock(void); +extern unsigned int boot_cpu_id; +extern int disable_apbt_percpu; + +extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint); +extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr); +extern int sfi_mtimer_num; + +#else /* CONFIG_APB_TIMER */ + +static inline unsigned long apbt_quick_calibrate(void) {return 0; } +static inline void apbt_time_init(void) {return 0; } + +#endif +#endif /* ASM_X86_APBT_H */ diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 9a9c7bdc923..306160e58b4 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -8,7 +8,8 @@ #include <linux/sched.h> #include <asm/user32.h> -#define COMPAT_USER_HZ 100 +#define COMPAT_USER_HZ 100 +#define COMPAT_UTS_MACHINE "i686\0\0" typedef u32 compat_size_t; typedef s32 compat_ssize_t; diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h deleted file mode 100644 index d96c1ee3a95..00000000000 --- a/arch/x86/include/asm/cpu_debug.h +++ /dev/null @@ -1,127 +0,0 @@ -#ifndef _ASM_X86_CPU_DEBUG_H -#define _ASM_X86_CPU_DEBUG_H - -/* - * CPU x86 architecture debug - * - * Copyright(C) 2009 Jaswinder Singh Rajput - */ - -/* Register flags */ -enum cpu_debug_bit { -/* Model Specific Registers (MSRs) */ - CPU_MC_BIT, /* Machine Check */ - CPU_MONITOR_BIT, /* Monitor */ - CPU_TIME_BIT, /* Time */ - CPU_PMC_BIT, /* Performance Monitor */ - CPU_PLATFORM_BIT, /* Platform */ - CPU_APIC_BIT, /* APIC */ - CPU_POWERON_BIT, /* Power-on */ - CPU_CONTROL_BIT, /* Control */ - CPU_FEATURES_BIT, /* Features control */ - CPU_LBRANCH_BIT, /* Last Branch */ - CPU_BIOS_BIT, /* BIOS */ - CPU_FREQ_BIT, /* Frequency */ - CPU_MTTR_BIT, /* MTRR */ - CPU_PERF_BIT, /* Performance */ - CPU_CACHE_BIT, /* Cache */ - CPU_SYSENTER_BIT, /* Sysenter */ - CPU_THERM_BIT, /* Thermal */ - CPU_MISC_BIT, /* Miscellaneous */ - CPU_DEBUG_BIT, /* Debug */ - CPU_PAT_BIT, /* PAT */ - CPU_VMX_BIT, /* VMX */ - CPU_CALL_BIT, /* System Call */ - CPU_BASE_BIT, /* BASE Address */ - CPU_VER_BIT, /* Version ID */ - CPU_CONF_BIT, /* Configuration */ - CPU_SMM_BIT, /* System mgmt mode */ - CPU_SVM_BIT, /*Secure Virtual Machine*/ - CPU_OSVM_BIT, /* OS-Visible Workaround*/ -/* Standard Registers */ - CPU_TSS_BIT, /* Task Stack Segment */ - CPU_CR_BIT, /* Control Registers */ - CPU_DT_BIT, /* Descriptor Table */ -/* End of Registers flags */ - CPU_REG_ALL_BIT, /* Select all Registers */ -}; - -#define CPU_REG_ALL (~0) /* Select all Registers */ - -#define CPU_MC (1 << CPU_MC_BIT) -#define CPU_MONITOR (1 << CPU_MONITOR_BIT) -#define CPU_TIME (1 << CPU_TIME_BIT) -#define CPU_PMC (1 << CPU_PMC_BIT) -#define CPU_PLATFORM (1 << CPU_PLATFORM_BIT) -#define CPU_APIC (1 << CPU_APIC_BIT) -#define CPU_POWERON (1 << CPU_POWERON_BIT) -#define CPU_CONTROL (1 << CPU_CONTROL_BIT) -#define CPU_FEATURES (1 << CPU_FEATURES_BIT) -#define CPU_LBRANCH (1 << CPU_LBRANCH_BIT) -#define CPU_BIOS (1 << CPU_BIOS_BIT) -#define CPU_FREQ (1 << CPU_FREQ_BIT) -#define CPU_MTRR (1 << CPU_MTTR_BIT) -#define CPU_PERF (1 << CPU_PERF_BIT) -#define CPU_CACHE (1 << CPU_CACHE_BIT) -#define CPU_SYSENTER (1 << CPU_SYSENTER_BIT) -#define CPU_THERM (1 << CPU_THERM_BIT) -#define CPU_MISC (1 << CPU_MISC_BIT) -#define CPU_DEBUG (1 << CPU_DEBUG_BIT) -#define CPU_PAT (1 << CPU_PAT_BIT) -#define CPU_VMX (1 << CPU_VMX_BIT) -#define CPU_CALL (1 << CPU_CALL_BIT) -#define CPU_BASE (1 << CPU_BASE_BIT) -#define CPU_VER (1 << CPU_VER_BIT) -#define CPU_CONF (1 << CPU_CONF_BIT) -#define CPU_SMM (1 << CPU_SMM_BIT) -#define CPU_SVM (1 << CPU_SVM_BIT) -#define CPU_OSVM (1 << CPU_OSVM_BIT) -#define CPU_TSS (1 << CPU_TSS_BIT) -#define CPU_CR (1 << CPU_CR_BIT) -#define CPU_DT (1 << CPU_DT_BIT) - -/* Register file flags */ -enum cpu_file_bit { - CPU_INDEX_BIT, /* index */ - CPU_VALUE_BIT, /* value */ -}; - -#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT) - -#define MAX_CPU_FILES 512 - -struct cpu_private { - unsigned cpu; - unsigned type; - unsigned reg; - unsigned file; -}; - -struct cpu_debug_base { - char *name; /* Register name */ - unsigned flag; /* Register flag */ - unsigned write; /* Register write flag */ -}; - -/* - * Currently it looks similar to cpu_debug_base but once we add more files - * cpu_file_base will go in different direction - */ -struct cpu_file_base { - char *name; /* Register file name */ - unsigned flag; /* Register file flag */ - unsigned write; /* Register write flag */ -}; - -struct cpu_cpuX_base { - struct dentry *dentry; /* Register dentry */ - int init; /* Register index file */ -}; - -struct cpu_debug_range { - unsigned min; /* Register range min */ - unsigned max; /* Register range max */ - unsigned flag; /* Supported flags */ -}; - -#endif /* _ASM_X86_CPU_DEBUG_H */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 637e1ec963c..0cd82d06861 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -168,6 +168,10 @@ #define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */ #define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */ #define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */ +#define X86_FEATURE_NPT (8*32+5) /* AMD Nested Page Table support */ +#define X86_FEATURE_LBRV (8*32+6) /* AMD LBR Virtualization support */ +#define X86_FEATURE_SVML (8*32+7) /* "svm_lock" AMD SVM locking MSR */ +#define X86_FEATURE_NRIPS (8*32+8) /* "nrip_save" AMD SVM next_rip save */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 8240f76b531..b81002f2361 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -14,6 +14,9 @@ which debugging register was responsible for the trap. The other bits are either reserved or not of interest to us. */ +/* Define reserved bits in DR6 which are always set to 1 */ +#define DR6_RESERVED (0xFFFF0FF0) + #define DR_TRAP0 (0x1) /* db0 */ #define DR_TRAP1 (0x2) /* db1 */ #define DR_TRAP2 (0x4) /* db2 */ diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 761249e396f..0e22296790d 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -111,11 +111,8 @@ extern unsigned long end_user_pfn; extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); -extern void reserve_early(u64 start, u64 end, char *name); -extern void reserve_early_overlap_ok(u64 start, u64 end, char *name); -extern void free_early(u64 start, u64 end); -extern void early_res_to_bootmem(u64 start, u64 end); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); +#include <linux/early_res.h> extern unsigned long e820_end_of_ram_pfn(void); extern unsigned long e820_end_of_low_ram_pfn(void); diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index b4501ee223a..f2ad2163109 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -170,10 +170,7 @@ static inline void elf_common_init(struct thread_struct *t, } #define ELF_PLAT_INIT(_r, load_addr) \ -do { \ - elf_common_init(¤t->thread, _r, 0); \ - clear_thread_flag(TIF_IA32); \ -} while (0) + elf_common_init(¤t->thread, _r, 0) #define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ elf_common_init(¤t->thread, regs, __USER_DS) @@ -181,14 +178,8 @@ do { \ void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp); #define compat_start_thread start_thread_ia32 -#define COMPAT_SET_PERSONALITY(ex) \ -do { \ - if (test_thread_flag(TIF_IA32)) \ - clear_thread_flag(TIF_ABI_PENDING); \ - else \ - set_thread_flag(TIF_ABI_PENDING); \ - current->personality |= force_personality32; \ -} while (0) +void set_personality_ia32(void); +#define COMPAT_SET_PERSONALITY(ex) set_personality_ia32() #define COMPAT_ELF_PLATFORM ("i686") diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h index 53018464aea..2519d0679d9 100644 --- a/arch/x86/include/asm/fb.h +++ b/arch/x86/include/asm/fb.h @@ -12,10 +12,6 @@ static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma, pgprot_val(vma->vm_page_prot) |= _PAGE_PCD; } -#ifdef CONFIG_X86_32 extern int fb_is_primary_device(struct fb_info *info); -#else -static inline int fb_is_primary_device(struct fb_info *info) { return 0; } -#endif #endif /* _ASM_X86_FB_H */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 635f03bb499..d07b44f7d1d 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -82,6 +82,9 @@ enum fixed_addresses { #endif FIX_DBGP_BASE, FIX_EARLYCON_MEM_BASE, +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + FIX_OHCI1394_BASE, +#endif #ifdef CONFIG_X86_LOCAL_APIC FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ #endif @@ -132,9 +135,6 @@ enum fixed_addresses { (__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1)) : __end_of_permanent_fixed_addresses, FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1, -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT - FIX_OHCI1394_BASE, -#endif #ifdef CONFIG_X86_32 FIX_WP_TEST, #endif diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 0f8576427cf..aeab29aee61 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -35,7 +35,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); #define __ARCH_IRQ_STAT -#define inc_irq_stat(member) percpu_add(irq_stat.member, 1) +#define inc_irq_stat(member) percpu_inc(irq_stat.member) #define local_softirq_pending() percpu_read(irq_stat.__softirq_pending) diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 014c2b85ae4..a726650fc80 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -66,10 +66,6 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); struct page *kmap_atomic_to_page(void *ptr); -#ifndef CONFIG_PARAVIRT -#define kmap_atomic_pte(page, type) kmap_atomic(page, type) -#endif - #define flush_cache_kmaps() do { } while (0) extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 5d89fd2a369..1d5c08a1bdf 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -67,6 +67,7 @@ extern unsigned long hpet_address; extern unsigned long force_hpet_address; extern u8 hpet_blockid; extern int hpet_force_user; +extern u8 hpet_msi_disable; extern int is_hpet_enabled(void); extern int hpet_enable(void); extern void hpet_disable(void); diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index 0675a7c4c20..2a1bd8f4f23 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -10,7 +10,6 @@ * (display/resolving) */ struct arch_hw_breakpoint { - char *name; /* Contains name of the symbol to set bkpt */ unsigned long address; u8 len; u8 type; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index eeac829a0f4..46c0fe05f23 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -53,13 +53,6 @@ extern void threshold_interrupt(void); extern void call_function_interrupt(void); extern void call_function_single_interrupt(void); -/* PIC specific functions */ -extern void disable_8259A_irq(unsigned int irq); -extern void enable_8259A_irq(unsigned int irq); -extern int i8259A_irq_pending(unsigned int irq); -extern void make_8259A_irq(unsigned int irq); -extern void init_8259A(int aeoi); - /* IOAPIC */ #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs)) extern unsigned long io_apic_irqs; @@ -140,6 +133,7 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); +extern void setup_vector_irq(int cpu); #ifdef CONFIG_X86_IO_APIC extern void lock_vector_lock(void); diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/asm/hyperv.h new file mode 100644 index 00000000000..e153a2b3889 --- /dev/null +++ b/arch/x86/include/asm/hyperv.h @@ -0,0 +1,186 @@ +#ifndef _ASM_X86_KVM_HYPERV_H +#define _ASM_X86_KVM_HYPERV_H + +#include <linux/types.h> + +/* + * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent + * is set by CPUID(HvCpuIdFunctionVersionAndFeatures). + */ +#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000 +#define HYPERV_CPUID_INTERFACE 0x40000001 +#define HYPERV_CPUID_VERSION 0x40000002 +#define HYPERV_CPUID_FEATURES 0x40000003 +#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 +#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 + +/* + * Feature identification. EAX indicates which features are available + * to the partition based upon the current partition privileges. + */ + +/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */ +#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0) +/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ +#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) +/* + * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM + * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available + */ +#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2) +/* + * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through + * HV_X64_MSR_STIMER3_COUNT) available + */ +#define HV_X64_MSR_SYNTIMER_AVAILABLE (1 << 3) +/* + * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR) + * are available + */ +#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4) +/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/ +#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5) +/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/ +#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6) +/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/ +#define HV_X64_MSR_RESET_AVAILABLE (1 << 7) + /* + * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE, + * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE, + * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available + */ +#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8) + +/* + * Feature identification: EBX indicates which flags were specified at + * partition creation. The format is the same as the partition creation + * flag structure defined in section Partition Creation Flags. + */ +#define HV_X64_CREATE_PARTITIONS (1 << 0) +#define HV_X64_ACCESS_PARTITION_ID (1 << 1) +#define HV_X64_ACCESS_MEMORY_POOL (1 << 2) +#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3) +#define HV_X64_POST_MESSAGES (1 << 4) +#define HV_X64_SIGNAL_EVENTS (1 << 5) +#define HV_X64_CREATE_PORT (1 << 6) +#define HV_X64_CONNECT_PORT (1 << 7) +#define HV_X64_ACCESS_STATS (1 << 8) +#define HV_X64_DEBUGGING (1 << 11) +#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12) +#define HV_X64_CONFIGURE_PROFILER (1 << 13) + +/* + * Feature identification. EDX indicates which miscellaneous features + * are available to the partition. + */ +/* The MWAIT instruction is available (per section MONITOR / MWAIT) */ +#define HV_X64_MWAIT_AVAILABLE (1 << 0) +/* Guest debugging support is available */ +#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1) +/* Performance Monitor support is available*/ +#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2) +/* Support for physical CPU dynamic partitioning events is available*/ +#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3) +/* + * Support for passing hypercall input parameter block via XMM + * registers is available + */ +#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4) +/* Support for a virtual guest idle state is available */ +#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5) + +/* + * Implementation recommendations. Indicates which behaviors the hypervisor + * recommends the OS implement for optimal performance. + */ + /* + * Recommend using hypercall for address space switches rather + * than MOV to CR3 instruction + */ +#define HV_X64_MWAIT_RECOMMENDED (1 << 0) +/* Recommend using hypercall for local TLB flushes rather + * than INVLPG or MOV to CR3 instructions */ +#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1) +/* + * Recommend using hypercall for remote TLB flushes rather + * than inter-processor interrupts + */ +#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2) +/* + * Recommend using MSRs for accessing APIC registers + * EOI, ICR and TPR rather than their memory-mapped counterparts + */ +#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3) +/* Recommend using the hypervisor-provided MSR to initiate a system RESET */ +#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4) +/* + * Recommend using relaxed timing for this partition. If used, + * the VM should disable any watchdog timeouts that rely on the + * timely delivery of external interrupts + */ +#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5) + +/* MSR used to identify the guest OS. */ +#define HV_X64_MSR_GUEST_OS_ID 0x40000000 + +/* MSR used to setup pages used to communicate with the hypervisor. */ +#define HV_X64_MSR_HYPERCALL 0x40000001 + +/* MSR used to provide vcpu index */ +#define HV_X64_MSR_VP_INDEX 0x40000002 + +/* Define the virtual APIC registers */ +#define HV_X64_MSR_EOI 0x40000070 +#define HV_X64_MSR_ICR 0x40000071 +#define HV_X64_MSR_TPR 0x40000072 +#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073 + +/* Define synthetic interrupt controller model specific registers. */ +#define HV_X64_MSR_SCONTROL 0x40000080 +#define HV_X64_MSR_SVERSION 0x40000081 +#define HV_X64_MSR_SIEFP 0x40000082 +#define HV_X64_MSR_SIMP 0x40000083 +#define HV_X64_MSR_EOM 0x40000084 +#define HV_X64_MSR_SINT0 0x40000090 +#define HV_X64_MSR_SINT1 0x40000091 +#define HV_X64_MSR_SINT2 0x40000092 +#define HV_X64_MSR_SINT3 0x40000093 +#define HV_X64_MSR_SINT4 0x40000094 +#define HV_X64_MSR_SINT5 0x40000095 +#define HV_X64_MSR_SINT6 0x40000096 +#define HV_X64_MSR_SINT7 0x40000097 +#define HV_X64_MSR_SINT8 0x40000098 +#define HV_X64_MSR_SINT9 0x40000099 +#define HV_X64_MSR_SINT10 0x4000009A +#define HV_X64_MSR_SINT11 0x4000009B +#define HV_X64_MSR_SINT12 0x4000009C +#define HV_X64_MSR_SINT13 0x4000009D +#define HV_X64_MSR_SINT14 0x4000009E +#define HV_X64_MSR_SINT15 0x4000009F + + +#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 +#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) + +/* Declare the various hypercall operations. */ +#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT 0x0008 + +#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001 +#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) + +#define HV_PROCESSOR_POWER_STATE_C0 0 +#define HV_PROCESSOR_POWER_STATE_C1 1 +#define HV_PROCESSOR_POWER_STATE_C2 2 +#define HV_PROCESSOR_POWER_STATE_C3 3 + +/* hypercall status code */ +#define HV_STATUS_SUCCESS 0 +#define HV_STATUS_INVALID_HYPERCALL_CODE 2 +#define HV_STATUS_INVALID_HYPERCALL_INPUT 3 +#define HV_STATUS_INVALID_ALIGNMENT 4 + +#endif diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index ebfb8a9e11f..da293092450 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -33,8 +33,16 @@ extern void init_thread_xstate(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); extern user_regset_active_fn fpregs_active, xfpregs_active; -extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get; -extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set; +extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, + xstateregs_get; +extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, + xstateregs_set; + +/* + * xstateregs_active == fpregs_active. Please refer to the comment + * at the definition of fpregs_active. + */ +#define xstateregs_active fpregs_active extern struct _fpx_sw_bytes fx_sw_reserved; #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 58d7091eeb1..1655147646a 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -24,12 +24,7 @@ extern unsigned int cached_irq_mask; #define SLAVE_ICW4_DEFAULT 0x01 #define PIC_ICW4_AEOI 2 -extern spinlock_t i8259A_lock; - -extern void init_8259A(int auto_eoi); -extern void enable_8259A_irq(unsigned int irq); -extern void disable_8259A_irq(unsigned int irq); -extern unsigned int startup_8259A_irq(unsigned int irq); +extern raw_spinlock_t i8259A_lock; /* the PIC may need a careful delay on some platforms, hence specific calls */ static inline unsigned char inb_pic(unsigned int port) @@ -57,7 +52,17 @@ static inline void outb_pic(unsigned char value, unsigned int port) extern struct irq_chip i8259A_chip; -extern void mask_8259A(void); -extern void unmask_8259A(void); +struct legacy_pic { + int nr_legacy_irqs; + struct irq_chip *chip; + void (*mask_all)(void); + void (*restore_mask)(void); + void (*init)(int auto_eoi); + int (*irq_pending)(unsigned int irq); + void (*make_irq)(unsigned int irq); +}; + +extern struct legacy_pic *legacy_pic; +extern struct legacy_pic null_legacy_pic; #endif /* _ASM_X86_I8259_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 73739322b6d..a1dcfa3ab17 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -1,8 +1,42 @@ #ifndef _ASM_X86_IO_H #define _ASM_X86_IO_H +/* + * This file contains the definitions for the x86 IO instructions + * inb/inw/inl/outb/outw/outl and the "string versions" of the same + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing" + * versions of the single-IO instructions (inb_p/inw_p/..). + * + * This file is not meant to be obfuscating: it's just complicated + * to (a) handle it all in a way that makes gcc able to optimize it + * as well as possible and (b) trying to avoid writing the same thing + * over and over again with slight variations and possibly making a + * mistake somewhere. + */ + +/* + * Thanks to James van Artsdalen for a better timing-fix than + * the two short jumps: using outb's to a nonexistent port seems + * to guarantee better timings even on fast machines. + * + * On the other hand, I'd like to be sure of a non-existent port: + * I feel a bit unsafe about using 0x80 (should be safe, though) + * + * Linus + */ + + /* + * Bit simplified and optimized by Jan Hubicka + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999. + * + * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added, + * isa_read[wl] and isa_write[wl] fixed + * - Arnaldo Carvalho de Melo <acme@conectiva.com.br> + */ + #define ARCH_HAS_IOREMAP_WC +#include <linux/string.h> #include <linux/compiler.h> #include <asm-generic/int-ll64.h> #include <asm/page.h> @@ -173,11 +207,126 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) extern void iounmap(volatile void __iomem *addr); -#ifdef CONFIG_X86_32 -# include "io_32.h" +#ifdef __KERNEL__ + +#include <asm-generic/iomap.h> + +#include <linux/vmalloc.h> + +/* + * Convert a virtual cached pointer to an uncached pointer + */ +#define xlate_dev_kmem_ptr(p) p + +static inline void +memset_io(volatile void __iomem *addr, unsigned char val, size_t count) +{ + memset((void __force *)addr, val, count); +} + +static inline void +memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count) +{ + memcpy(dst, (const void __force *)src, count); +} + +static inline void +memcpy_toio(volatile void __iomem *dst, const void *src, size_t count) +{ + memcpy((void __force *)dst, src, count); +} + +/* + * ISA space is 'always mapped' on a typical x86 system, no need to + * explicitly ioremap() it. The fact that the ISA IO space is mapped + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values + * are physical addresses. The following constant pointer can be + * used as the IO-area pointer (it can be iounmapped as well, so the + * analogy with PCI is quite large): + */ +#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET)) + +/* + * Cache management + * + * This needed for two cases + * 1. Out of order aware processors + * 2. Accidentally out of order processors (PPro errata #51) + */ + +static inline void flush_write_buffers(void) +{ +#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) + asm volatile("lock; addl $0,0(%%esp)": : :"memory"); +#endif +} + +#endif /* __KERNEL__ */ + +extern void native_io_delay(void); + +extern int io_delay_type; +extern void io_delay_init(void); + +#if defined(CONFIG_PARAVIRT) +#include <asm/paravirt.h> #else -# include "io_64.h" + +static inline void slow_down_io(void) +{ + native_io_delay(); +#ifdef REALLY_SLOW_IO + native_io_delay(); + native_io_delay(); + native_io_delay(); #endif +} + +#endif + +#define BUILDIO(bwl, bw, type) \ +static inline void out##bwl(unsigned type value, int port) \ +{ \ + asm volatile("out" #bwl " %" #bw "0, %w1" \ + : : "a"(value), "Nd"(port)); \ +} \ + \ +static inline unsigned type in##bwl(int port) \ +{ \ + unsigned type value; \ + asm volatile("in" #bwl " %w1, %" #bw "0" \ + : "=a"(value) : "Nd"(port)); \ + return value; \ +} \ + \ +static inline void out##bwl##_p(unsigned type value, int port) \ +{ \ + out##bwl(value, port); \ + slow_down_io(); \ +} \ + \ +static inline unsigned type in##bwl##_p(int port) \ +{ \ + unsigned type value = in##bwl(port); \ + slow_down_io(); \ + return value; \ +} \ + \ +static inline void outs##bwl(int port, const void *addr, unsigned long count) \ +{ \ + asm volatile("rep; outs" #bwl \ + : "+S"(addr), "+c"(count) : "d"(port)); \ +} \ + \ +static inline void ins##bwl(int port, void *addr, unsigned long count) \ +{ \ + asm volatile("rep; ins" #bwl \ + : "+D"(addr), "+c"(count) : "d"(port)); \ +} + +BUILDIO(b, b, char) +BUILDIO(w, w, short) +BUILDIO(l, , int) extern void *xlate_dev_mem_ptr(unsigned long phys); extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr); diff --git a/arch/x86/include/asm/io_32.h b/arch/x86/include/asm/io_32.h deleted file mode 100644 index a299900f592..00000000000 --- a/arch/x86/include/asm/io_32.h +++ /dev/null @@ -1,196 +0,0 @@ -#ifndef _ASM_X86_IO_32_H -#define _ASM_X86_IO_32_H - -#include <linux/string.h> -#include <linux/compiler.h> - -/* - * This file contains the definitions for the x86 IO instructions - * inb/inw/inl/outb/outw/outl and the "string versions" of the same - * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing" - * versions of the single-IO instructions (inb_p/inw_p/..). - * - * This file is not meant to be obfuscating: it's just complicated - * to (a) handle it all in a way that makes gcc able to optimize it - * as well as possible and (b) trying to avoid writing the same thing - * over and over again with slight variations and possibly making a - * mistake somewhere. - */ - -/* - * Thanks to James van Artsdalen for a better timing-fix than - * the two short jumps: using outb's to a nonexistent port seems - * to guarantee better timings even on fast machines. - * - * On the other hand, I'd like to be sure of a non-existent port: - * I feel a bit unsafe about using 0x80 (should be safe, though) - * - * Linus - */ - - /* - * Bit simplified and optimized by Jan Hubicka - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999. - * - * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added, - * isa_read[wl] and isa_write[wl] fixed - * - Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ - -#define XQUAD_PORTIO_BASE 0xfe400000 -#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ - -#ifdef __KERNEL__ - -#include <asm-generic/iomap.h> - -#include <linux/vmalloc.h> - -/* - * Convert a virtual cached pointer to an uncached pointer - */ -#define xlate_dev_kmem_ptr(p) p - -static inline void -memset_io(volatile void __iomem *addr, unsigned char val, int count) -{ - memset((void __force *)addr, val, count); -} - -static inline void -memcpy_fromio(void *dst, const volatile void __iomem *src, int count) -{ - __memcpy(dst, (const void __force *)src, count); -} - -static inline void -memcpy_toio(volatile void __iomem *dst, const void *src, int count) -{ - __memcpy((void __force *)dst, src, count); -} - -/* - * ISA space is 'always mapped' on a typical x86 system, no need to - * explicitly ioremap() it. The fact that the ISA IO space is mapped - * to PAGE_OFFSET is pure coincidence - it does not mean ISA values - * are physical addresses. The following constant pointer can be - * used as the IO-area pointer (it can be iounmapped as well, so the - * analogy with PCI is quite large): - */ -#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET)) - -/* - * Cache management - * - * This needed for two cases - * 1. Out of order aware processors - * 2. Accidentally out of order processors (PPro errata #51) - */ - -#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) - -static inline void flush_write_buffers(void) -{ - asm volatile("lock; addl $0,0(%%esp)": : :"memory"); -} - -#else - -#define flush_write_buffers() do { } while (0) - -#endif - -#endif /* __KERNEL__ */ - -extern void native_io_delay(void); - -extern int io_delay_type; -extern void io_delay_init(void); - -#if defined(CONFIG_PARAVIRT) -#include <asm/paravirt.h> -#else - -static inline void slow_down_io(void) -{ - native_io_delay(); -#ifdef REALLY_SLOW_IO - native_io_delay(); - native_io_delay(); - native_io_delay(); -#endif -} - -#endif - -#define __BUILDIO(bwl, bw, type) \ -static inline void out##bwl(unsigned type value, int port) \ -{ \ - out##bwl##_local(value, port); \ -} \ - \ -static inline unsigned type in##bwl(int port) \ -{ \ - return in##bwl##_local(port); \ -} - -#define BUILDIO(bwl, bw, type) \ -static inline void out##bwl##_local(unsigned type value, int port) \ -{ \ - asm volatile("out" #bwl " %" #bw "0, %w1" \ - : : "a"(value), "Nd"(port)); \ -} \ - \ -static inline unsigned type in##bwl##_local(int port) \ -{ \ - unsigned type value; \ - asm volatile("in" #bwl " %w1, %" #bw "0" \ - : "=a"(value) : "Nd"(port)); \ - return value; \ -} \ - \ -static inline void out##bwl##_local_p(unsigned type value, int port) \ -{ \ - out##bwl##_local(value, port); \ - slow_down_io(); \ -} \ - \ -static inline unsigned type in##bwl##_local_p(int port) \ -{ \ - unsigned type value = in##bwl##_local(port); \ - slow_down_io(); \ - return value; \ -} \ - \ -__BUILDIO(bwl, bw, type) \ - \ -static inline void out##bwl##_p(unsigned type value, int port) \ -{ \ - out##bwl(value, port); \ - slow_down_io(); \ -} \ - \ -static inline unsigned type in##bwl##_p(int port) \ -{ \ - unsigned type value = in##bwl(port); \ - slow_down_io(); \ - return value; \ -} \ - \ -static inline void outs##bwl(int port, const void *addr, unsigned long count) \ -{ \ - asm volatile("rep; outs" #bwl \ - : "+S"(addr), "+c"(count) : "d"(port)); \ -} \ - \ -static inline void ins##bwl(int port, void *addr, unsigned long count) \ -{ \ - asm volatile("rep; ins" #bwl \ - : "+D"(addr), "+c"(count) : "d"(port)); \ -} - -BUILDIO(b, b, char) -BUILDIO(w, w, short) -BUILDIO(l, , int) - -#endif /* _ASM_X86_IO_32_H */ diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h deleted file mode 100644 index 244067893af..00000000000 --- a/arch/x86/include/asm/io_64.h +++ /dev/null @@ -1,181 +0,0 @@ -#ifndef _ASM_X86_IO_64_H -#define _ASM_X86_IO_64_H - - -/* - * This file contains the definitions for the x86 IO instructions - * inb/inw/inl/outb/outw/outl and the "string versions" of the same - * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing" - * versions of the single-IO instructions (inb_p/inw_p/..). - * - * This file is not meant to be obfuscating: it's just complicated - * to (a) handle it all in a way that makes gcc able to optimize it - * as well as possible and (b) trying to avoid writing the same thing - * over and over again with slight variations and possibly making a - * mistake somewhere. - */ - -/* - * Thanks to James van Artsdalen for a better timing-fix than - * the two short jumps: using outb's to a nonexistent port seems - * to guarantee better timings even on fast machines. - * - * On the other hand, I'd like to be sure of a non-existent port: - * I feel a bit unsafe about using 0x80 (should be safe, though) - * - * Linus - */ - - /* - * Bit simplified and optimized by Jan Hubicka - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999. - * - * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added, - * isa_read[wl] and isa_write[wl] fixed - * - Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ - -extern void native_io_delay(void); - -extern int io_delay_type; -extern void io_delay_init(void); - -#if defined(CONFIG_PARAVIRT) -#include <asm/paravirt.h> -#else - -static inline void slow_down_io(void) -{ - native_io_delay(); -#ifdef REALLY_SLOW_IO - native_io_delay(); - native_io_delay(); - native_io_delay(); -#endif -} -#endif - -/* - * Talk about misusing macros.. - */ -#define __OUT1(s, x) \ -static inline void out##s(unsigned x value, unsigned short port) { - -#define __OUT2(s, s1, s2) \ -asm volatile ("out" #s " %" s1 "0,%" s2 "1" - -#ifndef REALLY_SLOW_IO -#define REALLY_SLOW_IO -#define UNSET_REALLY_SLOW_IO -#endif - -#define __OUT(s, s1, x) \ - __OUT1(s, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \ - } \ - __OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \ - slow_down_io(); \ -} - -#define __IN1(s) \ -static inline RETURN_TYPE in##s(unsigned short port) \ -{ \ - RETURN_TYPE _v; - -#define __IN2(s, s1, s2) \ - asm volatile ("in" #s " %" s2 "1,%" s1 "0" - -#define __IN(s, s1, i...) \ - __IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \ - return _v; \ - } \ - __IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \ - slow_down_io(); \ - return _v; } - -#ifdef UNSET_REALLY_SLOW_IO -#undef REALLY_SLOW_IO -#endif - -#define __INS(s) \ -static inline void ins##s(unsigned short port, void *addr, \ - unsigned long count) \ -{ \ - asm volatile ("rep ; ins" #s \ - : "=D" (addr), "=c" (count) \ - : "d" (port), "0" (addr), "1" (count)); \ -} - -#define __OUTS(s) \ -static inline void outs##s(unsigned short port, const void *addr, \ - unsigned long count) \ -{ \ - asm volatile ("rep ; outs" #s \ - : "=S" (addr), "=c" (count) \ - : "d" (port), "0" (addr), "1" (count)); \ -} - -#define RETURN_TYPE unsigned char -__IN(b, "") -#undef RETURN_TYPE -#define RETURN_TYPE unsigned short -__IN(w, "") -#undef RETURN_TYPE -#define RETURN_TYPE unsigned int -__IN(l, "") -#undef RETURN_TYPE - -__OUT(b, "b", char) -__OUT(w, "w", short) -__OUT(l, , int) - -__INS(b) -__INS(w) -__INS(l) - -__OUTS(b) -__OUTS(w) -__OUTS(l) - -#if defined(__KERNEL__) && defined(__x86_64__) - -#include <linux/vmalloc.h> - -#include <asm-generic/iomap.h> - -void __memcpy_fromio(void *, unsigned long, unsigned); -void __memcpy_toio(unsigned long, const void *, unsigned); - -static inline void memcpy_fromio(void *to, const volatile void __iomem *from, - unsigned len) -{ - __memcpy_fromio(to, (unsigned long)from, len); -} - -static inline void memcpy_toio(volatile void __iomem *to, const void *from, - unsigned len) -{ - __memcpy_toio((unsigned long)to, from, len); -} - -void memset_io(volatile void __iomem *a, int b, size_t c); - -/* - * ISA space is 'always mapped' on a typical x86 system, no need to - * explicitly ioremap() it. The fact that the ISA IO space is mapped - * to PAGE_OFFSET is pure coincidence - it does not mean ISA values - * are physical addresses. The following constant pointer can be - * used as the IO-area pointer (it can be iounmapped as well, so the - * analogy with PCI is quite large): - */ -#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET)) - -#define flush_write_buffers() - -/* - * Convert a virtual cached pointer to an uncached pointer - */ -#define xlate_dev_kmem_ptr(p) p - -#endif /* __KERNEL__ */ - -#endif /* _ASM_X86_IO_64_H */ diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 7c7c16cde1f..35832a03a51 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -143,8 +143,6 @@ extern int noioapicreroute; /* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ extern int timer_through_8259; -extern void io_apic_disable_legacy(void); - /* * If we use the IO-APIC for IRQ routing, disable automatic * assignment of PCI IRQ's. @@ -160,6 +158,7 @@ extern int io_apic_get_redir_entries(int ioapic); struct io_apic_irq_attr; extern int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr); +void setup_IO_APIC_irq_extra(u32 gsi); extern int (*ioapic_renumber_irq)(int ioapic, int irq); extern void ioapic_init_mappings(void); extern void ioapic_insert_resources(void); @@ -188,6 +187,7 @@ extern struct mp_ioapic_gsi mp_gsi_routing[]; int mp_find_ioapic(int gsi); int mp_find_ioapic_pin(int ioapic, int gsi); void __init mp_register_ioapic(int id, u32 address, u32 gsi_base); +extern void __init pre_init_apic_IRQ0(void); #else /* !CONFIG_X86_IO_APIC */ @@ -197,7 +197,11 @@ static const int timer_through_8259 = 0; static inline void ioapic_init_mappings(void) { } static inline void ioapic_insert_resources(void) { } static inline void probe_nr_irqs_gsi(void) { } +static inline int mp_find_ioapic(int gsi) { return 0; } +struct io_apic_irq_attr; +static inline int io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) { return 0; } #endif #endif /* _ASM_X86_IO_APIC_H */ diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 4611f085cd4..8767d99c4f6 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -28,28 +28,33 @@ #define MCE_VECTOR 0x12 /* - * IDT vectors usable for external interrupt sources start - * at 0x20: + * IDT vectors usable for external interrupt sources start at 0x20. + * (0x80 is the syscall vector, 0x30-0x3f are for ISA) */ #define FIRST_EXTERNAL_VECTOR 0x20 - -#ifdef CONFIG_X86_32 -# define SYSCALL_VECTOR 0x80 -# define IA32_SYSCALL_VECTOR 0x80 -#else -# define IA32_SYSCALL_VECTOR 0x80 -#endif +/* + * We start allocating at 0x21 to spread out vectors evenly between + * priority levels. (0x80 is the syscall vector) + */ +#define VECTOR_OFFSET_START 1 /* - * Reserve the lowest usable priority level 0x20 - 0x2f for triggering - * cleanup after irq migration. + * Reserve the lowest usable vector (and hence lowest priority) 0x20 for + * triggering cleanup after irq migration. 0x21-0x2f will still be used + * for device interrupts. */ #define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR +#define IA32_SYSCALL_VECTOR 0x80 +#ifdef CONFIG_X86_32 +# define SYSCALL_VECTOR 0x80 +#endif + /* * Vectors 0x30-0x3f are used for ISA interrupts. + * round up to the next 16-vector boundary */ -#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10) +#define IRQ0_VECTOR ((FIRST_EXTERNAL_VECTOR + 16) & ~15) #define IRQ1_VECTOR (IRQ0_VECTOR + 1) #define IRQ2_VECTOR (IRQ0_VECTOR + 2) @@ -120,13 +125,6 @@ */ #define MCE_SELF_VECTOR 0xeb -/* - * First APIC vector available to drivers: (vectors 0x30-0xee) we - * start at 0x31(0x41) to spread out vectors evenly between priority - * levels. (0x80 is the syscall vector) - */ -#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2) - #define NR_VECTORS 256 #define FPU_IRQ 13 @@ -154,21 +152,21 @@ static inline int invalid_vm86_irq(int irq) #define NR_IRQS_LEGACY 16 -#define CPU_VECTOR_LIMIT ( 8 * NR_CPUS ) #define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS ) #ifdef CONFIG_X86_IO_APIC # ifdef CONFIG_SPARSE_IRQ +# define CPU_VECTOR_LIMIT (64 * NR_CPUS) # define NR_IRQS \ (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \ (NR_VECTORS + CPU_VECTOR_LIMIT) : \ (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) # else -# if NR_CPUS < MAX_IO_APICS -# define NR_IRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT) -# else -# define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT) -# endif +# define CPU_VECTOR_LIMIT (32 * NR_CPUS) +# define NR_IRQS \ + (CPU_VECTOR_LIMIT < IO_APIC_VECTOR_LIMIT ? \ + (NR_VECTORS + CPU_VECTOR_LIMIT) : \ + (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) # endif #else /* !CONFIG_X86_IO_APIC: */ # define NR_IRQS NR_IRQS_LEGACY diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 4fe681de1e7..4ffa345a8cc 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -32,7 +32,10 @@ struct kprobe; typedef u8 kprobe_opcode_t; #define BREAKPOINT_INSTRUCTION 0xcc -#define RELATIVEJUMP_INSTRUCTION 0xe9 +#define RELATIVEJUMP_OPCODE 0xe9 +#define RELATIVEJUMP_SIZE 5 +#define RELATIVECALL_OPCODE 0xe8 +#define RELATIVE_ADDR_SIZE 4 #define MAX_INSN_SIZE 16 #define MAX_STACK_SIZE 64 #define MIN_STACK_SIZE(ADDR) \ @@ -44,6 +47,17 @@ typedef u8 kprobe_opcode_t; #define flush_insn_slot(p) do { } while (0) +/* optinsn template addresses */ +extern kprobe_opcode_t optprobe_template_entry; +extern kprobe_opcode_t optprobe_template_val; +extern kprobe_opcode_t optprobe_template_call; +extern kprobe_opcode_t optprobe_template_end; +#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE) +#define MAX_OPTINSN_SIZE \ + (((unsigned long)&optprobe_template_end - \ + (unsigned long)&optprobe_template_entry) + \ + MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE) + extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); @@ -64,6 +78,21 @@ struct arch_specific_insn { int boostable; }; +struct arch_optimized_insn { + /* copy of the original instructions */ + kprobe_opcode_t copied_insn[RELATIVE_ADDR_SIZE]; + /* detour code buffer */ + kprobe_opcode_t *insn; + /* the size of instructions copied to detour code buffer */ + size_t size; +}; + +/* Return true (!0) if optinsn is prepared for optimization. */ +static inline int arch_prepared_optinsn(struct arch_optimized_insn *optinsn) +{ + return optinsn->size; +} + struct prev_kprobe { struct kprobe *kp; unsigned long status; diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 950df434763..f46b79f6c16 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -254,6 +254,10 @@ struct kvm_reinject_control { __u8 reserved[31]; }; +/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */ +#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001 +#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 + /* for KVM_GET/SET_VCPU_EVENTS */ struct kvm_vcpu_events { struct { diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 7c18e1230f5..7a6f54fa13b 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -54,13 +54,23 @@ struct x86_emulate_ctxt; struct x86_emulate_ops { /* * read_std: Read bytes of standard (non-emulated/special) memory. - * Used for instruction fetch, stack operations, and others. + * Used for descriptor reading. * @addr: [IN ] Linear address from which to read. * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. */ int (*read_std)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu); + unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); + + /* + * fetch: Read bytes of standard (non-emulated/special) memory. + * Used for instruction fetch. + * @addr: [IN ] Linear address from which to read. + * @val: [OUT] Value read from memory, zero-extended to 'u_long'. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*fetch)(unsigned long addr, void *val, + unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); /* * read_emulated: Read bytes from emulated/special memory area. @@ -74,7 +84,7 @@ struct x86_emulate_ops { struct kvm_vcpu *vcpu); /* - * write_emulated: Read bytes from emulated/special memory area. + * write_emulated: Write bytes to emulated/special memory area. * @addr: [IN ] Linear address to which to write. * @val: [IN ] Value to write to memory (low-order bytes used as * required). @@ -168,6 +178,7 @@ struct x86_emulate_ctxt { /* Execution mode, passed to the emulator. */ #define X86EMUL_MODE_REAL 0 /* Real mode. */ +#define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */ #define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ #define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ #define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4f865e8b854..06d9e79ca37 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -25,7 +25,7 @@ #include <asm/mtrr.h> #include <asm/msr-index.h> -#define KVM_MAX_VCPUS 16 +#define KVM_MAX_VCPUS 64 #define KVM_MEMORY_SLOTS 32 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 4 @@ -38,19 +38,6 @@ #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 0xFFFFFF0000000000ULL) -#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) -#define KVM_GUEST_CR0_MASK \ - (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) -#define KVM_VM_CR0_ALWAYS_ON \ - (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_GUEST_CR4_MASK \ - (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) -#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) -#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) - #define INVALID_PAGE (~(hpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0) @@ -256,7 +243,8 @@ struct kvm_mmu { void (*new_cr3)(struct kvm_vcpu *vcpu); int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); void (*free)(struct kvm_vcpu *vcpu); - gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); + gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, + u32 *error); void (*prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page); int (*sync_page)(struct kvm_vcpu *vcpu, @@ -282,13 +270,15 @@ struct kvm_vcpu_arch { u32 regs_dirty; unsigned long cr0; + unsigned long cr0_guest_owned_bits; unsigned long cr2; unsigned long cr3; unsigned long cr4; + unsigned long cr4_guest_owned_bits; unsigned long cr8; u32 hflags; u64 pdptrs[4]; /* pae */ - u64 shadow_efer; + u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ int32_t apic_arb_prio; @@ -374,17 +364,27 @@ struct kvm_vcpu_arch { /* used for guest single stepping over the given code position */ u16 singlestep_cs; unsigned long singlestep_rip; + /* fields used by HYPER-V emulation */ + u64 hv_vapic; }; struct kvm_mem_alias { gfn_t base_gfn; unsigned long npages; gfn_t target_gfn; +#define KVM_ALIAS_INVALID 1UL + unsigned long flags; }; -struct kvm_arch{ - int naliases; +#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION + +struct kvm_mem_aliases { struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; + int naliases; +}; + +struct kvm_arch { + struct kvm_mem_aliases *aliases; unsigned int n_free_mmu_pages; unsigned int n_requested_mmu_pages; @@ -416,6 +416,10 @@ struct kvm_arch{ s64 kvmclock_offset; struct kvm_xen_hvm_config xen_hvm_config; + + /* fields used by HYPER-V emulation */ + u64 hv_guest_os_id; + u64 hv_hypercall; }; struct kvm_vm_stat { @@ -471,6 +475,7 @@ struct kvm_x86_ops { int (*hardware_setup)(void); /* __init */ void (*hardware_unsetup)(void); /* __exit */ bool (*cpu_has_accelerated_tpr)(void); + void (*cpuid_update)(struct kvm_vcpu *vcpu); /* Create, but do not attach this VCPU */ struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); @@ -492,6 +497,7 @@ struct kvm_x86_ops { void (*set_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); + void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -501,12 +507,13 @@ struct kvm_x86_ops { void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr); - void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value, - int *exception); + int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest); + int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + void (*fpu_activate)(struct kvm_vcpu *vcpu); + void (*fpu_deactivate)(struct kvm_vcpu *vcpu); void (*tlb_flush)(struct kvm_vcpu *vcpu); @@ -531,7 +538,8 @@ struct kvm_x86_ops { int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); - bool (*gb_page_enable)(void); + int (*get_lpage_level)(void); + bool (*rdtscp_supported)(void); const struct trace_print_flags *exit_reasons_str; }; @@ -606,8 +614,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, - int type_bits, int seg); +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason); @@ -653,6 +660,10 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); @@ -666,6 +677,7 @@ void kvm_disable_tdp(void); int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); int complete_pio(struct kvm_vcpu *vcpu); +bool kvm_check_iopl(struct kvm_vcpu *vcpu); struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index c584076a47f..ffae1420e7d 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -2,6 +2,7 @@ #define _ASM_X86_KVM_PARA_H #include <linux/types.h> +#include <asm/hyperv.h> /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It * should be used to determine that a VM is running under KVM. diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index ba0eed8aa1a..b60f2924c41 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -28,22 +28,39 @@ #ifndef __ASSEMBLY__ #include <asm/hw_irq.h> -#include <asm/kvm_para.h> /*G:030 * But first, how does our Guest contact the Host to ask for privileged * operations? There are two ways: the direct way is to make a "hypercall", * to make requests of the Host Itself. * - * We use the KVM hypercall mechanism, though completely different hypercall - * numbers. Seventeen hypercalls are available: the hypercall number is put in - * the %eax register, and the arguments (when required) are placed in %ebx, - * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax. + * Our hypercall mechanism uses the highest unused trap code (traps 32 and + * above are used by real hardware interrupts). Seventeen hypercalls are + * available: the hypercall number is put in the %eax register, and the + * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. + * If a return value makes sense, it's returned in %eax. * * Grossly invalid calls result in Sudden Death at the hands of the vengeful * Host, rather than returning failure. This reflects Winston Churchill's * definition of a gentleman: "someone who is only rude intentionally". -:*/ + */ +static inline unsigned long +hcall(unsigned long call, + unsigned long arg1, unsigned long arg2, unsigned long arg3, + unsigned long arg4) +{ + /* "int" is the Intel instruction to trigger a trap. */ + asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) + /* The call in %eax (aka "a") might be overwritten */ + : "=a"(call) + /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */ + : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4) + /* "memory" means this might write somewhere in memory. + * This isn't true for all calls, but it's safe to tell + * gcc that it might happen so it doesn't get clever. */ + : "memory"); + return call; +} /* Can't use our min() macro here: needs to be a constant */ #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 47b9b6f1905..2e9972468a5 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -195,41 +195,4 @@ static inline long local_sub_return(long i, local_t *l) #define __local_add(i, l) local_add((i), (l)) #define __local_sub(i, l) local_sub((i), (l)) -/* Use these for per-cpu local_t variables: on some archs they are - * much more efficient than these naive implementations. Note they take - * a variable, not an address. - * - * X86_64: This could be done better if we moved the per cpu data directly - * after GS. - */ - -/* Need to disable preemption for the cpu local counters otherwise we could - still access a variable of a previous CPU in a non atomic way. */ -#define cpu_local_wrap_v(l) \ -({ \ - local_t res__; \ - preempt_disable(); \ - res__ = (l); \ - preempt_enable(); \ - res__; \ -}) -#define cpu_local_wrap(l) \ -({ \ - preempt_disable(); \ - (l); \ - preempt_enable(); \ -}) \ - -#define cpu_local_read(l) cpu_local_wrap_v(local_read(&__get_cpu_var((l)))) -#define cpu_local_set(l, i) cpu_local_wrap(local_set(&__get_cpu_var((l)), (i))) -#define cpu_local_inc(l) cpu_local_wrap(local_inc(&__get_cpu_var((l)))) -#define cpu_local_dec(l) cpu_local_wrap(local_dec(&__get_cpu_var((l)))) -#define cpu_local_add(i, l) cpu_local_wrap(local_add((i), &__get_cpu_var((l)))) -#define cpu_local_sub(i, l) cpu_local_wrap(local_sub((i), &__get_cpu_var((l)))) - -#define __cpu_local_inc(l) cpu_local_inc((l)) -#define __cpu_local_dec(l) cpu_local_dec((l)) -#define __cpu_local_add(i, l) cpu_local_add((i), (l)) -#define __cpu_local_sub(i, l) cpu_local_sub((i), (l)) - #endif /* _ASM_X86_LOCAL_H */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 858baa061cf..6c3fdd631ed 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -108,10 +108,11 @@ struct mce_log { #define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9) #define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0) -extern struct atomic_notifier_head x86_mce_decoder_chain; #ifdef __KERNEL__ +extern struct atomic_notifier_head x86_mce_decoder_chain; + #include <linux/percpu.h> #include <linux/init.h> #include <asm/atomic.h> diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index c24ca9a5645..ef51b501e22 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -12,8 +12,6 @@ struct device; enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; struct microcode_ops { - void (*init)(struct device *device); - void (*fini)(void); enum ucode_state (*request_microcode_user) (int cpu, const void __user *buf, size_t size); diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h index a29f48c2a32..288b96f815a 100644 --- a/arch/x86/include/asm/mmzone_64.h +++ b/arch/x86/include/asm/mmzone_64.h @@ -39,11 +39,5 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ NODE_DATA(nid)->node_spanned_pages) - -#ifdef CONFIG_NUMA_EMU -#define FAKE_NODE_MIN_SIZE (64 * 1024 * 1024) -#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) -#endif - #endif #endif /* _ASM_X86_MMZONE_64_H */ diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h new file mode 100644 index 00000000000..451d30e7f62 --- /dev/null +++ b/arch/x86/include/asm/mrst.h @@ -0,0 +1,19 @@ +/* + * mrst.h: Intel Moorestown platform specific setup code + * + * (C) Copyright 2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#ifndef _ASM_X86_MRST_H +#define _ASM_X86_MRST_H +extern int pci_mrst_init(void); +int __init sfi_parse_mrtc(struct sfi_table_header *table); + +#define SFI_MTMR_MAX_NUM 8 +#define SFI_MRTC_MAX 8 + +#endif /* _ASM_X86_MRST_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1cd58cdbc03..4604e6a54d3 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -105,6 +105,8 @@ #define MSR_AMD64_PATCH_LEVEL 0x0000008b #define MSR_AMD64_NB_CFG 0xc001001f #define MSR_AMD64_PATCH_LOADER 0xc0010020 +#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 +#define MSR_AMD64_OSVW_STATUS 0xc0010141 #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 139d4c1a33a..93da9c3f334 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -19,7 +19,6 @@ extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int check_nmi_watchdog(void); extern int nmi_watchdog_enabled; extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); -extern int avail_to_resrv_perfctr_nmi(unsigned int); extern int reserve_perfctr_nmi(unsigned int); extern void release_perfctr_nmi(unsigned int); extern int reserve_evntsel_nmi(unsigned int); diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index c4ae822e415..823e070e7c2 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -36,6 +36,11 @@ extern void __cpuinit numa_set_node(int cpu, int node); extern void __cpuinit numa_clear_node(int cpu); extern void __cpuinit numa_add_cpu(int cpu); extern void __cpuinit numa_remove_cpu(int cpu); + +#ifdef CONFIG_NUMA_EMU +#define FAKE_NODE_MIN_SIZE ((u64)64 << 20) +#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) +#endif /* CONFIG_NUMA_EMU */ #else static inline void init_cpu_to_node(void) { } static inline void numa_set_node(int cpu, int node) { } diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h index 9f0a5f5d29e..37c516545ec 100644 --- a/arch/x86/include/asm/numaq.h +++ b/arch/x86/include/asm/numaq.h @@ -30,9 +30,14 @@ extern int found_numaq; extern int get_memcfg_numaq(void); +extern int pci_numaq_init(void); extern void *xquad_portio; +#define XQUAD_PORTIO_BASE 0xfe400000 +#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ +#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port) + /* * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the */ diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index 3a57385d9fa..101229b0d8e 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h @@ -13,7 +13,6 @@ struct olpc_platform_t { #define OLPC_F_PRESENT 0x01 #define OLPC_F_DCON 0x02 -#define OLPC_F_VSA 0x04 #ifdef CONFIG_OLPC @@ -51,18 +50,6 @@ static inline int olpc_has_dcon(void) } /* - * The VSA is software from AMD that typical Geode bioses will include. - * It is used to emulate the PCI bus, VGA, etc. OLPC's Open Firmware does - * not include the VSA; instead, PCI is emulated by the kernel. - * - * The VSA is described further in arch/x86/pci/olpc.c. - */ -static inline int olpc_has_vsa(void) -{ - return (olpc_platform_info.flags & OLPC_F_VSA) ? 1 : 0; -} - -/* * The "Mass Production" version of OLPC's XO is identified as being model * C2. During the prototype phase, the following models (in chronological * order) were created: A1, B1, B2, B3, B4, C1. The A1 through B2 models @@ -87,13 +74,10 @@ static inline int olpc_has_dcon(void) return 0; } -static inline int olpc_has_vsa(void) -{ - return 0; -} - #endif +extern int pci_olpc_init(void); + /* EC related functions */ extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 642fe34b36a..a667f24c725 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -40,7 +40,6 @@ #ifndef __ASSEMBLY__ -extern int page_is_ram(unsigned long pagenr); extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index dd59a85a918..5653f43d90e 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -435,15 +435,6 @@ static inline void paravirt_release_pud(unsigned long pfn) PVOP_VCALL1(pv_mmu_ops.release_pud, pfn); } -#ifdef CONFIG_HIGHPTE -static inline void *kmap_atomic_pte(struct page *page, enum km_type type) -{ - unsigned long ret; - ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type); - return (void *)ret; -} -#endif - static inline void pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index b1e70d51e40..db9ef553234 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -304,10 +304,6 @@ struct pv_mmu_ops { #endif /* PAGETABLE_LEVELS == 4 */ #endif /* PAGETABLE_LEVELS >= 3 */ -#ifdef CONFIG_HIGHPTE - void *(*kmap_atomic_pte)(struct page *page, enum km_type type); -#endif - struct pv_lazy_ops lazy_mode; /* dom0 ops */ diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index ada8c201d51..404a880ea32 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -45,8 +45,15 @@ static inline int pci_proc_domain(struct pci_bus *bus) #ifdef CONFIG_PCI extern unsigned int pcibios_assign_all_busses(void); +extern int pci_legacy_init(void); +# ifdef CONFIG_ACPI +# define x86_default_pci_init pci_acpi_init +# else +# define x86_default_pci_init pci_legacy_init +# endif #else -#define pcibios_assign_all_busses() 0 +# define pcibios_assign_all_busses() 0 +# define x86_default_pci_init NULL #endif extern unsigned long pci_mem_start; @@ -90,40 +97,14 @@ extern void pci_iommu_alloc(void); #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) -#if defined(CONFIG_X86_64) || defined(CONFIG_DMAR) || defined(CONFIG_DMA_API_DEBUG) - -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ - dma_addr_t ADDR_NAME; -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \ - __u32 LEN_NAME; -#define pci_unmap_addr(PTR, ADDR_NAME) \ - ((PTR)->ADDR_NAME) -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \ - (((PTR)->ADDR_NAME) = (VAL)) -#define pci_unmap_len(PTR, LEN_NAME) \ - ((PTR)->LEN_NAME) -#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \ - (((PTR)->LEN_NAME) = (VAL)) - -#else - -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0]; -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0]; -#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME) -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \ - do { break; } while (pci_unmap_addr(PTR, ADDR_NAME)) -#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME) -#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \ - do { break; } while (pci_unmap_len(PTR, LEN_NAME)) - -#endif - #endif /* __KERNEL__ */ #ifdef CONFIG_X86_64 #include "pci_64.h" #endif +void dma32_reserve_bootmem(void); + /* implement the pci_ DMA API in terms of the generic device dma_ one */ #include <asm-generic/pci-dma-compat.h> diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h index ae5e40f67da..fe15cfb21b9 100644 --- a/arch/x86/include/asm/pci_64.h +++ b/arch/x86/include/asm/pci_64.h @@ -22,8 +22,6 @@ extern int (*pci_config_read)(int seg, int bus, int dev, int fn, extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value); -extern void dma32_reserve_bootmem(void); - #endif /* __KERNEL__ */ #endif /* _ASM_X86_PCI_64_H */ diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index b4bf9a942ed..1a0422348d6 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -29,6 +29,7 @@ #define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000 #define PCI_HAS_IO_ECS 0x40000 #define PCI_NOASSIGN_ROMS 0x80000 +#define PCI_ROOT_NO_CRS 0x100000 extern unsigned int pci_probe; extern unsigned long pirq_table_addr; @@ -82,7 +83,6 @@ struct irq_routing_table { extern unsigned int pcibios_irq_mask; -extern int pcibios_scanned; extern spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); @@ -105,16 +105,15 @@ extern bool port_cf9_safe; extern int pci_direct_probe(void); extern void pci_direct_init(int type); extern void pci_pcbios_init(void); -extern int pci_olpc_init(void); extern void __init dmi_check_pciprobe(void); extern void __init dmi_check_skip_isa_align(void); /* some common used subsys_initcalls */ extern int __init pci_acpi_init(void); -extern int __init pcibios_irq_init(void); -extern int __init pci_visws_init(void); -extern int __init pci_numaq_init(void); +extern void __init pcibios_irq_init(void); extern int __init pcibios_init(void); +extern int pci_legacy_init(void); +extern void pcibios_fixup_irqs(void); /* pci-mmconfig.c */ @@ -182,3 +181,17 @@ static inline void mmio_config_writel(void __iomem *pos, u32 val) { asm volatile("movl %%eax,(%1)" : : "a" (val), "r" (pos) : "memory"); } + +#ifdef CONFIG_PCI +# ifdef CONFIG_ACPI +# define x86_default_pci_init pci_acpi_init +# else +# define x86_default_pci_init pci_legacy_init +# endif +# define x86_default_pci_init_irq pcibios_irq_init +# define x86_default_pci_fixup_irqs pcibios_fixup_irqs +#else +# define x86_default_pci_init NULL +# define x86_default_pci_init_irq NULL +# define x86_default_pci_fixup_irqs NULL +#endif diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 0c44196b78a..0ec6d12d84e 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -25,19 +25,18 @@ */ #ifdef CONFIG_SMP #define PER_CPU(var, reg) \ - __percpu_mov_op %__percpu_seg:per_cpu__this_cpu_off, reg; \ - lea per_cpu__##var(reg), reg -#define PER_CPU_VAR(var) %__percpu_seg:per_cpu__##var + __percpu_mov_op %__percpu_seg:this_cpu_off, reg; \ + lea var(reg), reg +#define PER_CPU_VAR(var) %__percpu_seg:var #else /* ! SMP */ -#define PER_CPU(var, reg) \ - __percpu_mov_op $per_cpu__##var, reg -#define PER_CPU_VAR(var) per_cpu__##var +#define PER_CPU(var, reg) __percpu_mov_op $var, reg +#define PER_CPU_VAR(var) var #endif /* SMP */ #ifdef CONFIG_X86_64_SMP #define INIT_PER_CPU_VAR(var) init_per_cpu__##var #else -#define INIT_PER_CPU_VAR(var) per_cpu__##var +#define INIT_PER_CPU_VAR(var) var #endif #else /* ...!ASSEMBLY */ @@ -60,12 +59,12 @@ * There also must be an entry in vmlinux_64.lds.S */ #define DECLARE_INIT_PER_CPU(var) \ - extern typeof(per_cpu_var(var)) init_per_cpu_var(var) + extern typeof(var) init_per_cpu_var(var) #ifdef CONFIG_X86_64_SMP #define init_per_cpu_var(var) init_per_cpu__##var #else -#define init_per_cpu_var(var) per_cpu_var(var) +#define init_per_cpu_var(var) var #endif /* For arch-specific code, we can use direct single-insn ops (they @@ -104,6 +103,64 @@ do { \ } \ } while (0) +/* + * Generate a percpu add to memory instruction and optimize code + * if a one is added or subtracted. + */ +#define percpu_add_op(var, val) \ +do { \ + typedef typeof(var) pao_T__; \ + const int pao_ID__ = (__builtin_constant_p(val) && \ + ((val) == 1 || (val) == -1)) ? (val) : 0; \ + if (0) { \ + pao_T__ pao_tmp__; \ + pao_tmp__ = (val); \ + } \ + switch (sizeof(var)) { \ + case 1: \ + if (pao_ID__ == 1) \ + asm("incb "__percpu_arg(0) : "+m" (var)); \ + else if (pao_ID__ == -1) \ + asm("decb "__percpu_arg(0) : "+m" (var)); \ + else \ + asm("addb %1, "__percpu_arg(0) \ + : "+m" (var) \ + : "qi" ((pao_T__)(val))); \ + break; \ + case 2: \ + if (pao_ID__ == 1) \ + asm("incw "__percpu_arg(0) : "+m" (var)); \ + else if (pao_ID__ == -1) \ + asm("decw "__percpu_arg(0) : "+m" (var)); \ + else \ + asm("addw %1, "__percpu_arg(0) \ + : "+m" (var) \ + : "ri" ((pao_T__)(val))); \ + break; \ + case 4: \ + if (pao_ID__ == 1) \ + asm("incl "__percpu_arg(0) : "+m" (var)); \ + else if (pao_ID__ == -1) \ + asm("decl "__percpu_arg(0) : "+m" (var)); \ + else \ + asm("addl %1, "__percpu_arg(0) \ + : "+m" (var) \ + : "ri" ((pao_T__)(val))); \ + break; \ + case 8: \ + if (pao_ID__ == 1) \ + asm("incq "__percpu_arg(0) : "+m" (var)); \ + else if (pao_ID__ == -1) \ + asm("decq "__percpu_arg(0) : "+m" (var)); \ + else \ + asm("addq %1, "__percpu_arg(0) \ + : "+m" (var) \ + : "re" ((pao_T__)(val))); \ + break; \ + default: __bad_percpu_size(); \ + } \ +} while (0) + #define percpu_from_op(op, var, constraint) \ ({ \ typeof(var) pfo_ret__; \ @@ -133,6 +190,29 @@ do { \ pfo_ret__; \ }) +#define percpu_unary_op(op, var) \ +({ \ + switch (sizeof(var)) { \ + case 1: \ + asm(op "b "__percpu_arg(0) \ + : "+m" (var)); \ + break; \ + case 2: \ + asm(op "w "__percpu_arg(0) \ + : "+m" (var)); \ + break; \ + case 4: \ + asm(op "l "__percpu_arg(0) \ + : "+m" (var)); \ + break; \ + case 8: \ + asm(op "q "__percpu_arg(0) \ + : "+m" (var)); \ + break; \ + default: __bad_percpu_size(); \ + } \ +}) + /* * percpu_read() makes gcc load the percpu variable every time it is * accessed while percpu_read_stable() allows the value to be cached. @@ -142,16 +222,15 @@ do { \ * per-thread variables implemented as per-cpu variables and thus * stable for the duration of the respective task. */ -#define percpu_read(var) percpu_from_op("mov", per_cpu__##var, \ - "m" (per_cpu__##var)) -#define percpu_read_stable(var) percpu_from_op("mov", per_cpu__##var, \ - "p" (&per_cpu__##var)) -#define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val) -#define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val) -#define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val) -#define percpu_and(var, val) percpu_to_op("and", per_cpu__##var, val) -#define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val) -#define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val) +#define percpu_read(var) percpu_from_op("mov", var, "m" (var)) +#define percpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var))) +#define percpu_write(var, val) percpu_to_op("mov", var, val) +#define percpu_add(var, val) percpu_add_op(var, val) +#define percpu_sub(var, val) percpu_add_op(var, -(val)) +#define percpu_and(var, val) percpu_to_op("and", var, val) +#define percpu_or(var, val) percpu_to_op("or", var, val) +#define percpu_xor(var, val) percpu_to_op("xor", var, val) +#define percpu_inc(var) percpu_unary_op("inc", var) #define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) #define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) @@ -160,9 +239,9 @@ do { \ #define __this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) #define __this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) #define __this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) -#define __this_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val) -#define __this_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val) -#define __this_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val) +#define __this_cpu_add_1(pcp, val) percpu_add_op((pcp), val) +#define __this_cpu_add_2(pcp, val) percpu_add_op((pcp), val) +#define __this_cpu_add_4(pcp, val) percpu_add_op((pcp), val) #define __this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) #define __this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) #define __this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) @@ -179,9 +258,9 @@ do { \ #define this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) #define this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) #define this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) -#define this_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val) -#define this_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val) -#define this_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val) +#define this_cpu_add_1(pcp, val) percpu_add_op((pcp), val) +#define this_cpu_add_2(pcp, val) percpu_add_op((pcp), val) +#define this_cpu_add_4(pcp, val) percpu_add_op((pcp), val) #define this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) #define this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) #define this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) @@ -192,9 +271,9 @@ do { \ #define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) #define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) -#define irqsafe_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val) -#define irqsafe_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val) -#define irqsafe_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val) +#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val) +#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val) +#define irqsafe_cpu_add_4(pcp, val) percpu_add_op((pcp), val) #define irqsafe_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) #define irqsafe_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) #define irqsafe_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) @@ -212,19 +291,19 @@ do { \ #ifdef CONFIG_X86_64 #define __this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) #define __this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) -#define __this_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val) +#define __this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) #define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) #define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) #define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) #define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) #define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) -#define this_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val) +#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) #define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) #define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) #define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) -#define irqsafe_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val) +#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val) #define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) #define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) #define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) @@ -236,7 +315,7 @@ do { \ ({ \ int old__; \ asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \ - : "=r" (old__), "+m" (per_cpu__##var) \ + : "=r" (old__), "+m" (var) \ : "dIr" (bit)); \ old__; \ }) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 8d9f8548a87..db6109a885a 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -18,7 +18,8 @@ #define MSR_ARCH_PERFMON_EVENTSEL0 0x186 #define MSR_ARCH_PERFMON_EVENTSEL1 0x187 -#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) +#define ARCH_PERFMON_EVENTSEL_ENABLE (1 << 22) +#define ARCH_PERFMON_EVENTSEL_ANY (1 << 21) #define ARCH_PERFMON_EVENTSEL_INT (1 << 20) #define ARCH_PERFMON_EVENTSEL_OS (1 << 17) #define ARCH_PERFMON_EVENTSEL_USR (1 << 16) @@ -26,7 +27,14 @@ /* * Includes eventsel and unit mask as well: */ -#define ARCH_PERFMON_EVENT_MASK 0xffff + + +#define INTEL_ARCH_EVTSEL_MASK 0x000000FFULL +#define INTEL_ARCH_UNIT_MASK 0x0000FF00ULL +#define INTEL_ARCH_EDGE_MASK 0x00040000ULL +#define INTEL_ARCH_INV_MASK 0x00800000ULL +#define INTEL_ARCH_CNT_MASK 0xFF000000ULL +#define INTEL_ARCH_EVENT_MASK (INTEL_ARCH_UNIT_MASK|INTEL_ARCH_EVTSEL_MASK) /* * filter mask to validate fixed counter events. @@ -37,7 +45,12 @@ * The other filters are supported by fixed counters. * The any-thread option is supported starting with v3. */ -#define ARCH_PERFMON_EVENT_FILTER_MASK 0xff840000 +#define INTEL_ARCH_FIXED_MASK \ + (INTEL_ARCH_CNT_MASK| \ + INTEL_ARCH_INV_MASK| \ + INTEL_ARCH_EDGE_MASK|\ + INTEL_ARCH_UNIT_MASK|\ + INTEL_ARCH_EVENT_MASK) #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) @@ -104,6 +117,18 @@ union cpuid10_edx { */ #define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) +/* IbsFetchCtl bits/masks */ +#define IBS_FETCH_RAND_EN (1ULL<<57) +#define IBS_FETCH_VAL (1ULL<<49) +#define IBS_FETCH_ENABLE (1ULL<<48) +#define IBS_FETCH_CNT 0xFFFF0000ULL +#define IBS_FETCH_MAX_CNT 0x0000FFFFULL + +/* IbsOpCtl bits */ +#define IBS_OP_CNT_CTL (1ULL<<19) +#define IBS_OP_VAL (1ULL<<18) +#define IBS_OP_ENABLE (1ULL<<17) +#define IBS_OP_MAX_CNT 0x0000FFFFULL #ifdef CONFIG_PERF_EVENTS extern void init_hw_perf_events(void); diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index 0e8c2a0fd92..271de94c381 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -23,6 +23,11 @@ static inline void paravirt_release_pud(unsigned long pfn) {} #endif /* + * Flags to use when allocating a user page table page. + */ +extern gfp_t __userpte_alloc_gfp; + +/* * Allocate and free page tables. */ extern pgd_t *pgd_alloc(struct mm_struct *); diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 01fd9461d32..2984a25ff38 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -19,7 +19,6 @@ #include <asm/paravirt.h> #include <linux/bitops.h> -#include <linux/slab.h> #include <linux/list.h> #include <linux/spinlock.h> @@ -54,10 +53,10 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t); in_irq() ? KM_IRQ_PTE : \ KM_PTE0) #define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) + \ + ((pte_t *)kmap_atomic(pmd_page(*(dir)), __KM_PTE) + \ pte_index((address))) #define pte_offset_map_nested(dir, address) \ - ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \ + ((pte_t *)kmap_atomic(pmd_page(*(dir)), KM_PTE1) + \ pte_index((address))) #define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE) #define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) @@ -80,7 +79,7 @@ do { \ * The i386 doesn't have any external MMU info: the kernel page * tables contain all the necessary information. */ -#define update_mmu_cache(vma, address, pte) do { } while (0) +#define update_mmu_cache(vma, address, ptep) do { } while (0) #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index c57a3011714..181be528c61 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -129,7 +129,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define pte_unmap(pte) /* NOP */ #define pte_unmap_nested(pte) /* NOP */ -#define update_mmu_cache(vma, address, pte) do { } while (0) +#define update_mmu_cache(vma, address, ptep) do { } while (0) /* Encode and de-code a swap entry */ #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index fc801bab1b3..b753ea59703 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -450,6 +450,8 @@ struct thread_struct { struct perf_event *ptrace_bps[HBP_NUM]; /* Debug status used for traps, single steps, etc... */ unsigned long debugreg6; + /* Keep track of the exact dr7 value set by the user */ + unsigned long ptrace_dr7; /* Fault info: */ unsigned long cr2; unsigned long trap_no; diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 4009f6534f5..6f414ed8862 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -23,14 +23,4 @@ extern int reboot_force; long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); -/* - * This looks more complex than it should be. But we need to - * get the type for the ~ right in round_down (it needs to be - * as wide as the result!), and we want to evaluate the macro - * arguments just once each. - */ -#define __round_mask(x,y) ((__typeof__(x))((y)-1)) -#define round_up(x,y) ((((x)-1) | __round_mask(x,y))+1) -#define round_down(x,y) ((x) & ~__round_mask(x,y)) - #endif /* _ASM_X86_PROTO_H */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 9d369f68032..69a686a7dff 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -274,18 +274,7 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, return 0; } -/* Get Nth argument at function call */ -extern unsigned long regs_get_argument_nth(struct pt_regs *regs, - unsigned int n); - -/* - * These are defined as per linux/ptrace.h, which see. - */ #define arch_has_single_step() (1) -extern void user_enable_single_step(struct task_struct *); -extern void user_disable_single_step(struct task_struct *); - -extern void user_enable_block_step(struct task_struct *); #ifdef CONFIG_X86_DEBUGCTLMSR #define arch_has_block_step() (1) #else diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 41362002476..606ede12697 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -41,6 +41,7 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/lockdep.h> +#include <asm/asm.h> struct rwsem_waiter; @@ -55,17 +56,28 @@ extern asmregparm struct rw_semaphore * /* * the semaphore definition + * + * The bias values and the counter type limits the number of + * potential readers/writers to 32767 for 32 bits and 2147483647 + * for 64 bits. */ -#define RWSEM_UNLOCKED_VALUE 0x00000000 -#define RWSEM_ACTIVE_BIAS 0x00000001 -#define RWSEM_ACTIVE_MASK 0x0000ffff -#define RWSEM_WAITING_BIAS (-0x00010000) +#ifdef CONFIG_X86_64 +# define RWSEM_ACTIVE_MASK 0xffffffffL +#else +# define RWSEM_ACTIVE_MASK 0x0000ffffL +#endif + +#define RWSEM_UNLOCKED_VALUE 0x00000000L +#define RWSEM_ACTIVE_BIAS 0x00000001L +#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) +typedef signed long rwsem_count_t; + struct rw_semaphore { - signed long count; + rwsem_count_t count; spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -105,7 +117,7 @@ do { \ static inline void __down_read(struct rw_semaphore *sem) { asm volatile("# beginning down_read\n\t" - LOCK_PREFIX " inc%z0 (%1)\n\t" + LOCK_PREFIX _ASM_INC "(%1)\n\t" /* adds 0x00000001, returns the old value */ " jns 1f\n" " call call_rwsem_down_read_failed\n" @@ -121,7 +133,7 @@ static inline void __down_read(struct rw_semaphore *sem) */ static inline int __down_read_trylock(struct rw_semaphore *sem) { - __s32 result, tmp; + rwsem_count_t result, tmp; asm volatile("# beginning __down_read_trylock\n\t" " mov %0,%1\n\t" "1:\n\t" @@ -143,7 +155,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) */ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) { - int tmp; + rwsem_count_t tmp; tmp = RWSEM_ACTIVE_WRITE_BIAS; asm volatile("# beginning down_write\n\t" @@ -170,9 +182,9 @@ static inline void __down_write(struct rw_semaphore *sem) */ static inline int __down_write_trylock(struct rw_semaphore *sem) { - signed long ret = cmpxchg(&sem->count, - RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); + rwsem_count_t ret = cmpxchg(&sem->count, + RWSEM_UNLOCKED_VALUE, + RWSEM_ACTIVE_WRITE_BIAS); if (ret == RWSEM_UNLOCKED_VALUE) return 1; return 0; @@ -183,7 +195,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) */ static inline void __up_read(struct rw_semaphore *sem) { - __s32 tmp = -RWSEM_ACTIVE_READ_BIAS; + rwsem_count_t tmp = -RWSEM_ACTIVE_READ_BIAS; asm volatile("# beginning __up_read\n\t" LOCK_PREFIX " xadd %1,(%2)\n\t" /* subtracts 1, returns the old value */ @@ -201,7 +213,7 @@ static inline void __up_read(struct rw_semaphore *sem) */ static inline void __up_write(struct rw_semaphore *sem) { - unsigned long tmp; + rwsem_count_t tmp; asm volatile("# beginning __up_write\n\t" LOCK_PREFIX " xadd %1,(%2)\n\t" /* tries to transition @@ -221,33 +233,38 @@ static inline void __up_write(struct rw_semaphore *sem) static inline void __downgrade_write(struct rw_semaphore *sem) { asm volatile("# beginning __downgrade_write\n\t" - LOCK_PREFIX " add%z0 %2,(%1)\n\t" - /* transitions 0xZZZZ0001 -> 0xYYYY0001 */ + LOCK_PREFIX _ASM_ADD "%2,(%1)\n\t" + /* + * transitions 0xZZZZ0001 -> 0xYYYY0001 (i386) + * 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 (x86_64) + */ " jns 1f\n\t" " call call_rwsem_downgrade_wake\n" "1:\n\t" "# ending __downgrade_write\n" : "+m" (sem->count) - : "a" (sem), "i" (-RWSEM_WAITING_BIAS) + : "a" (sem), "er" (-RWSEM_WAITING_BIAS) : "memory", "cc"); } /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(rwsem_count_t delta, + struct rw_semaphore *sem) { - asm volatile(LOCK_PREFIX "add%z0 %1,%0" + asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0" : "+m" (sem->count) - : "ir" (delta)); + : "er" (delta)); } /* * implement exchange and add functionality */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta, + struct rw_semaphore *sem) { - int tmp = delta; + rwsem_count_t tmp = delta; asm volatile(LOCK_PREFIX "xadd %0,%1" : "+r" (tmp), "+m" (sem->count) diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 18e496c98ff..86b1506f417 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -37,10 +37,8 @@ void setup_bios_corruption_check(void); #ifdef CONFIG_X86_VISWS extern void visws_early_detect(void); -extern int is_visws_box(void); #else static inline void visws_early_detect(void) { } -static inline int is_visws_box(void) { return 0; } #endif extern unsigned long saved_video_mode; diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 1e796782cd7..4cfc9082406 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -135,6 +135,8 @@ int native_cpu_disable(void); void native_cpu_die(unsigned int cpu); void native_play_dead(void); void play_dead_common(void); +void wbinvd_on_cpu(int cpu); +int wbinvd_on_all_cpus(void); void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); @@ -147,6 +149,13 @@ static inline int num_booting_cpus(void) { return cpumask_weight(cpu_callout_mask); } +#else /* !CONFIG_SMP */ +#define wbinvd_on_cpu(cpu) wbinvd() +static inline int wbinvd_on_all_cpus(void) +{ + wbinvd(); + return 0; +} #endif /* CONFIG_SMP */ extern unsigned disabled_cpus __cpuinitdata; diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 35e89122a42..4dab78edbad 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -3,8 +3,6 @@ extern int kstack_depth_to_print; -int x86_is_stack_id(int id, char *name); - struct thread_info; struct stacktrace_ops; diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 1fecb7e6113..38638cd2fa4 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -313,7 +313,7 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_EXIT_ERR -1 -#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */ +#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP) #define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" #define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index d5f69045c10..3ad421784ae 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -26,8 +26,8 @@ asmlinkage long sys32_lstat64(char __user *, struct stat64 __user *); asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *); asmlinkage long sys32_fstatat(unsigned int, char __user *, struct stat64 __user *, int); -struct mmap_arg_struct; -asmlinkage long sys32_mmap(struct mmap_arg_struct __user *); +struct mmap_arg_struct32; +asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *); asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long); struct sigaction32; @@ -40,8 +40,6 @@ asmlinkage long sys32_rt_sigprocmask(int, compat_sigset_t __user *, compat_sigset_t __user *, unsigned int); asmlinkage long sys32_alarm(unsigned int); -struct sel_arg_struct; -asmlinkage long sys32_old_select(struct sel_arg_struct __user *); asmlinkage long sys32_waitpid(compat_pid_t, unsigned int *, int); asmlinkage long sys32_sysfs(int, u32, u32); @@ -56,11 +54,6 @@ asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32); asmlinkage long sys32_personality(unsigned long); asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); -struct oldold_utsname; -struct old_utsname; -asmlinkage long sys32_olduname(struct oldold_utsname __user *); -long sys32_uname(struct old_utsname __user *); - asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *, compat_uptr_t __user *, struct pt_regs *); asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *); diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 8d33bc5462d..c4a348f7bd4 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -16,6 +16,8 @@ #include <linux/sched.h> #include <linux/err.h> +extern const unsigned long sys_call_table[]; + /* * Only the low 32 bits of orig_ax are meaningful, so we return int. * This importantly ignores the high bits on 64-bit, so comparisons diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 8868b9420b0..5c044b43e9a 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -50,18 +50,6 @@ asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, struct old_sigaction __user *); unsigned long sys_sigreturn(struct pt_regs *); -/* kernel/sys_i386_32.c */ -struct mmap_arg_struct; -struct sel_arg_struct; -struct oldold_utsname; -struct old_utsname; - -asmlinkage int old_mmap(struct mmap_arg_struct __user *); -asmlinkage int old_select(struct sel_arg_struct __user *); -asmlinkage int sys_ipc(uint, int, int, int, void __user *, long); -asmlinkage int sys_uname(struct old_utsname __user *); -asmlinkage int sys_olduname(struct oldold_utsname __user *); - /* kernel/vm86_32.c */ int sys_vm86old(struct vm86_struct __user *, struct pt_regs *); int sys_vm86(unsigned long, unsigned long, struct pt_regs *); @@ -73,11 +61,8 @@ int sys_vm86(unsigned long, unsigned long, struct pt_regs *); long sys_arch_prctl(int, unsigned long); /* kernel/sys_x86_64.c */ -struct new_utsname; - asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); -asmlinkage long sys_uname(struct new_utsname __user *); #endif /* CONFIG_X86_32 */ #endif /* _ASM_X86_SYSCALLS_H */ diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index ecb544e6538..b8fe48ee2ed 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -11,9 +11,9 @@ #include <linux/irqflags.h> /* entries in ARCH_DLINFO: */ -#ifdef CONFIG_IA32_EMULATION +#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64) # define AT_VECTOR_SIZE_ARCH 2 -#else +#else /* else it's non-compat x86-64 */ # define AT_VECTOR_SIZE_ARCH 1 #endif @@ -32,7 +32,7 @@ extern void show_regs_common(void); "movl %P[task_canary](%[next]), %%ebx\n\t" \ "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" #define __switch_canary_oparam \ - , [stack_canary] "=m" (per_cpu_var(stack_canary.canary)) + , [stack_canary] "=m" (stack_canary.canary) #define __switch_canary_iparam \ , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) #else /* CC_STACKPROTECTOR */ @@ -114,7 +114,7 @@ do { \ "movq %P[task_canary](%%rsi),%%r8\n\t" \ "movq %%r8,"__percpu_arg([gs_canary])"\n\t" #define __switch_canary_oparam \ - , [gs_canary] "=m" (per_cpu_var(irq_stack_union.stack_canary)) + , [gs_canary] "=m" (irq_stack_union.stack_canary) #define __switch_canary_iparam \ , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) #else /* CC_STACKPROTECTOR */ @@ -133,7 +133,7 @@ do { \ __switch_canary \ "movq %P[thread_info](%%rsi),%%r8\n\t" \ "movq %%rax,%%rdi\n\t" \ - "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ + "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ "jnz ret_from_fork\n\t" \ RESTORE_CONTEXT \ : "=a" (last) \ @@ -143,7 +143,7 @@ do { \ [ti_flags] "i" (offsetof(struct thread_info, flags)), \ [_tif_fork] "i" (_TIF_FORK), \ [thread_info] "i" (offsetof(struct task_struct, stack)), \ - [current_task] "m" (per_cpu_var(current_task)) \ + [current_task] "m" (current_task) \ __switch_canary_iparam \ : "memory", "cc" __EXTRA_CLOBBER) #endif diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 375c917c37d..e0d28901e96 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -87,7 +87,6 @@ struct thread_info { #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* 32bit process */ #define TIF_FORK 18 /* ret_from_fork */ -#define TIF_ABI_PENDING 19 #define TIF_MEMDIE 20 #define TIF_DEBUG 21 /* uses debug registers */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ @@ -112,7 +111,6 @@ struct thread_info { #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) -#define _TIF_ABI_PENDING (1 << TIF_ABI_PENDING) #define _TIF_DEBUG (1 << TIF_DEBUG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FREEZE (1 << TIF_FREEZE) diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 0c9825e97f3..088d09fb161 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -205,14 +205,13 @@ static inline unsigned long __must_check copy_from_user(void *to, unsigned long n) { int sz = __compiletime_object_size(to); - int ret = -EFAULT; if (likely(sz == -1 || sz >= n)) - ret = _copy_from_user(to, from, n); + n = _copy_from_user(to, from, n); else copy_from_user_overflow(); - return ret; + return n; } long __must_check strncpy_from_user(char *dst, const char __user *src, diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index a78c4030544..316708d5af9 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -49,16 +49,15 @@ static inline unsigned long __must_check copy_from_user(void *to, unsigned long n) { int sz = __compiletime_object_size(to); - int ret = -EFAULT; might_fault(); if (likely(sz == -1 || sz >= n)) - ret = _copy_from_user(to, from, n); + n = _copy_from_user(to, from, n); #ifdef CONFIG_DEBUG_VM else WARN(1, "Buffer overflow detected!\n"); #endif - return ret; + return n; } static __always_inline __must_check diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 3baf379fa84..beb9b5f8f8a 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -354,6 +354,7 @@ #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME +#define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE #define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL @@ -366,6 +367,9 @@ #define __ARCH_WANT_SYS_LLSEEK #define __ARCH_WANT_SYS_NICE #define __ARCH_WANT_SYS_OLD_GETRLIMIT +#define __ARCH_WANT_SYS_OLD_UNAME +#define __ARCH_WANT_SYS_OLD_MMAP +#define __ARCH_WANT_SYS_OLD_SELECT #define __ARCH_WANT_SYS_OLDUMOUNT #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_SIGPROCMASK diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 4843f7ba754..ff4307b0e81 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -146,7 +146,7 @@ __SYSCALL(__NR_wait4, sys_wait4) #define __NR_kill 62 __SYSCALL(__NR_kill, sys_kill) #define __NR_uname 63 -__SYSCALL(__NR_uname, sys_uname) +__SYSCALL(__NR_uname, sys_newuname) #define __NR_semget 64 __SYSCALL(__NR_semget, sys_semget) @@ -680,6 +680,7 @@ __SYSCALL(__NR_recvmmsg, sys_recvmmsg) #define __ARCH_WANT_SYS_LLSEEK #define __ARCH_WANT_SYS_NICE #define __ARCH_WANT_SYS_OLD_GETRLIMIT +#define __ARCH_WANT_SYS_OLD_UNAME #define __ARCH_WANT_SYS_OLDUMOUNT #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_SIGPROCMASK diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h index 999873b22e7..24532c7da3d 100644 --- a/arch/x86/include/asm/user.h +++ b/arch/x86/include/asm/user.h @@ -1,5 +1,63 @@ +#ifndef _ASM_X86_USER_H +#define _ASM_X86_USER_H + #ifdef CONFIG_X86_32 # include "user_32.h" #else # include "user_64.h" #endif + +#include <asm/types.h> + +struct user_ymmh_regs { + /* 16 * 16 bytes for each YMMH-reg */ + __u32 ymmh_space[64]; +}; + +struct user_xsave_hdr { + __u64 xstate_bv; + __u64 reserved1[2]; + __u64 reserved2[5]; +}; + +/* + * The structure layout of user_xstateregs, used for exporting the + * extended register state through ptrace and core-dump (NT_X86_XSTATE note) + * interfaces will be same as the memory layout of xsave used by the processor + * (except for the bytes 464..511, which can be used by the software) and hence + * the size of this structure varies depending on the features supported by the + * processor and OS. The size of the structure that users need to use can be + * obtained by doing: + * cpuid_count(0xd, 0, &eax, &ptrace_xstateregs_struct_size, &ecx, &edx); + * i.e., cpuid.(eax=0xd,ecx=0).ebx will be the size that user (debuggers, etc.) + * need to use. + * + * For now, only the first 8 bytes of the software usable bytes[464..471] will + * be used and will be set to OS enabled xstate mask (which is same as the + * 64bit mask returned by the xgetbv's xCR0). Users (analyzing core dump + * remotely, etc.) can use this mask as well as the mask saved in the + * xstate_hdr bytes and interpret what states the processor/OS supports + * and what states are in modified/initialized conditions for the + * particular process/thread. + * + * Also when the user modifies certain state FP/SSE/etc through the + * ptrace interface, they must ensure that the xsave_hdr.xstate_bv + * bytes[512..519] of the memory layout are updated correspondingly. + * i.e., for example when FP state is modified to a non-init state, + * xsave_hdr.xstate_bv's bit 0 must be set to '1', when SSE is modified to + * non-init state, xsave_hdr.xstate_bv's bit 1 must to be set to '1', etc. + */ +#define USER_XSTATE_FX_SW_WORDS 6 +#define USER_XSTATE_XCR0_WORD 0 + +struct user_xstateregs { + struct { + __u64 fpx_space[58]; + __u64 xstate_fx_sw[USER_XSTATE_FX_SW_WORDS]; + } i387; + struct user_xsave_hdr xsave_hdr; + struct user_ymmh_regs ymmh; + /* further processor state extensions go here */ +}; + +#endif /* _ASM_X86_USER_H */ diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 2751f3075d8..71605c7d5c5 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -18,8 +18,8 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * - * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) Russ Anderson + * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Russ Anderson <rja@sgi.com> */ #include <linux/rtc.h> @@ -36,7 +36,8 @@ enum uv_bios_cmd { UV_BIOS_WATCHLIST_ALLOC, UV_BIOS_WATCHLIST_FREE, UV_BIOS_MEMPROTECT, - UV_BIOS_GET_PARTITION_ADDR + UV_BIOS_GET_PARTITION_ADDR, + UV_BIOS_SET_LEGACY_VGA_TARGET }; /* @@ -89,13 +90,14 @@ extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64); extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64); extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64); -extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *); +extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *, long *); extern s64 uv_bios_freq_base(u64, u64 *); extern int uv_bios_mq_watchlist_alloc(unsigned long, unsigned int, unsigned long *); extern int uv_bios_mq_watchlist_free(int, int); extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *); +extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus); extern void uv_bios_init(void); @@ -104,6 +106,7 @@ extern int uv_type; extern long sn_partition_id; extern long sn_coherency_id; extern long sn_region_size; +extern long system_serial_number; #define partition_coherence_id() (sn_coherency_id) extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index c0a01b5d985..3bb9491b765 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -11,6 +11,7 @@ struct mm_struct; extern enum uv_system_type get_uv_system_type(void); extern int is_uv_system(void); extern void uv_cpu_init(void); +extern void uv_nmi_init(void); extern void uv_system_init(void); extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 811bfabc80b..14cc74ba5d2 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -31,20 +31,20 @@ * contiguous (although various IO spaces may punch holes in * it).. * - * N - Number of bits in the node portion of a socket physical - * address. + * N - Number of bits in the node portion of a socket physical + * address. * - * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of - * routers always have low bit of 1, C/MBricks have low bit - * equal to 0. Most addressing macros that target UV hub chips - * right shift the NASID by 1 to exclude the always-zero bit. - * NASIDs contain up to 15 bits. + * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of + * routers always have low bit of 1, C/MBricks have low bit + * equal to 0. Most addressing macros that target UV hub chips + * right shift the NASID by 1 to exclude the always-zero bit. + * NASIDs contain up to 15 bits. * * GNODE - NASID right shifted by 1 bit. Most mmrs contain gnodes instead * of nasids. * - * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant - * of the nasid for socket usage. + * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant + * of the nasid for socket usage. * * * NumaLink Global Physical Address Format: @@ -71,12 +71,12 @@ * * * APICID format - * NOTE!!!!!! This is the current format of the APICID. However, code - * should assume that this will change in the future. Use functions - * in this file for all APICID bit manipulations and conversion. + * NOTE!!!!!! This is the current format of the APICID. However, code + * should assume that this will change in the future. Use functions + * in this file for all APICID bit manipulations and conversion. * - * 1111110000000000 - * 5432109876543210 + * 1111110000000000 + * 5432109876543210 * pppppppppplc0cch * sssssssssss * @@ -89,9 +89,9 @@ * Note: Processor only supports 12 bits in the APICID register. The ACPI * tables hold all 16 bits. Software needs to be aware of this. * - * Unless otherwise specified, all references to APICID refer to - * the FULL value contained in ACPI tables, not the subset in the - * processor APICID register. + * Unless otherwise specified, all references to APICID refer to + * the FULL value contained in ACPI tables, not the subset in the + * processor APICID register. */ @@ -151,16 +151,16 @@ struct uv_hub_info_s { }; DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); -#define uv_hub_info (&__get_cpu_var(__uv_hub_info)) +#define uv_hub_info (&__get_cpu_var(__uv_hub_info)) #define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) /* * Local & Global MMR space macros. - * Note: macros are intended to be used ONLY by inline functions - * in this file - not by other kernel code. - * n - NASID (full 15-bit global nasid) - * g - GNODE (full 15-bit global nasid, right shifted 1) - * p - PNODE (local part of nsids, right shifted 1) + * Note: macros are intended to be used ONLY by inline functions + * in this file - not by other kernel code. + * n - NASID (full 15-bit global nasid) + * g - GNODE (full 15-bit global nasid, right shifted 1) + * p - PNODE (local part of nsids, right shifted 1) */ #define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask) #define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra) @@ -215,8 +215,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); /* * Macros for converting between kernel virtual addresses, socket local physical * addresses, and UV global physical addresses. - * Note: use the standard __pa() & __va() macros for converting - * between socket virtual and socket physical addresses. + * Note: use the standard __pa() & __va() macros for converting + * between socket virtual and socket physical addresses. */ /* socket phys RAM --> UV global physical address */ @@ -287,21 +287,18 @@ static inline int uv_apicid_to_pnode(int apicid) * Access global MMRs using the low memory MMR32 space. This region supports * faster MMR access but not all MMRs are accessible in this space. */ -static inline unsigned long *uv_global_mmr32_address(int pnode, - unsigned long offset) +static inline unsigned long *uv_global_mmr32_address(int pnode, unsigned long offset) { return __va(UV_GLOBAL_MMR32_BASE | UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset); } -static inline void uv_write_global_mmr32(int pnode, unsigned long offset, - unsigned long val) +static inline void uv_write_global_mmr32(int pnode, unsigned long offset, unsigned long val) { writeq(val, uv_global_mmr32_address(pnode, offset)); } -static inline unsigned long uv_read_global_mmr32(int pnode, - unsigned long offset) +static inline unsigned long uv_read_global_mmr32(int pnode, unsigned long offset) { return readq(uv_global_mmr32_address(pnode, offset)); } @@ -310,21 +307,18 @@ static inline unsigned long uv_read_global_mmr32(int pnode, * Access Global MMR space using the MMR space located at the top of physical * memory. */ -static inline unsigned long *uv_global_mmr64_address(int pnode, - unsigned long offset) +static inline unsigned long *uv_global_mmr64_address(int pnode, unsigned long offset) { return __va(UV_GLOBAL_MMR64_BASE | UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset); } -static inline void uv_write_global_mmr64(int pnode, unsigned long offset, - unsigned long val) +static inline void uv_write_global_mmr64(int pnode, unsigned long offset, unsigned long val) { writeq(val, uv_global_mmr64_address(pnode, offset)); } -static inline unsigned long uv_read_global_mmr64(int pnode, - unsigned long offset) +static inline unsigned long uv_read_global_mmr64(int pnode, unsigned long offset) { return readq(uv_global_mmr64_address(pnode, offset)); } @@ -335,7 +329,18 @@ static inline unsigned long uv_read_global_mmr64(int pnode, */ static inline unsigned long uv_global_gru_mmr_address(int pnode, unsigned long offset) { - return UV_GLOBAL_GRU_MMR_BASE | offset | (pnode << uv_hub_info->m_val); + return UV_GLOBAL_GRU_MMR_BASE | offset | + ((unsigned long)pnode << uv_hub_info->m_val); +} + +static inline void uv_write_global_mmr8(int pnode, unsigned long offset, unsigned char val) +{ + writeb(val, uv_global_mmr64_address(pnode, offset)); +} + +static inline unsigned char uv_read_global_mmr8(int pnode, unsigned long offset) +{ + return readb(uv_global_mmr64_address(pnode, offset)); } /* @@ -457,11 +462,17 @@ static inline void uv_set_scir_bits(unsigned char value) } } +static inline unsigned long uv_scir_offset(int apicid) +{ + return SCIR_LOCAL_MMR_BASE | (apicid & 0x3f); +} + static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) { if (uv_cpu_hub_info(cpu)->scir.state != value) { + uv_write_global_mmr8(uv_cpu_to_pnode(cpu), + uv_cpu_hub_info(cpu)->scir.offset, value); uv_cpu_hub_info(cpu)->scir.state = value; - uv_write_local_mmr8(uv_cpu_hub_info(cpu)->scir.offset, value); } } @@ -485,5 +496,17 @@ static inline void uv_hub_send_ipi(int pnode, int apicid, int vector) uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } +/* + * Get the minimum revision number of the hub chips within the partition. + * 1 - initial rev 1.0 silicon + * 2 - rev 2.0 production silicon + */ +static inline int uv_get_min_hub_revision_id(void) +{ + extern int uv_min_hub_revision_id; + + return uv_min_hub_revision_id; +} + #endif /* CONFIG_X86_64 */ #endif /* _ASM_X86_UV_UV_HUB_H */ diff --git a/arch/x86/include/asm/visws/cobalt.h b/arch/x86/include/asm/visws/cobalt.h index 166adf61e77..2edb37637ea 100644 --- a/arch/x86/include/asm/visws/cobalt.h +++ b/arch/x86/include/asm/visws/cobalt.h @@ -122,4 +122,6 @@ extern char visws_board_type; extern char visws_board_rev; +extern int pci_visws_init(void); + #endif /* _ASM_X86_VISWS_COBALT_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 2b4945419a8..fb9a080740e 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -53,6 +53,7 @@ */ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 +#define SECONDARY_EXEC_RDTSCP 0x00000008 #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 @@ -251,6 +252,7 @@ enum vmcs_field { #define EXIT_REASON_MSR_READ 31 #define EXIT_REASON_MSR_WRITE 32 #define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_MONITOR_INSTRUCTION 39 #define EXIT_REASON_PAUSE_INSTRUCTION 40 #define EXIT_REASON_MCE_DURING_VMENTRY 41 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 @@ -362,6 +364,7 @@ enum vmcs_field { #define VMX_EPTP_UC_BIT (1ull << 8) #define VMX_EPTP_WB_BIT (1ull << 14) #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) +#define VMX_EPT_1GB_PAGE_BIT (1ull << 17) #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) @@ -374,7 +377,7 @@ enum vmcs_field { #define VMX_EPT_READABLE_MASK 0x1ull #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull -#define VMX_EPT_IGMT_BIT (1ull << 6) +#define VMX_EPT_IPAT_BIT (1ull << 6) #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index ea0e8ea15e1..519b54327d7 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -99,6 +99,20 @@ struct x86_init_iommu { }; /** + * struct x86_init_pci - platform specific pci init functions + * @arch_init: platform specific pci arch init call + * @init: platform specific pci subsystem init + * @init_irq: platform specific pci irq init + * @fixup_irqs: platform specific pci irq fixup + */ +struct x86_init_pci { + int (*arch_init)(void); + int (*init)(void); + void (*init_irq)(void); + void (*fixup_irqs)(void); +}; + +/** * struct x86_init_ops - functions for platform specific setup * */ @@ -110,6 +124,7 @@ struct x86_init_ops { struct x86_init_paging paging; struct x86_init_timers timers; struct x86_init_iommu iommu; + struct x86_init_pci pci; }; /** @@ -126,6 +141,7 @@ struct x86_cpuinit_ops { * @get_wallclock: get time from HW clock like RTC etc. * @set_wallclock: set time back to HW clock * @is_untracked_pat_range exclude from PAT logic + * @nmi_init enable NMI on cpus */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); @@ -133,6 +149,7 @@ struct x86_platform_ops { int (*set_wallclock)(unsigned long nowtime); void (*iommu_shutdown)(void); bool (*is_untracked_pat_range)(u64 start, u64 end); + void (*nmi_init)(void); }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 727acc15234..ddc04ccad03 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -27,9 +27,11 @@ extern unsigned int xstate_size; extern u64 pcntxt_mask; extern struct xsave_struct *init_xstate_buf; +extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void xsave_cntxt_init(void); extern void xsave_init(void); +extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); extern int init_fpu(struct task_struct *child); extern int check_for_xstate(struct i387_fxsave_struct __user *buf, void __user *fpstate, diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d87f09bc5a5..4c58352209e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -87,6 +87,7 @@ obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o +obj-$(CONFIG_APB_TIMER) += apb_timer.o obj-$(CONFIG_K8_NB) += k8.o obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index fb1035cd9a6..cd40aba6aa9 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -31,10 +31,12 @@ #include <linux/module.h> #include <linux/dmi.h> #include <linux/irq.h> +#include <linux/slab.h> #include <linux/bootmem.h> #include <linux/ioport.h> #include <linux/pci.h> +#include <asm/pci_x86.h> #include <asm/pgtable.h> #include <asm/io_apic.h> #include <asm/apic.h> @@ -49,6 +51,7 @@ EXPORT_SYMBOL(acpi_disabled); #ifdef CONFIG_X86_64 # include <asm/proto.h> +# include <asm/numa_64.h> #endif /* X86 */ #define BAD_MADT_ENTRY(entry, end) ( \ @@ -446,6 +449,12 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) { *irq = gsi; + +#ifdef CONFIG_X86_IO_APIC + if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) + setup_IO_APIC_irq_extra(gsi); +#endif + return 0; } @@ -473,7 +482,8 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity); } #endif - acpi_gsi_to_irq(plat_gsi, &irq); + irq = plat_gsi; + return irq; } @@ -481,6 +491,26 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) * ACPI based hotplug support for CPU */ #ifdef CONFIG_ACPI_HOTPLUG_CPU +#include <acpi/processor.h> + +static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +{ +#ifdef CONFIG_ACPI_NUMA + int nid; + + nid = acpi_get_node(handle); + if (nid == -1 || !node_online(nid)) + return; +#ifdef CONFIG_X86_64 + apicid_to_node[physid] = nid; + numa_set_node(cpu, nid); +#else /* CONFIG_X86_32 */ + apicid_2_node[physid] = nid; + cpu_to_node_map[cpu] = nid; +#endif + +#endif +} static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) { @@ -539,7 +569,10 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) goto free_new_map; } + acpi_processor_set_pdc(handle); + cpu = cpumask_first(new_map); + acpi_map_cpu2node(handle, cpu, physid); *pcpu = cpu; retval = 0; @@ -1185,9 +1218,6 @@ static void __init acpi_process_madt(void) if (!error) { acpi_lapic = 1; -#ifdef CONFIG_X86_BIGSMP - generic_bigsmp_probe(); -#endif /* * Parse MADT IO-APIC entries */ @@ -1197,8 +1227,6 @@ static void __init acpi_process_madt(void) acpi_ioapic = 1; smp_found_config = 1; - if (apic->setup_apic_routing) - apic->setup_apic_routing(); } } if (error == -EINVAL) { @@ -1269,23 +1297,6 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d) } /* - * Limit ACPI to CPU enumeration for HT - */ -static int __init force_acpi_ht(const struct dmi_system_id *d) -{ - if (!acpi_force) { - printk(KERN_NOTICE "%s detected: force use of acpi=ht\n", - d->ident); - disable_acpi(); - acpi_ht = 1; - } else { - printk(KERN_NOTICE - "Warning: acpi=force overrules DMI blacklist: acpi=ht\n"); - } - return 0; -} - -/* * Force ignoring BIOS IRQ0 pin2 override */ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) @@ -1321,90 +1332,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { }, /* - * Boxes that need acpi=ht - */ - { - .callback = force_acpi_ht, - .ident = "FSC Primergy T850", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), - DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "HP VISUALIZE NT Workstation", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), - DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "Compaq Workstation W8000", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Compaq"), - DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "ASUS P2B-DS", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), - DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "ASUS CUR-DLS", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), - DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "ABIT i440BX-W83977", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"), - DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "IBM Bladecenter", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), - DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "IBM eServer xSeries 360", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), - DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "IBM eserver xSeries 330", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), - DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "IBM eserver xSeries 440", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), - DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"), - }, - }, - - /* * Boxes that need ACPI PCI IRQ routing disabled */ { @@ -1529,16 +1456,10 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = { * if acpi_blacklisted() acpi_disabled = 1; * acpi_irq_model=... * ... - * - * return value: (currently ignored) - * 0: success - * !0: failure */ -int __init acpi_boot_table_init(void) +void __init acpi_boot_table_init(void) { - int error; - dmi_check_system(acpi_dmi_table); /* @@ -1546,15 +1467,14 @@ int __init acpi_boot_table_init(void) * One exception: acpi=ht continues far enough to enumerate LAPICs */ if (acpi_disabled && !acpi_ht) - return 1; + return; /* * Initialize the ACPI boot-time table parser. */ - error = acpi_table_init(); - if (error) { + if (acpi_table_init()) { disable_acpi(); - return error; + return; } acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); @@ -1562,18 +1482,15 @@ int __init acpi_boot_table_init(void) /* * blacklist may disable ACPI entirely */ - error = acpi_blacklisted(); - if (error) { + if (acpi_blacklisted()) { if (acpi_force) { printk(KERN_WARNING PREFIX "acpi=force override\n"); } else { printk(KERN_WARNING PREFIX "Disabling ACPI support\n"); disable_acpi(); - return error; + return; } } - - return 0; } int __init early_acpi_boot_init(void) @@ -1619,6 +1536,9 @@ int __init acpi_boot_init(void) acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet); + if (!acpi_noirq) + x86_init.pci.init = pci_acpi_init; + return 0; } @@ -1643,8 +1563,10 @@ static int __init parse_acpi(char *arg) } /* Limit ACPI just to boot-time to enable HT */ else if (strcmp(arg, "ht") == 0) { - if (!acpi_force) + if (!acpi_force) { + printk(KERN_WARNING "acpi=ht will be removed in Linux-2.6.35\n"); disable_acpi(); + } acpi_ht = 1; } /* acpi=rsdt use RSDT instead of XSDT */ diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 82e508677b9..f9961034e55 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -162,6 +162,8 @@ static int __init acpi_sleep_setup(char *str) #endif if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); + if (strncmp(str, "sci_force_enable", 16) == 0) + acpi_set_sci_en_on_resume(); str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 80b222ea4cf..70237732a6c 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -7,6 +7,8 @@ #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/memory.h> +#include <linux/stop_machine.h> +#include <linux/slab.h> #include <asm/alternative.h> #include <asm/sections.h> #include <asm/pgtable.h> @@ -192,7 +194,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) } extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; -extern u8 *__smp_locks[], *__smp_locks_end[]; +extern s32 __smp_locks[], __smp_locks_end[]; static void *text_poke_early(void *addr, const void *opcode, size_t len); /* Replace instructions with better alternatives for this CPU type. @@ -233,39 +235,41 @@ void __init_or_module apply_alternatives(struct alt_instr *start, #ifdef CONFIG_SMP -static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) +static void alternatives_smp_lock(const s32 *start, const s32 *end, + u8 *text, u8 *text_end) { - u8 **ptr; + const s32 *poff; mutex_lock(&text_mutex); - for (ptr = start; ptr < end; ptr++) { - if (*ptr < text) - continue; - if (*ptr > text_end) + for (poff = start; poff < end; poff++) { + u8 *ptr = (u8 *)poff + *poff; + + if (!*poff || ptr < text || ptr >= text_end) continue; /* turn DS segment override prefix into lock prefix */ - if (**ptr == 0x3e) - text_poke(*ptr, ((unsigned char []){0xf0}), 1); + if (*ptr == 0x3e) + text_poke(ptr, ((unsigned char []){0xf0}), 1); }; mutex_unlock(&text_mutex); } -static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) +static void alternatives_smp_unlock(const s32 *start, const s32 *end, + u8 *text, u8 *text_end) { - u8 **ptr; + const s32 *poff; if (noreplace_smp) return; mutex_lock(&text_mutex); - for (ptr = start; ptr < end; ptr++) { - if (*ptr < text) - continue; - if (*ptr > text_end) + for (poff = start; poff < end; poff++) { + u8 *ptr = (u8 *)poff + *poff; + + if (!*poff || ptr < text || ptr >= text_end) continue; /* turn lock prefix into DS segment override prefix */ - if (**ptr == 0xf0) - text_poke(*ptr, ((unsigned char []){0x3E}), 1); + if (*ptr == 0xf0) + text_poke(ptr, ((unsigned char []){0x3E}), 1); }; mutex_unlock(&text_mutex); } @@ -276,8 +280,8 @@ struct smp_alt_module { char *name; /* ptrs to lock prefixes */ - u8 **locks; - u8 **locks_end; + const s32 *locks; + const s32 *locks_end; /* .text segment, needed to avoid patching init code ;) */ u8 *text; @@ -394,6 +398,27 @@ void alternatives_smp_switch(int smp) mutex_unlock(&smp_alt); } +/* Return 1 if the address range is reserved for smp-alternatives */ +int alternatives_text_reserved(void *start, void *end) +{ + struct smp_alt_module *mod; + const s32 *poff; + u8 *text_start = start; + u8 *text_end = end; + + list_for_each_entry(mod, &smp_alt_modules, next) { + if (mod->text > text_end || mod->text_end < text_start) + continue; + for (poff = mod->locks; poff < mod->locks_end; poff++) { + const u8 *ptr = (const u8 *)poff + *poff; + + if (text_start <= ptr && text_end > ptr) + return 1; + } + } + + return 0; +} #endif #ifdef CONFIG_PARAVIRT @@ -556,3 +581,62 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) local_irq_restore(flags); return addr; } + +/* + * Cross-modifying kernel text with stop_machine(). + * This code originally comes from immediate value. + */ +static atomic_t stop_machine_first; +static int wrote_text; + +struct text_poke_params { + void *addr; + const void *opcode; + size_t len; +}; + +static int __kprobes stop_machine_text_poke(void *data) +{ + struct text_poke_params *tpp = data; + + if (atomic_dec_and_test(&stop_machine_first)) { + text_poke(tpp->addr, tpp->opcode, tpp->len); + smp_wmb(); /* Make sure other cpus see that this has run */ + wrote_text = 1; + } else { + while (!wrote_text) + cpu_relax(); + smp_mb(); /* Load wrote_text before following execution */ + } + + flush_icache_range((unsigned long)tpp->addr, + (unsigned long)tpp->addr + tpp->len); + return 0; +} + +/** + * text_poke_smp - Update instructions on a live kernel on SMP + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy + * + * Modify multi-byte instruction by using stop_machine() on SMP. This allows + * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying + * should be allowed, since stop_machine() does _not_ protect code against + * NMI and MCE. + * + * Note: Must be called under get_online_cpus() and text_mutex. + */ +void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) +{ + struct text_poke_params tpp; + + tpp.addr = addr; + tpp.opcode = opcode; + tpp.len = len; + atomic_set(&stop_machine_first, 1); + wrote_text = 0; + stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); + return addr; +} + diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 23824fef789..f854d89b7ed 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -18,8 +18,8 @@ */ #include <linux/pci.h> -#include <linux/gfp.h> #include <linux/bitmap.h> +#include <linux/slab.h> #include <linux/debugfs.h> #include <linux/scatterlist.h> #include <linux/dma-mapping.h> @@ -118,7 +118,7 @@ static bool check_device(struct device *dev) return false; /* No device or no PCI device */ - if (!dev || dev->bus != &pci_bus_type) + if (dev->bus != &pci_bus_type) return false; devid = get_device_id(dev); @@ -392,6 +392,7 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) u32 tail, head; u8 *target; + WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED); tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); target = iommu->cmd_buf + tail; memcpy_toio(target, cmd, sizeof(*cmd)); @@ -980,7 +981,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom, { int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; struct amd_iommu *iommu; - int i; + unsigned long i; #ifdef CONFIG_IOMMU_STRESS populate = false; @@ -1489,11 +1490,14 @@ static void __detach_device(struct device *dev) { struct iommu_dev_data *dev_data = get_dev_data(dev); struct iommu_dev_data *alias_data; + struct protection_domain *domain; unsigned long flags; BUG_ON(!dev_data->domain); - spin_lock_irqsave(&dev_data->domain->lock, flags); + domain = dev_data->domain; + + spin_lock_irqsave(&domain->lock, flags); if (dev_data->alias != dev) { alias_data = get_dev_data(dev_data->alias); @@ -1504,13 +1508,15 @@ static void __detach_device(struct device *dev) if (atomic_dec_and_test(&dev_data->bind)) do_detach(dev); - spin_unlock_irqrestore(&dev_data->domain->lock, flags); + spin_unlock_irqrestore(&domain->lock, flags); /* * If we run in passthrough mode the device must be assigned to the - * passthrough domain if it is detached from any other domain + * passthrough domain if it is detached from any other domain. + * Make sure we can deassign from the pt_domain itself. */ - if (iommu_pass_through && dev_data->domain == NULL) + if (iommu_pass_through && + (dev_data->domain == NULL && domain != pt_domain)) __attach_device(dev, pt_domain); } @@ -2181,7 +2187,7 @@ static void prealloc_protection_domains(void) struct dma_ops_domain *dma_dom; u16 devid; - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + for_each_pci_dev(dev) { /* Do we handle this device? */ if (!check_device(&dev->dev)) @@ -2218,6 +2224,12 @@ static struct dma_map_ops amd_iommu_dma_ops = { /* * The function which clues the AMD IOMMU driver into dma_ops. */ + +void __init amd_iommu_init_api(void) +{ + register_iommu(&amd_iommu_ops); +} + int __init amd_iommu_init_dma_ops(void) { struct amd_iommu *iommu; @@ -2253,8 +2265,6 @@ int __init amd_iommu_init_dma_ops(void) /* Make the driver finally visible to the drivers */ dma_ops = &amd_iommu_dma_ops; - register_iommu(&amd_iommu_ops); - amd_iommu_stats_init(); return 0; @@ -2289,7 +2299,7 @@ static void cleanup_domain(struct protection_domain *domain) list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) { struct device *dev = dev_data->dev; - do_detach(dev); + __detach_device(dev); atomic_set(&dev_data->bind, 0); } @@ -2318,6 +2328,7 @@ static struct protection_domain *protection_domain_alloc(void) return NULL; spin_lock_init(&domain->lock); + mutex_init(&domain->api_lock); domain->id = domain_id_alloc(); if (!domain->id) goto out_err; @@ -2370,9 +2381,7 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom) free_pagetable(domain); - domain_id_free(domain->id); - - kfree(domain); + protection_domain_free(domain); dom->priv = NULL; } @@ -2447,6 +2456,8 @@ static int amd_iommu_map_range(struct iommu_domain *dom, iova &= PAGE_MASK; paddr &= PAGE_MASK; + mutex_lock(&domain->api_lock); + for (i = 0; i < npages; ++i) { ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k); if (ret) @@ -2456,6 +2467,8 @@ static int amd_iommu_map_range(struct iommu_domain *dom, paddr += PAGE_SIZE; } + mutex_unlock(&domain->api_lock); + return 0; } @@ -2468,12 +2481,16 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom, iova &= PAGE_MASK; + mutex_lock(&domain->api_lock); + for (i = 0; i < npages; ++i) { iommu_unmap_page(domain, iova, PM_MAP_4k); iova += PAGE_SIZE; } iommu_flush_tlb_pde(domain); + + mutex_unlock(&domain->api_lock); } static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 1dca9c34eae..6360abf993d 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -19,8 +19,8 @@ #include <linux/pci.h> #include <linux/acpi.h> -#include <linux/gfp.h> #include <linux/list.h> +#include <linux/slab.h> #include <linux/sysdev.h> #include <linux/interrupt.h> #include <linux/msi.h> @@ -138,6 +138,11 @@ int amd_iommus_present; bool amd_iommu_np_cache __read_mostly; /* + * The ACPI table parsing functions set this variable on an error + */ +static int __initdata amd_iommu_init_err; + +/* * List of protection domains - used during resume */ LIST_HEAD(amd_iommu_pd_list); @@ -386,9 +391,11 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table) */ for (i = 0; i < table->length; ++i) checksum += p[i]; - if (checksum != 0) + if (checksum != 0) { /* ACPI table corrupt */ - return -ENODEV; + amd_iommu_init_err = -ENODEV; + return 0; + } p += IVRS_HEADER_LENGTH; @@ -431,7 +438,7 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) if (cmd_buf == NULL) return NULL; - iommu->cmd_buf_size = CMD_BUFFER_SIZE; + iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED; return cmd_buf; } @@ -467,12 +474,13 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu) &entry, sizeof(entry)); amd_iommu_reset_cmd_buffer(iommu); + iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED); } static void __init free_command_buffer(struct amd_iommu *iommu) { free_pages((unsigned long)iommu->cmd_buf, - get_order(iommu->cmd_buf_size)); + get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED))); } /* allocates the memory where the IOMMU will log its events to */ @@ -915,11 +923,16 @@ static int __init init_iommu_all(struct acpi_table_header *table) h->mmio_phys); iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); - if (iommu == NULL) - return -ENOMEM; + if (iommu == NULL) { + amd_iommu_init_err = -ENOMEM; + return 0; + } + ret = init_iommu_one(iommu, h); - if (ret) - return ret; + if (ret) { + amd_iommu_init_err = ret; + return 0; + } break; default: break; @@ -1204,6 +1217,10 @@ static int __init amd_iommu_init(void) if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) return -ENODEV; + ret = amd_iommu_init_err; + if (ret) + goto out; + dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE); alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE); rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE); @@ -1263,9 +1280,19 @@ static int __init amd_iommu_init(void) if (acpi_table_parse("IVRS", init_iommu_all) != 0) goto free; + if (amd_iommu_init_err) { + ret = amd_iommu_init_err; + goto free; + } + if (acpi_table_parse("IVRS", init_memory_definitions) != 0) goto free; + if (amd_iommu_init_err) { + ret = amd_iommu_init_err; + goto free; + } + ret = sysdev_class_register(&amd_iommu_sysdev_class); if (ret) goto free; @@ -1278,16 +1305,19 @@ static int __init amd_iommu_init(void) if (ret) goto free; + enable_iommus(); + if (iommu_pass_through) ret = amd_iommu_init_passthrough(); else ret = amd_iommu_init_dma_ops(); + if (ret) goto free; - amd_iommu_init_notifier(); + amd_iommu_init_api(); - enable_iommus(); + amd_iommu_init_notifier(); if (iommu_pass_through) goto out; @@ -1302,6 +1332,7 @@ out: return ret; free: + disable_iommus(); amd_iommu_uninit_devices(); diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c new file mode 100644 index 00000000000..ff469e47005 --- /dev/null +++ b/arch/x86/kernel/apb_timer.c @@ -0,0 +1,785 @@ +/* + * apb_timer.c: Driver for Langwell APB timers + * + * (C) Copyright 2009 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Note: + * Langwell is the south complex of Intel Moorestown MID platform. There are + * eight external timers in total that can be used by the operating system. + * The timer information, such as frequency and addresses, is provided to the + * OS via SFI tables. + * Timer interrupts are routed via FW/HW emulated IOAPIC independently via + * individual redirection table entries (RTE). + * Unlike HPET, there is no master counter, therefore one of the timers are + * used as clocksource. The overall allocation looks like: + * - timer 0 - NR_CPUs for per cpu timer + * - one timer for clocksource + * - one timer for watchdog driver. + * It is also worth notice that APB timer does not support true one-shot mode, + * free-running mode will be used here to emulate one-shot mode. + * APB timer can also be used as broadcast timer along with per cpu local APIC + * timer, but by default APB timer has higher rating than local APIC timers. + */ + +#include <linux/clocksource.h> +#include <linux/clockchips.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/sysdev.h> +#include <linux/slab.h> +#include <linux/pm.h> +#include <linux/pci.h> +#include <linux/sfi.h> +#include <linux/interrupt.h> +#include <linux/cpu.h> +#include <linux/irq.h> + +#include <asm/fixmap.h> +#include <asm/apb_timer.h> + +#define APBT_MASK CLOCKSOURCE_MASK(32) +#define APBT_SHIFT 22 +#define APBT_CLOCKEVENT_RATING 150 +#define APBT_CLOCKSOURCE_RATING 250 +#define APBT_MIN_DELTA_USEC 200 + +#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt) +#define APBT_CLOCKEVENT0_NUM (0) +#define APBT_CLOCKEVENT1_NUM (1) +#define APBT_CLOCKSOURCE_NUM (2) + +static unsigned long apbt_address; +static int apb_timer_block_enabled; +static void __iomem *apbt_virt_address; +static int phy_cs_timer_id; + +/* + * Common DW APB timer info + */ +static uint64_t apbt_freq; + +static void apbt_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt); +static int apbt_next_event(unsigned long delta, + struct clock_event_device *evt); +static cycle_t apbt_read_clocksource(struct clocksource *cs); +static void apbt_restart_clocksource(struct clocksource *cs); + +struct apbt_dev { + struct clock_event_device evt; + unsigned int num; + int cpu; + unsigned int irq; + unsigned int tick; + unsigned int count; + unsigned int flags; + char name[10]; +}; + +int disable_apbt_percpu __cpuinitdata; + +static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); + +#ifdef CONFIG_SMP +static unsigned int apbt_num_timers_used; +static struct apbt_dev *apbt_devs; +#endif + +static inline unsigned long apbt_readl_reg(unsigned long a) +{ + return readl(apbt_virt_address + a); +} + +static inline void apbt_writel_reg(unsigned long d, unsigned long a) +{ + writel(d, apbt_virt_address + a); +} + +static inline unsigned long apbt_readl(int n, unsigned long a) +{ + return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE); +} + +static inline void apbt_writel(int n, unsigned long d, unsigned long a) +{ + writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE); +} + +static inline void apbt_set_mapping(void) +{ + struct sfi_timer_table_entry *mtmr; + + if (apbt_virt_address) { + pr_debug("APBT base already mapped\n"); + return; + } + mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); + if (mtmr == NULL) { + printk(KERN_ERR "Failed to get MTMR %d from SFI\n", + APBT_CLOCKEVENT0_NUM); + return; + } + apbt_address = (unsigned long)mtmr->phys_addr; + if (!apbt_address) { + printk(KERN_WARNING "No timer base from SFI, use default\n"); + apbt_address = APBT_DEFAULT_BASE; + } + apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE); + if (apbt_virt_address) { + pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\ + (void *)apbt_address, (void *)apbt_virt_address); + } else { + pr_debug("Failed mapping APBT phy address at %p\n",\ + (void *)apbt_address); + goto panic_noapbt; + } + apbt_freq = mtmr->freq_hz / USEC_PER_SEC; + sfi_free_mtmr(mtmr); + + /* Now figure out the physical timer id for clocksource device */ + mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM); + if (mtmr == NULL) + goto panic_noapbt; + + /* Now figure out the physical timer id */ + phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) + / APBTMRS_REG_SIZE; + pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id); + return; + +panic_noapbt: + panic("Failed to setup APB system timer\n"); + +} + +static inline void apbt_clear_mapping(void) +{ + iounmap(apbt_virt_address); + apbt_virt_address = NULL; +} + +/* + * APBT timer interrupt enable / disable + */ +static inline int is_apbt_capable(void) +{ + return apbt_virt_address ? 1 : 0; +} + +static struct clocksource clocksource_apbt = { + .name = "apbt", + .rating = APBT_CLOCKSOURCE_RATING, + .read = apbt_read_clocksource, + .mask = APBT_MASK, + .shift = APBT_SHIFT, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .resume = apbt_restart_clocksource, +}; + +/* boot APB clock event device */ +static struct clock_event_device apbt_clockevent = { + .name = "apbt0", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .set_mode = apbt_set_mode, + .set_next_event = apbt_next_event, + .shift = APBT_SHIFT, + .irq = 0, + .rating = APBT_CLOCKEVENT_RATING, +}; + +/* + * if user does not want to use per CPU apb timer, just give it a lower rating + * than local apic timer and skip the late per cpu timer init. + */ +static inline int __init setup_x86_mrst_timer(char *arg) +{ + if (!arg) + return -EINVAL; + + if (strcmp("apbt_only", arg) == 0) + disable_apbt_percpu = 0; + else if (strcmp("lapic_and_apbt", arg) == 0) + disable_apbt_percpu = 1; + else { + pr_warning("X86 MRST timer option %s not recognised" + " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", + arg); + return -EINVAL; + } + return 0; +} +__setup("x86_mrst_timer=", setup_x86_mrst_timer); + +/* + * start count down from 0xffff_ffff. this is done by toggling the enable bit + * then load initial load count to ~0. + */ +static void apbt_start_counter(int n) +{ + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); + apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT); + /* enable, mask interrupt */ + ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; + ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT); + apbt_writel(n, ctrl, APBTMR_N_CONTROL); + /* read it once to get cached counter value initialized */ + apbt_read_clocksource(&clocksource_apbt); +} + +static irqreturn_t apbt_interrupt_handler(int irq, void *data) +{ + struct apbt_dev *dev = (struct apbt_dev *)data; + struct clock_event_device *aevt = &dev->evt; + + if (!aevt->event_handler) { + printk(KERN_INFO "Spurious APBT timer interrupt on %d\n", + dev->num); + return IRQ_NONE; + } + aevt->event_handler(aevt); + return IRQ_HANDLED; +} + +static void apbt_restart_clocksource(struct clocksource *cs) +{ + apbt_start_counter(phy_cs_timer_id); +} + +/* Setup IRQ routing via IOAPIC */ +#ifdef CONFIG_SMP +static void apbt_setup_irq(struct apbt_dev *adev) +{ + struct irq_chip *chip; + struct irq_desc *desc; + + /* timer0 irq has been setup early */ + if (adev->irq == 0) + return; + desc = irq_to_desc(adev->irq); + chip = get_irq_chip(adev->irq); + disable_irq(adev->irq); + desc->status |= IRQ_MOVE_PCNTXT; + irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); + /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */ + set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge"); + enable_irq(adev->irq); + if (system_state == SYSTEM_BOOTING) + if (request_irq(adev->irq, apbt_interrupt_handler, + IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, + adev->name, adev)) { + printk(KERN_ERR "Failed request IRQ for APBT%d\n", + adev->num); + } +} +#endif + +static void apbt_enable_int(int n) +{ + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + /* clear pending intr */ + apbt_readl(n, APBTMR_N_EOI); + ctrl &= ~APBTMR_CONTROL_INT; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); +} + +static void apbt_disable_int(int n) +{ + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + + ctrl |= APBTMR_CONTROL_INT; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); +} + + +static int __init apbt_clockevent_register(void) +{ + struct sfi_timer_table_entry *mtmr; + struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev); + + mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); + if (mtmr == NULL) { + printk(KERN_ERR "Failed to get MTMR %d from SFI\n", + APBT_CLOCKEVENT0_NUM); + return -ENODEV; + } + + /* + * We need to calculate the scaled math multiplication factor for + * nanosecond to apbt tick conversion. + * mult = (nsec/cycle)*2^APBT_SHIFT + */ + apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz + , NSEC_PER_SEC, APBT_SHIFT); + + /* Calculate the min / max delta */ + apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, + &apbt_clockevent); + apbt_clockevent.min_delta_ns = clockevent_delta2ns( + APBT_MIN_DELTA_USEC*apbt_freq, + &apbt_clockevent); + /* + * Start apbt with the boot cpu mask and make it + * global if not used for per cpu timer. + */ + apbt_clockevent.cpumask = cpumask_of(smp_processor_id()); + adev->num = smp_processor_id(); + memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); + + if (disable_apbt_percpu) { + apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; + global_clock_event = &adev->evt; + printk(KERN_DEBUG "%s clockevent registered as global\n", + global_clock_event->name); + } + + if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler, + IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, + apbt_clockevent.name, adev)) { + printk(KERN_ERR "Failed request IRQ for APBT%d\n", + apbt_clockevent.irq); + } + + clockevents_register_device(&adev->evt); + /* Start APBT 0 interrupts */ + apbt_enable_int(APBT_CLOCKEVENT0_NUM); + + sfi_free_mtmr(mtmr); + return 0; +} + +#ifdef CONFIG_SMP +/* Should be called with per cpu */ +void apbt_setup_secondary_clock(void) +{ + struct apbt_dev *adev; + struct clock_event_device *aevt; + int cpu; + + /* Don't register boot CPU clockevent */ + cpu = smp_processor_id(); + if (cpu == boot_cpu_id) + return; + /* + * We need to calculate the scaled math multiplication factor for + * nanosecond to apbt tick conversion. + * mult = (nsec/cycle)*2^APBT_SHIFT + */ + printk(KERN_INFO "Init per CPU clockevent %d\n", cpu); + adev = &per_cpu(cpu_apbt_dev, cpu); + aevt = &adev->evt; + + memcpy(aevt, &apbt_clockevent, sizeof(*aevt)); + aevt->cpumask = cpumask_of(cpu); + aevt->name = adev->name; + aevt->mode = CLOCK_EVT_MODE_UNUSED; + + printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n", + cpu, aevt->name, *(u32 *)aevt->cpumask); + + apbt_setup_irq(adev); + + clockevents_register_device(aevt); + + apbt_enable_int(cpu); + + return; +} + +/* + * this notify handler process CPU hotplug events. in case of S0i3, nonboot + * cpus are disabled/enabled frequently, for performance reasons, we keep the + * per cpu timer irq registered so that we do need to do free_irq/request_irq. + * + * TODO: it might be more reliable to directly disable percpu clockevent device + * without the notifier chain. currently, cpu 0 may get interrupts from other + * cpu timers during the offline process due to the ordering of notification. + * the extra interrupt is harmless. + */ +static int apbt_cpuhp_notify(struct notifier_block *n, + unsigned long action, void *hcpu) +{ + unsigned long cpu = (unsigned long)hcpu; + struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu); + + switch (action & 0xf) { + case CPU_DEAD: + apbt_disable_int(cpu); + if (system_state == SYSTEM_RUNNING) + pr_debug("skipping APBT CPU %lu offline\n", cpu); + else if (adev) { + pr_debug("APBT clockevent for cpu %lu offline\n", cpu); + free_irq(adev->irq, adev); + } + break; + default: + pr_debug(KERN_INFO "APBT notified %lu, no action\n", action); + } + return NOTIFY_OK; +} + +static __init int apbt_late_init(void) +{ + if (disable_apbt_percpu) + return 0; + /* This notifier should be called after workqueue is ready */ + hotcpu_notifier(apbt_cpuhp_notify, -20); + return 0; +} +fs_initcall(apbt_late_init); +#else + +void apbt_setup_secondary_clock(void) {} + +#endif /* CONFIG_SMP */ + +static void apbt_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + unsigned long ctrl; + uint64_t delta; + int timer_num; + struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); + + timer_num = adev->num; + pr_debug("%s CPU %d timer %d mode=%d\n", + __func__, first_cpu(*evt->cpumask), timer_num, mode); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult; + delta >>= apbt_clockevent.shift; + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + ctrl |= APBTMR_CONTROL_MODE_PERIODIC; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + /* + * DW APB p. 46, have to disable timer before load counter, + * may cause sync problem. + */ + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + udelay(1); + pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ); + apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); + ctrl |= APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + break; + /* APB timer does not have one-shot mode, use free running mode */ + case CLOCK_EVT_MODE_ONESHOT: + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + /* + * set free running mode, this mode will let timer reload max + * timeout which will give time (3min on 25MHz clock) to rearm + * the next event, therefore emulate the one-shot mode. + */ + ctrl &= ~APBTMR_CONTROL_ENABLE; + ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; + + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + /* write again to set free running mode */ + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + + /* + * DW APB p. 46, load counter with all 1s before starting free + * running mode. + */ + apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT); + ctrl &= ~APBTMR_CONTROL_INT; + ctrl |= APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + break; + + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + apbt_disable_int(timer_num); + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + break; + + case CLOCK_EVT_MODE_RESUME: + apbt_enable_int(timer_num); + break; + } +} + +static int apbt_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + unsigned long ctrl; + int timer_num; + + struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); + + timer_num = adev->num; + /* Disable timer */ + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + /* write new count */ + apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); + ctrl |= APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + return 0; +} + +/* + * APB timer clock is not in sync with pclk on Langwell, which translates to + * unreliable read value caused by sampling error. the error does not add up + * overtime and only happens when sampling a 0 as a 1 by mistake. so the time + * would go backwards. the following code is trying to prevent time traveling + * backwards. little bit paranoid. + */ +static cycle_t apbt_read_clocksource(struct clocksource *cs) +{ + unsigned long t0, t1, t2; + static unsigned long last_read; + +bad_count: + t1 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + t2 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + if (unlikely(t1 < t2)) { + pr_debug("APBT: read current count error %lx:%lx:%lx\n", + t1, t2, t2 - t1); + goto bad_count; + } + /* + * check against cached last read, makes sure time does not go back. + * it could be a normal rollover but we will do tripple check anyway + */ + if (unlikely(t2 > last_read)) { + /* check if we have a normal rollover */ + unsigned long raw_intr_status = + apbt_readl_reg(APBTMRS_RAW_INT_STATUS); + /* + * cs timer interrupt is masked but raw intr bit is set if + * rollover occurs. then we read EOI reg to clear it. + */ + if (raw_intr_status & (1 << phy_cs_timer_id)) { + apbt_readl(phy_cs_timer_id, APBTMR_N_EOI); + goto out; + } + pr_debug("APB CS going back %lx:%lx:%lx ", + t2, last_read, t2 - last_read); +bad_count_x3: + pr_debug(KERN_INFO "tripple check enforced\n"); + t0 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + udelay(1); + t1 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + udelay(1); + t2 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + if ((t2 > t1) || (t1 > t0)) { + printk(KERN_ERR "Error: APB CS tripple check failed\n"); + goto bad_count_x3; + } + } +out: + last_read = t2; + return (cycle_t)~t2; +} + +static int apbt_clocksource_register(void) +{ + u64 start, now; + cycle_t t1; + + /* Start the counter, use timer 2 as source, timer 0/1 for event */ + apbt_start_counter(phy_cs_timer_id); + + /* Verify whether apbt counter works */ + t1 = apbt_read_clocksource(&clocksource_apbt); + rdtscll(start); + + /* + * We don't know the TSC frequency yet, but waiting for + * 200000 TSC cycles is safe: + * 4 GHz == 50us + * 1 GHz == 200us + */ + do { + rep_nop(); + rdtscll(now); + } while ((now - start) < 200000UL); + + /* APBT is the only always on clocksource, it has to work! */ + if (t1 == apbt_read_clocksource(&clocksource_apbt)) + panic("APBT counter not counting. APBT disabled\n"); + + /* + * initialize and register APBT clocksource + * convert that to ns/clock cycle + * mult = (ns/c) * 2^APBT_SHIFT + */ + clocksource_apbt.mult = div_sc(MSEC_PER_SEC, + (unsigned long) apbt_freq, APBT_SHIFT); + clocksource_register(&clocksource_apbt); + + return 0; +} + +/* + * Early setup the APBT timer, only use timer 0 for booting then switch to + * per CPU timer if possible. + * returns 1 if per cpu apbt is setup + * returns 0 if no per cpu apbt is chosen + * panic if set up failed, this is the only platform timer on Moorestown. + */ +void __init apbt_time_init(void) +{ +#ifdef CONFIG_SMP + int i; + struct sfi_timer_table_entry *p_mtmr; + unsigned int percpu_timer; + struct apbt_dev *adev; +#endif + + if (apb_timer_block_enabled) + return; + apbt_set_mapping(); + if (apbt_virt_address) { + pr_debug("Found APBT version 0x%lx\n",\ + apbt_readl_reg(APBTMRS_COMP_VERSION)); + } else + goto out_noapbt; + /* + * Read the frequency and check for a sane value, for ESL model + * we extend the possible clock range to allow time scaling. + */ + + if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) { + pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq); + goto out_noapbt; + } + if (apbt_clocksource_register()) { + pr_debug("APBT has failed to register clocksource\n"); + goto out_noapbt; + } + if (!apbt_clockevent_register()) + apb_timer_block_enabled = 1; + else { + pr_debug("APBT has failed to register clockevent\n"); + goto out_noapbt; + } +#ifdef CONFIG_SMP + /* kernel cmdline disable apb timer, so we will use lapic timers */ + if (disable_apbt_percpu) { + printk(KERN_INFO "apbt: disabled per cpu timer\n"); + return; + } + pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus()); + if (num_possible_cpus() <= sfi_mtimer_num) { + percpu_timer = 1; + apbt_num_timers_used = num_possible_cpus(); + } else { + percpu_timer = 0; + apbt_num_timers_used = 1; + adev = &per_cpu(cpu_apbt_dev, 0); + adev->flags &= ~APBT_DEV_USED; + } + pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used); + + /* here we set up per CPU timer data structure */ + apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used, + GFP_KERNEL); + if (!apbt_devs) { + printk(KERN_ERR "Failed to allocate APB timer devices\n"); + return; + } + for (i = 0; i < apbt_num_timers_used; i++) { + adev = &per_cpu(cpu_apbt_dev, i); + adev->num = i; + adev->cpu = i; + p_mtmr = sfi_get_mtmr(i); + if (p_mtmr) { + adev->tick = p_mtmr->freq_hz; + adev->irq = p_mtmr->irq; + } else + printk(KERN_ERR "Failed to get timer for cpu %d\n", i); + adev->count = 0; + sprintf(adev->name, "apbt%d", i); + } +#endif + + return; + +out_noapbt: + apbt_clear_mapping(); + apb_timer_block_enabled = 0; + panic("failed to enable APB timer\n"); +} + +static inline void apbt_disable(int n) +{ + if (is_apbt_capable()) { + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); + } +} + +/* called before apb_timer_enable, use early map */ +unsigned long apbt_quick_calibrate() +{ + int i, scale; + u64 old, new; + cycle_t t1, t2; + unsigned long khz = 0; + u32 loop, shift; + + apbt_set_mapping(); + apbt_start_counter(phy_cs_timer_id); + + /* check if the timer can count down, otherwise return */ + old = apbt_read_clocksource(&clocksource_apbt); + i = 10000; + while (--i) { + if (old != apbt_read_clocksource(&clocksource_apbt)) + break; + } + if (!i) + goto failed; + + /* count 16 ms */ + loop = (apbt_freq * 1000) << 4; + + /* restart the timer to ensure it won't get to 0 in the calibration */ + apbt_start_counter(phy_cs_timer_id); + + old = apbt_read_clocksource(&clocksource_apbt); + old += loop; + + t1 = __native_read_tsc(); + + do { + new = apbt_read_clocksource(&clocksource_apbt); + } while (new < old); + + t2 = __native_read_tsc(); + + shift = 5; + if (unlikely(loop >> shift == 0)) { + printk(KERN_INFO + "APBT TSC calibration failed, not enough resolution\n"); + return 0; + } + scale = (int)div_u64((t2 - t1), loop >> shift); + khz = (scale * apbt_freq * 1000) >> shift; + printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz); + return khz; +failed: + return 0; +} diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 3704997e8b2..b5d8b0bcf23 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -393,6 +393,7 @@ void __init gart_iommu_hole_init(void) for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { int bus; int dev_base, dev_limit; + u32 ctl; bus = bus_dev_ranges[i].bus; dev_base = bus_dev_ranges[i].dev_base; @@ -406,7 +407,19 @@ void __init gart_iommu_hole_init(void) gart_iommu_aperture = 1; x86_init.iommu.iommu_init = gart_iommu_init; - aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; + ctl = read_pci_config(bus, slot, 3, + AMD64_GARTAPERTURECTL); + + /* + * Before we do anything else disable the GART. It may + * still be enabled if we boot into a crash-kernel here. + * Reconfiguring the GART while it is enabled could have + * unknown side-effects. + */ + ctl &= ~GARTEN; + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); + + aper_order = (ctl >> 1) & 7; aper_size = (32 * 1024 * 1024) << aper_order; aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; aper_base <<= 25; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index aa57c079c98..e5a4a1e0161 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -61,12 +61,6 @@ unsigned int boot_cpu_physical_apicid = -1U; /* * The highest APIC ID seen during enumeration. - * - * On AMD, this determines the messaging protocol we can use: if all APIC IDs - * are in the 0 ... 7 range, then we can use logical addressing which - * has some performance advantages (better broadcasting). - * - * If there's an APIC ID above 8, we use physical addressing. */ unsigned int max_physical_apicid; @@ -587,7 +581,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) res = (((u64)(*deltatsc)) * pm_100ms); do_div(res, deltapm); apic_printk(APIC_VERBOSE, "TSC delta adjusted to " - "PM-Timer: %lu (%ld) \n", + "PM-Timer: %lu (%ld)\n", (unsigned long)res, *deltatsc); *deltatsc = (long)res; } @@ -1396,7 +1390,7 @@ void __init enable_IR_x2apic(void) } local_irq_save(flags); - mask_8259A(); + legacy_pic->mask_all(); mask_IO_APIC_setup(ioapic_entries); if (dmar_table_init_ret) @@ -1428,7 +1422,7 @@ void __init enable_IR_x2apic(void) nox2apic: if (!ret) /* IR enabling failed */ restore_IO_APIC_setup(ioapic_entries); - unmask_8259A(); + legacy_pic->restore_mask(); local_irq_restore(flags); out: @@ -1646,8 +1640,8 @@ int __init APIC_init_uniprocessor(void) } #endif +#ifndef CONFIG_SMP enable_IR_x2apic(); -#ifdef CONFIG_X86_64 default_setup_apic_routing(); #endif @@ -1897,18 +1891,6 @@ void __cpuinit generic_processor_info(int apicid, int version) if (apicid > max_physical_apicid) max_physical_apicid = apicid; -#ifdef CONFIG_X86_32 - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (num_processors > 8) - def_to_bigsmp = 1; - break; - case X86_VENDOR_AMD: - if (max_physical_apicid >= 8) - def_to_bigsmp = 1; - } -#endif - #if defined(CONFIG_SMP) || defined(CONFIG_X86_64) early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; @@ -2038,7 +2020,7 @@ static int lapic_resume(struct sys_device *dev) } mask_IO_APIC_setup(ioapic_entries); - mask_8259A(); + legacy_pic->mask_all(); } if (x2apic_mode) @@ -2082,7 +2064,7 @@ static int lapic_resume(struct sys_device *dev) if (intr_remapping_enabled) { reenable_intr_remapping(x2apic_mode); - unmask_8259A(); + legacy_pic->restore_mask(); restore_IO_APIC_setup(ioapic_entries); free_ioapic_entries(ioapic_entries); } diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index eacbd2b31d2..09d3b17ce0c 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -223,7 +223,7 @@ struct apic apic_flat = { }; /* - * Physflat mode is used when there are more than 8 CPUs on a AMD system. + * Physflat mode is used when there are more than 8 CPUs on a system. * We cannot use logical delivery in this case because the mask * overflows, so use physical mode. */ @@ -240,6 +240,11 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) printk(KERN_DEBUG "system APIC only can use physical flat"); return 1; } + + if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) { + printk(KERN_DEBUG "IBM Summit detected, will use apic physical"); + return 1; + } #endif return 0; diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index dd2b5f26464..03ba1b895f5 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -42,6 +42,7 @@ #include <linux/errno.h> #include <linux/acpi.h> #include <linux/init.h> +#include <linux/gfp.h> #include <linux/nmi.h> #include <linux/smp.h> #include <linux/io.h> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index de00c4619a5..127b8718abf 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -36,6 +36,7 @@ #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/jiffies.h> /* time_after() */ +#include <linux/slab.h> #ifdef CONFIG_ACPI #include <acpi/acpi_bus.h> #endif @@ -73,8 +74,8 @@ */ int sis_apic_bug = -1; -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(vector_lock); /* * # of IRQ routing registers @@ -94,8 +95,6 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; -/* Number of legacy interrupts */ -static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; /* GSI interrupts */ static int nr_irqs_gsi = NR_IRQS_LEGACY; @@ -140,33 +139,10 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node) /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ #ifdef CONFIG_SPARSE_IRQ -static struct irq_cfg irq_cfgx[] = { +static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; #else -static struct irq_cfg irq_cfgx[NR_IRQS] = { +static struct irq_cfg irq_cfgx[NR_IRQS]; #endif - [0] = { .vector = IRQ0_VECTOR, }, - [1] = { .vector = IRQ1_VECTOR, }, - [2] = { .vector = IRQ2_VECTOR, }, - [3] = { .vector = IRQ3_VECTOR, }, - [4] = { .vector = IRQ4_VECTOR, }, - [5] = { .vector = IRQ5_VECTOR, }, - [6] = { .vector = IRQ6_VECTOR, }, - [7] = { .vector = IRQ7_VECTOR, }, - [8] = { .vector = IRQ8_VECTOR, }, - [9] = { .vector = IRQ9_VECTOR, }, - [10] = { .vector = IRQ10_VECTOR, }, - [11] = { .vector = IRQ11_VECTOR, }, - [12] = { .vector = IRQ12_VECTOR, }, - [13] = { .vector = IRQ13_VECTOR, }, - [14] = { .vector = IRQ14_VECTOR, }, - [15] = { .vector = IRQ15_VECTOR, }, -}; - -void __init io_apic_disable_legacy(void) -{ - nr_legacy_irqs = 0; - nr_irqs_gsi = 0; -} int __init arch_early_irq_init(void) { @@ -176,6 +152,11 @@ int __init arch_early_irq_init(void) int node; int i; + if (!legacy_pic->nr_legacy_irqs) { + nr_irqs_gsi = 0; + io_apic_irqs = ~0UL; + } + cfg = irq_cfgx; count = ARRAY_SIZE(irq_cfgx); node= cpu_to_node(boot_cpu_id); @@ -185,8 +166,14 @@ int __init arch_early_irq_init(void) desc->chip_data = &cfg[i]; zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); - if (i < nr_legacy_irqs) - cpumask_setall(cfg[i].domain); + /* + * For legacy IRQ's, start with assigning irq0 to irq15 to + * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0. + */ + if (i < legacy_pic->nr_legacy_irqs) { + cfg[i].vector = IRQ0_VECTOR + i; + cpumask_set_cpu(0, cfg[i].domain); + } } return 0; @@ -406,7 +393,7 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) struct irq_pin_list *entry; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; int pin; @@ -415,11 +402,11 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) reg = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ if (reg & IO_APIC_REDIR_REMOTE_IRR) { - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return true; } } - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return false; } @@ -433,10 +420,10 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) { union entry_union eu; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return eu.entry; } @@ -459,9 +446,9 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); __ioapic_write_entry(apic, pin, e); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -474,10 +461,10 @@ static void ioapic_mask_entry(int apic, int pin) unsigned long flags; union entry_union eu = { .entry.mask = 1 }; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0x10 + 2*pin, eu.w1); io_apic_write(apic, 0x11 + 2*pin, eu.w2); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -604,9 +591,9 @@ static void mask_IO_APIC_irq_desc(struct irq_desc *desc) BUG_ON(!cfg); - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); __mask_IO_APIC_irq(cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) @@ -614,9 +601,9 @@ static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) struct irq_cfg *cfg = desc->chip_data; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); __unmask_IO_APIC_irq(cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void mask_IO_APIC_irq(unsigned int irq) @@ -865,7 +852,7 @@ static int __init find_isa_irq_apic(int irq, int type) */ static int EISA_ELCR(unsigned int irq) { - if (irq < nr_legacy_irqs) { + if (irq < legacy_pic->nr_legacy_irqs) { unsigned int port = 0x4d0 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } @@ -1140,12 +1127,12 @@ void lock_vector_lock(void) /* Used to the online set of cpus does not change * during assign_irq_vector. */ - spin_lock(&vector_lock); + raw_spin_lock(&vector_lock); } void unlock_vector_lock(void) { - spin_unlock(&vector_lock); + raw_spin_unlock(&vector_lock); } static int @@ -1162,7 +1149,8 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; + static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; + static int current_offset = VECTOR_OFFSET_START % 8; unsigned int old_vector; int cpu, err; cpumask_var_t tmp_mask; @@ -1198,7 +1186,7 @@ next: if (vector >= first_system_vector) { /* If out of vectors on large boxen, must share them. */ offset = (offset + 1) % 8; - vector = FIRST_DEVICE_VECTOR + offset; + vector = FIRST_EXTERNAL_VECTOR + offset; } if (unlikely(current_vector == vector)) continue; @@ -1232,9 +1220,9 @@ int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) int err; unsigned long flags; - spin_lock_irqsave(&vector_lock, flags); + raw_spin_lock_irqsave(&vector_lock, flags); err = __assign_irq_vector(irq, cfg, mask); - spin_unlock_irqrestore(&vector_lock, flags); + raw_spin_unlock_irqrestore(&vector_lock, flags); return err; } @@ -1268,14 +1256,27 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) void __setup_vector_irq(int cpu) { /* Initialize vector_irq on a new cpu */ - /* This function must be called with vector_lock held */ int irq, vector; struct irq_cfg *cfg; struct irq_desc *desc; + /* + * vector_lock will make sure that we don't run into irq vector + * assignments that might be happening on another cpu in parallel, + * while we setup our initial vector to irq mappings. + */ + raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ for_each_irq_desc(irq, desc) { cfg = desc->chip_data; + + /* + * If it is a legacy IRQ handled by the legacy PIC, this cpu + * will be part of the irq_cfg's domain. + */ + if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq)) + cpumask_set_cpu(cpu, cfg->domain); + if (!cpumask_test_cpu(cpu, cfg->domain)) continue; vector = cfg->vector; @@ -1291,6 +1292,7 @@ void __setup_vector_irq(int cpu) if (!cpumask_test_cpu(cpu, cfg->domain)) per_cpu(vector_irq, cpu)[vector] = -1; } + raw_spin_unlock(&vector_lock); } static struct irq_chip ioapic_chip; @@ -1440,6 +1442,14 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq cfg = desc->chip_data; + /* + * For legacy irqs, cfg->domain starts with cpu 0 for legacy + * controllers like 8259. Now that IO-APIC can handle this irq, update + * the cfg->domain. + */ + if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain)) + apic->vector_allocation_domain(0, cfg->domain); + if (assign_irq_vector(irq, cfg, apic->target_cpus())) return; @@ -1461,8 +1471,8 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq } ioapic_register_intr(irq, desc, trigger); - if (irq < nr_legacy_irqs) - disable_8259A_irq(irq); + if (irq < legacy_pic->nr_legacy_irqs) + legacy_pic->chip->mask(irq); ioapic_write_entry(apic_id, pin, entry); } @@ -1473,7 +1483,7 @@ static struct { static void __init setup_IO_APIC_irqs(void) { - int apic_id = 0, pin, idx, irq; + int apic_id, pin, idx, irq; int notcon = 0; struct irq_desc *desc; struct irq_cfg *cfg; @@ -1481,14 +1491,7 @@ static void __init setup_IO_APIC_irqs(void) apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); -#ifdef CONFIG_ACPI - if (!acpi_disabled && acpi_ioapic) { - apic_id = mp_find_ioapic(0); - if (apic_id < 0) - apic_id = 0; - } -#endif - + for (apic_id = 0; apic_id < nr_ioapics; apic_id++) for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { idx = find_irq_entry(apic_id, pin, mp_INT); if (idx == -1) { @@ -1510,6 +1513,9 @@ static void __init setup_IO_APIC_irqs(void) irq = pin_2_irq(idx, apic_id, pin); + if ((apic_id > 0) && (irq > 16)) + continue; + /* * Skip the timer IRQ if there's a quirk handler * installed and if it returns 1: @@ -1539,6 +1545,56 @@ static void __init setup_IO_APIC_irqs(void) } /* + * for the gsit that is not in first ioapic + * but could not use acpi_register_gsi() + * like some special sci in IBM x3330 + */ +void setup_IO_APIC_irq_extra(u32 gsi) +{ + int apic_id = 0, pin, idx, irq; + int node = cpu_to_node(boot_cpu_id); + struct irq_desc *desc; + struct irq_cfg *cfg; + + /* + * Convert 'gsi' to 'ioapic.pin'. + */ + apic_id = mp_find_ioapic(gsi); + if (apic_id < 0) + return; + + pin = mp_find_ioapic_pin(apic_id, gsi); + idx = find_irq_entry(apic_id, pin, mp_INT); + if (idx == -1) + return; + + irq = pin_2_irq(idx, apic_id, pin); +#ifdef CONFIG_SPARSE_IRQ + desc = irq_to_desc(irq); + if (desc) + return; +#endif + desc = irq_to_desc_alloc_node(irq, node); + if (!desc) { + printk(KERN_INFO "can not get irq_desc for %d\n", irq); + return; + } + + cfg = desc->chip_data; + add_pin_to_irq_node(cfg, node, apic_id, pin); + + if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) { + pr_debug("Pin %d-%d already programmed\n", + mp_ioapics[apic_id].apicid, pin); + return; + } + set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed); + + setup_IO_APIC_irq(apic_id, pin, irq, desc, + irq_trigger(idx), irq_polarity(idx)); +} + +/* * Set up the timer pin, possibly with the 8259A-master behind. */ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin, @@ -1601,14 +1657,14 @@ __apicdebuginit(void) print_IO_APIC(void) for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic, 0); reg_01.raw = io_apic_read(apic, 1); if (reg_01.bits.version >= 0x10) reg_02.raw = io_apic_read(apic, 2); if (reg_01.bits.version >= 0x20) reg_03.raw = io_apic_read(apic, 3); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); printk("\n"); printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid); @@ -1647,7 +1703,7 @@ __apicdebuginit(void) print_IO_APIC(void) printk(KERN_DEBUG ".... IRQ redirection table:\n"); printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" - " Stat Dmod Deli Vect: \n"); + " Stat Dmod Deli Vect:\n"); for (i = 0; i <= reg_01.bits.entries; i++) { struct IO_APIC_route_entry entry; @@ -1825,12 +1881,12 @@ __apicdebuginit(void) print_PIC(void) unsigned int v; unsigned long flags; - if (!nr_legacy_irqs) + if (!legacy_pic->nr_legacy_irqs) return; printk(KERN_DEBUG "\nprinting PIC contents\n"); - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); v = inb(0xa1) << 8 | inb(0x21); printk(KERN_DEBUG "... PIC IMR: %04x\n", v); @@ -1844,7 +1900,7 @@ __apicdebuginit(void) print_PIC(void) outb(0x0a,0xa0); outb(0x0a,0x20); - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); printk(KERN_DEBUG "... PIC ISR: %04x\n", v); @@ -1903,13 +1959,13 @@ void __init enable_IO_APIC(void) * The number of IO-APIC IRQ registers (== #pins): */ for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_01.raw = io_apic_read(apic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); nr_ioapic_registers[apic] = reg_01.bits.entries+1; } - if (!nr_legacy_irqs) + if (!legacy_pic->nr_legacy_irqs) return; for(apic = 0; apic < nr_ioapics; apic++) { @@ -1966,7 +2022,7 @@ void disable_IO_APIC(void) */ clear_IO_APIC(); - if (!nr_legacy_irqs) + if (!legacy_pic->nr_legacy_irqs) return; /* @@ -2045,9 +2101,9 @@ void __init setup_ioapic_ids_from_mpc(void) for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { /* Read the register 0 value */ - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic_id, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); old_id = mp_ioapics[apic_id].apicid; @@ -2106,16 +2162,16 @@ void __init setup_ioapic_ids_from_mpc(void) mp_ioapics[apic_id].apicid); reg_00.bits.ID = mp_ioapics[apic_id].apicid; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic_id, 0, reg_00.raw); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); /* * Sanity check */ - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic_id, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); if (reg_00.bits.ID != mp_ioapics[apic_id].apicid) printk("could not set ID!\n"); else @@ -2198,15 +2254,15 @@ static unsigned int startup_ioapic_irq(unsigned int irq) unsigned long flags; struct irq_cfg *cfg; - spin_lock_irqsave(&ioapic_lock, flags); - if (irq < nr_legacy_irqs) { - disable_8259A_irq(irq); - if (i8259A_irq_pending(irq)) + raw_spin_lock_irqsave(&ioapic_lock, flags); + if (irq < legacy_pic->nr_legacy_irqs) { + legacy_pic->chip->mask(irq); + if (legacy_pic->irq_pending(irq)) was_pending = 1; } cfg = irq_cfg(irq); __unmask_IO_APIC_irq(cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; } @@ -2217,9 +2273,9 @@ static int ioapic_retrigger_irq(unsigned int irq) struct irq_cfg *cfg = irq_cfg(irq); unsigned long flags; - spin_lock_irqsave(&vector_lock, flags); + raw_spin_lock_irqsave(&vector_lock, flags); apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); - spin_unlock_irqrestore(&vector_lock, flags); + raw_spin_unlock_irqrestore(&vector_lock, flags); return 1; } @@ -2312,14 +2368,14 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) irq = desc->irq; cfg = desc->chip_data; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); ret = set_desc_affinity(desc, mask, &dest); if (!ret) { /* Only the high 8 bits are valid. */ dest = SET_APIC_LOGICAL_ID(dest); __target_IO_APIC_irq(irq, dest, cfg); } - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return ret; } @@ -2434,6 +2490,13 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) cfg = irq_cfg(irq); raw_spin_lock(&desc->lock); + /* + * Check if the irq migration is in progress. If so, we + * haven't received the cleanup request yet for this irq. + */ + if (cfg->move_in_progress) + goto unlock; + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) goto unlock; @@ -2547,9 +2610,9 @@ static void eoi_ioapic_irq(struct irq_desc *desc) irq = desc->irq; cfg = desc->chip_data; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); __eoi_ioapic_irq(irq, cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void ack_apic_level(unsigned int irq) @@ -2727,8 +2790,8 @@ static inline void init_IO_APIC_traps(void) * so default to an old-fashioned 8259 * interrupt if we can.. */ - if (irq < nr_legacy_irqs) - make_8259A_irq(irq); + if (irq < legacy_pic->nr_legacy_irqs) + legacy_pic->make_irq(irq); else /* Strange. Oh, well.. */ desc->chip = &no_irq_chip; @@ -2885,7 +2948,7 @@ static inline void __init check_timer(void) /* * get/set the timer IRQ vector: */ - disable_8259A_irq(0); + legacy_pic->chip->mask(0); assign_irq_vector(0, cfg, apic->target_cpus()); /* @@ -2898,7 +2961,7 @@ static inline void __init check_timer(void) * automatically. */ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - init_8259A(1); + legacy_pic->init(1); #ifdef CONFIG_X86_32 { unsigned int ver; @@ -2957,7 +3020,7 @@ static inline void __init check_timer(void) if (timer_irq_works()) { if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); } if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); @@ -2980,14 +3043,14 @@ static inline void __init check_timer(void) */ replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); + legacy_pic->chip->mask(0); setup_nmi(); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); } goto out; } @@ -2995,7 +3058,7 @@ static inline void __init check_timer(void) * Cleanup, just in case ... */ local_irq_disable(); - disable_8259A_irq(0); + legacy_pic->chip->mask(0); clear_IO_APIC_pin(apic2, pin2); apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); } @@ -3014,22 +3077,22 @@ static inline void __init check_timer(void) lapic_register_intr(0, desc); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } local_irq_disable(); - disable_8259A_irq(0); + legacy_pic->chip->mask(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as ExtINT IRQ...\n"); - init_8259A(0); - make_8259A_irq(0); + legacy_pic->init(0); + legacy_pic->make_irq(0); apic_write(APIC_LVT0, APIC_DM_EXTINT); unlock_ExtINT_logic(); @@ -3071,7 +3134,7 @@ void __init setup_IO_APIC(void) /* * calling enable_IO_APIC() is moved to setup_local_APIC for BP */ - io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL; + io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); /* @@ -3082,7 +3145,7 @@ void __init setup_IO_APIC(void) sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); - if (nr_legacy_irqs) + if (legacy_pic->nr_legacy_irqs) check_timer(); } @@ -3131,13 +3194,13 @@ static int ioapic_resume(struct sys_device *dev) data = container_of(dev, struct sysfs_ioapic_data, dev); entry = data->entry; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(dev->id, 0); if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { reg_00.bits.ID = mp_ioapics[dev->id].apicid; io_apic_write(dev->id, 0, reg_00.raw); } - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); for (i = 0; i < nr_ioapic_registers[dev->id]; i++) ioapic_write_entry(dev->id, i, entry[i]); @@ -3200,7 +3263,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) if (irq_want < nr_irqs_gsi) irq_want = nr_irqs_gsi; - spin_lock_irqsave(&vector_lock, flags); + raw_spin_lock_irqsave(&vector_lock, flags); for (new = irq_want; new < nr_irqs; new++) { desc_new = irq_to_desc_alloc_node(new, node); if (!desc_new) { @@ -3219,14 +3282,11 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) irq = new; break; } - spin_unlock_irqrestore(&vector_lock, flags); + raw_spin_unlock_irqrestore(&vector_lock, flags); + + if (irq > 0) + dynamic_irq_init_keep_chip_data(irq); - if (irq > 0) { - dynamic_irq_init(irq); - /* restore it, in case dynamic_irq_init clear it */ - if (desc_new) - desc_new->chip_data = cfg_new; - } return irq; } @@ -3248,20 +3308,13 @@ int create_irq(void) void destroy_irq(unsigned int irq) { unsigned long flags; - struct irq_cfg *cfg; - struct irq_desc *desc; - /* store it, in case dynamic_irq_cleanup clear it */ - desc = irq_to_desc(irq); - cfg = desc->chip_data; - dynamic_irq_cleanup(irq); - /* connect back irq_cfg */ - desc->chip_data = cfg; + dynamic_irq_cleanup_keep_chip_data(irq); free_irte(irq); - spin_lock_irqsave(&vector_lock, flags); - __clear_irq_vector(irq, cfg); - spin_unlock_irqrestore(&vector_lock, flags); + raw_spin_lock_irqsave(&vector_lock, flags); + __clear_irq_vector(irq, get_irq_chip_data(irq)); + raw_spin_unlock_irqrestore(&vector_lock, flags); } /* @@ -3798,9 +3851,9 @@ int __init io_apic_get_redir_entries (int ioapic) union IO_APIC_reg_01 reg_01; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return reg_01.bits.entries; } @@ -3883,7 +3936,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, /* * IRQs < 16 are already in the irq_2_pin[] map */ - if (irq >= nr_legacy_irqs) { + if (irq >= legacy_pic->nr_legacy_irqs) { cfg = desc->chip_data; if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { printk(KERN_INFO "can not add pin %d for irq %d\n", @@ -3962,9 +4015,9 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) if (physids_empty(apic_id_map)) apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map); - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); if (apic_id >= get_physical_broadcast()) { printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " @@ -3998,10 +4051,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) if (reg_00.bits.ID != apic_id) { reg_00.bits.ID = apic_id; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(ioapic, 0, reg_00.raw); reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); /* Sanity check */ if (reg_00.bits.ID != apic_id) { @@ -4022,9 +4075,9 @@ int __init io_apic_get_version(int ioapic) union IO_APIC_reg_01 reg_01; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return reg_01.bits.version; } @@ -4056,27 +4109,23 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) #ifdef CONFIG_SMP void __init setup_ioapic_dest(void) { - int pin, ioapic = 0, irq, irq_entry; + int pin, ioapic, irq, irq_entry; struct irq_desc *desc; const struct cpumask *mask; if (skip_ioapic_setup == 1) return; -#ifdef CONFIG_ACPI - if (!acpi_disabled && acpi_ioapic) { - ioapic = mp_find_ioapic(0); - if (ioapic < 0) - ioapic = 0; - } -#endif - + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { irq_entry = find_irq_entry(ioapic, pin, mp_INT); if (irq_entry == -1) continue; irq = pin_2_irq(irq_entry, ioapic, pin); + if ((ioapic > 0) && (irq > 16)) + continue; + desc = irq_to_desc(irq); /* @@ -4261,3 +4310,24 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) nr_ioapics++; } + +/* Enable IOAPIC early just for system timer */ +void __init pre_init_apic_IRQ0(void) +{ + struct irq_cfg *cfg; + struct irq_desc *desc; + + printk(KERN_INFO "Early APIC setup for system timer0\n"); +#ifndef CONFIG_SMP + phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); +#endif + desc = irq_to_desc_alloc_node(0, 0); + + setup_local_APIC(); + + cfg = irq_cfg(0); + add_pin_to_irq_node(cfg, 0, 0, 0); + set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); + + setup_IO_APIC_irq(0, 0, 0, desc, 0, 0); +} diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 0159a69396c..1edaf15c0b8 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -18,6 +18,7 @@ #include <linux/delay.h> #include <linux/interrupt.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/sysdev.h> #include <linux/sysctl.h> #include <linux/percpu.h> @@ -177,7 +178,7 @@ int __init check_nmi_watchdog(void) error: if (nmi_watchdog == NMI_IO_APIC) { if (!timer_through_8259) - disable_8259A_irq(0); + legacy_pic->chip->mask(0); on_each_cpu(__acpi_nmi_disable, NULL, 1); } @@ -416,13 +417,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) /* We can be called before check_nmi_watchdog, hence NULL check. */ if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - static DEFINE_SPINLOCK(lock); /* Serialise the printks */ + static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */ - spin_lock(&lock); + raw_spin_lock(&lock); printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); show_regs(regs); dump_stack(); - spin_unlock(&lock); + raw_spin_unlock(&lock); cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); rc = 1; @@ -438,8 +439,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ - __this_cpu_inc(per_cpu_var(alert_counter)); - if (__this_cpu_read(per_cpu_var(alert_counter)) == 5 * nmi_hz) + __this_cpu_inc(alert_counter); + if (__this_cpu_read(alert_counter) == 5 * nmi_hz) /* * die_nmi will return ONLY if NOTIFY_STOP happens.. */ @@ -447,7 +448,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) regs, panic_on_timeout); } else { __get_cpu_var(last_irq_sum) = sum; - __this_cpu_write(per_cpu_var(alert_counter), 0); + __this_cpu_write(alert_counter, 0); } /* see if the nmi watchdog went off */ diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 98c4665f251..3e28401f161 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -225,7 +225,7 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc) mpc_record = 0; printk(KERN_INFO - "Found an OEM MPC table at %8p - parsing it ... \n", oemtable); + "Found an OEM MPC table at %8p - parsing it...\n", oemtable); if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) { printk(KERN_WARNING @@ -277,6 +277,7 @@ static __init void early_check_numaq(void) x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; x86_init.timers.tsc_pre_init = numaq_tsc_init; + x86_init.pci.init = pci_numaq_init; } } diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 1a6559f6768..99d2fe01608 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -52,7 +52,32 @@ static int __init print_ipi_mode(void) } late_initcall(print_ipi_mode); -void default_setup_apic_routing(void) +void __init default_setup_apic_routing(void) +{ + int version = apic_version[boot_cpu_physical_apicid]; + + if (num_possible_cpus() > 8) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (!APIC_XAPIC(version)) { + def_to_bigsmp = 0; + break; + } + /* If P4 and above fall through */ + case X86_VENDOR_AMD: + def_to_bigsmp = 1; + } + } + +#ifdef CONFIG_X86_BIGSMP + generic_bigsmp_probe(); +#endif + + if (apic->setup_apic_routing) + apic->setup_apic_routing(); +} + +static void setup_apic_flat_routing(void) { #ifdef CONFIG_X86_IO_APIC printk(KERN_INFO @@ -103,7 +128,7 @@ struct apic apic_default = { .init_apic_ldr = default_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, - .setup_apic_routing = default_setup_apic_routing, + .setup_apic_routing = setup_apic_flat_routing, .multi_timer_check = NULL, .apicid_to_node = default_apicid_to_node, .cpu_to_logical_apicid = default_cpu_to_logical_apicid, diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index c4cbd3080c1..83e9be4778e 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -67,17 +67,8 @@ void __init default_setup_apic_routing(void) } #endif - if (apic == &apic_flat) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (num_processors > 8) - apic = &apic_physflat; - break; - case X86_VENDOR_AMD: - if (max_physical_apicid >= 8) - apic = &apic_physflat; - } - } + if (apic == &apic_flat && num_possible_cpus() > 8) + apic = &apic_physflat; printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index d56b0efb205..c085d52dbaf 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -5,7 +5,7 @@ * * SGI UV APIC functions (note: not an Intel compatible APIC) * - * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2009 Silicon Graphics, Inc. All rights reserved. */ #include <linux/cpumask.h> #include <linux/hardirq.h> @@ -17,9 +17,12 @@ #include <linux/ctype.h> #include <linux/sched.h> #include <linux/timer.h> +#include <linux/slab.h> #include <linux/cpu.h> #include <linux/init.h> #include <linux/io.h> +#include <linux/pci.h> +#include <linux/kdebug.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/uv_hub.h> @@ -34,8 +37,13 @@ DEFINE_PER_CPU(int, x2apic_extra_bits); +#define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args) + static enum uv_system_type uv_system_type; static u64 gru_start_paddr, gru_end_paddr; +int uv_min_hub_revision_id; +EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); +static DEFINE_SPINLOCK(uv_nmi_lock); static inline bool is_GRU_range(u64 start, u64 end) { @@ -55,20 +63,28 @@ static int early_get_nodeid(void) mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr)); node_id.v = *mmr; early_iounmap(mmr, sizeof(*mmr)); + + /* Currently, all blades have same revision number */ + uv_min_hub_revision_id = node_id.s.revision; + return node_id.s.node_id; } static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { + int nodeid; + if (!strcmp(oem_id, "SGI")) { + nodeid = early_get_nodeid(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; + x86_platform.nmi_init = uv_nmi_init; if (!strcmp(oem_table_id, "UVL")) uv_system_type = UV_LEGACY_APIC; else if (!strcmp(oem_table_id, "UVX")) uv_system_type = UV_X2APIC; else if (!strcmp(oem_table_id, "UVH")) { __get_cpu_var(x2apic_extra_bits) = - early_get_nodeid() << (UV_APIC_PNODE_SHIFT - 1); + nodeid << (UV_APIC_PNODE_SHIFT - 1); uv_system_type = UV_NON_UNIQUE_APIC; return 1; } @@ -105,11 +121,9 @@ EXPORT_SYMBOL_GPL(uv_possible_blades); unsigned long sn_rtc_cycles_per_second; EXPORT_SYMBOL(sn_rtc_cycles_per_second); -/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ - static const struct cpumask *uv_target_cpus(void) { - return cpumask_of(0); + return cpu_online_mask; } static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) @@ -374,13 +388,13 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) enum map_type {map_wb, map_uc}; -static __init void map_high(char *id, unsigned long base, int shift, - int max_pnode, enum map_type map_type) +static __init void map_high(char *id, unsigned long base, int pshift, + int bshift, int max_pnode, enum map_type map_type) { unsigned long bytes, paddr; - paddr = base << shift; - bytes = (1UL << shift) * (max_pnode + 1); + paddr = base << pshift; + bytes = (1UL << bshift) * (max_pnode + 1); printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes); if (map_type == map_uc) @@ -396,7 +410,7 @@ static __init void map_gru_high(int max_pnode) gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); if (gru.s.enable) { - map_high("GRU", gru.s.base, shift, max_pnode, map_wb); + map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb); gru_start_paddr = ((u64)gru.s.base << shift); gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1); @@ -410,7 +424,7 @@ static __init void map_mmr_high(int max_pnode) mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); if (mmr.s.enable) - map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); + map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc); } static __init void map_mmioh_high(int max_pnode) @@ -420,7 +434,8 @@ static __init void map_mmioh_high(int max_pnode) mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); if (mmioh.s.enable) - map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); + map_high("MMIOH", mmioh.s.base, shift, mmioh.s.m_io, + max_pnode, map_uc); } static __init void map_low_mmrs(void) @@ -472,7 +487,7 @@ static void uv_heartbeat(unsigned long ignored) static void __cpuinit uv_heartbeat_enable(int cpu) { - if (!uv_cpu_hub_info(cpu)->scir.enabled) { + while (!uv_cpu_hub_info(cpu)->scir.enabled) { struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); @@ -480,11 +495,10 @@ static void __cpuinit uv_heartbeat_enable(int cpu) timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; add_timer_on(timer, cpu); uv_cpu_hub_info(cpu)->scir.enabled = 1; - } - /* check boot cpu */ - if (!uv_cpu_hub_info(0)->scir.enabled) - uv_heartbeat_enable(0); + /* also ensure that boot cpu is enabled */ + cpu = 0; + } } #ifdef CONFIG_HOTPLUG_CPU @@ -543,6 +557,30 @@ late_initcall(uv_init_heartbeat); #endif /* !CONFIG_HOTPLUG_CPU */ +/* Direct Legacy VGA I/O traffic to designated IOH */ +int uv_set_vga_state(struct pci_dev *pdev, bool decode, + unsigned int command_bits, bool change_bridge) +{ + int domain, bus, rc; + + PR_DEVEL("devfn %x decode %d cmd %x chg_brdg %d\n", + pdev->devfn, decode, command_bits, change_bridge); + + if (!change_bridge) + return 0; + + if ((command_bits & PCI_COMMAND_IO) == 0) + return 0; + + domain = pci_domain_nr(pdev->bus); + bus = pdev->bus->number; + + rc = uv_bios_set_legacy_vga_target(decode, domain, bus); + PR_DEVEL("vga decode %d %x:%x, rc: %d\n", decode, domain, bus, rc); + + return rc; +} + /* * Called on each cpu to initialize the per_cpu UV data area. * FIXME: hotplug not supported yet @@ -559,6 +597,46 @@ void __cpuinit uv_cpu_init(void) set_x2apic_extra_bits(uv_hub_info->pnode); } +/* + * When NMI is received, print a stack trace. + */ +int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) +{ + if (reason != DIE_NMI_IPI) + return NOTIFY_OK; + /* + * Use a lock so only one cpu prints at a time + * to prevent intermixed output. + */ + spin_lock(&uv_nmi_lock); + pr_info("NMI stack dump cpu %u:\n", smp_processor_id()); + dump_stack(); + spin_unlock(&uv_nmi_lock); + + return NOTIFY_STOP; +} + +static struct notifier_block uv_dump_stack_nmi_nb = { + .notifier_call = uv_handle_nmi +}; + +void uv_register_nmi_notifier(void) +{ + if (register_die_notifier(&uv_dump_stack_nmi_nb)) + printk(KERN_WARNING "UV NMI handler failed to register\n"); +} + +void uv_nmi_init(void) +{ + unsigned int value; + + /* + * Unmask NMI on all cpus + */ + value = apic_read(APIC_LVT1) | APIC_DM_NMI; + value &= ~APIC_LVT_MASKED; + apic_write(APIC_LVT1, value); +} void __init uv_system_init(void) { @@ -624,13 +702,15 @@ void __init uv_system_init(void) } uv_bios_init(); - uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, - &sn_coherency_id, &sn_region_size); + uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, + &sn_region_size, &system_serial_number); uv_rtc_init(); for_each_present_cpu(cpu) { + int apicid = per_cpu(x86_cpu_to_apicid, cpu); + nid = cpu_to_node(cpu); - pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); + pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); lcpu = uv_blade_info[blade].nr_possible_cpus; uv_blade_info[blade].nr_possible_cpus++; @@ -651,15 +731,13 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; - uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; + uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid); uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; max_pnode = max(pnode, max_pnode); - printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, " - "lcpu %d, blade %d\n", - cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid, - lcpu, blade); + printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n", + cpu, apicid, pnode, nid, lcpu, blade); } /* Add blade/pnode info for nodes without cpus */ @@ -680,5 +758,9 @@ void __init uv_system_init(void) uv_cpu_init(); uv_scir_register_cpu_notifier(); + uv_register_nmi_notifier(); proc_mkdir("sgi_uv", NULL); + + /* register Legacy VGA I/O redirection handler */ + pci_register_set_vga_state(uv_set_vga_state); } diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index b5b6b23bce5..031aa887b0e 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1992,8 +1992,8 @@ static int __init apm_is_horked_d850md(const struct dmi_system_id *d) apm_info.disabled = 1; printk(KERN_INFO "%s machine detected. " "Disabling APM.\n", d->ident); - printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n"); - printk(KERN_INFO "download from support.intel.com \n"); + printk(KERN_INFO "This bug is fixed in bios P15 which is available for\n"); + printk(KERN_INFO "download from support.intel.com\n"); } return 0; } diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index b0206a211b0..8bc57baaa9a 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -15,8 +15,8 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * - * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) Russ Anderson + * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Russ Anderson <rja@sgi.com> */ #include <linux/efi.h> @@ -30,6 +30,7 @@ static struct uv_systab uv_systab; s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) { struct uv_systab *tab = &uv_systab; + s64 ret; if (!tab->function) /* @@ -37,9 +38,11 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) */ return BIOS_STATUS_UNIMPLEMENTED; - return efi_call6((void *)__va(tab->function), - (u64)which, a1, a2, a3, a4, a5); + ret = efi_call6((void *)__va(tab->function), (u64)which, + a1, a2, a3, a4, a5); + return ret; } +EXPORT_SYMBOL_GPL(uv_bios_call); s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) @@ -73,11 +76,14 @@ long sn_coherency_id; EXPORT_SYMBOL_GPL(sn_coherency_id); long sn_region_size; EXPORT_SYMBOL_GPL(sn_region_size); +long system_serial_number; +EXPORT_SYMBOL_GPL(system_serial_number); int uv_type; +EXPORT_SYMBOL_GPL(uv_type); s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, - long *region) + long *region, long *ssn) { s64 ret; u64 v0, v1; @@ -97,8 +103,11 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, *coher = part.coherence_id; if (region) *region = part.region_size; + if (ssn) + *ssn = v1; return ret; } +EXPORT_SYMBOL_GPL(uv_bios_get_sn_info); int uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size, @@ -154,6 +163,25 @@ s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) } EXPORT_SYMBOL_GPL(uv_bios_freq_base); +/* + * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target + * @decode: true to enable target, false to disable target + * @domain: PCI domain number + * @bus: PCI bus number + * + * Returns: + * 0: Success + * -EINVAL: Invalid domain or bus number + * -ENOSYS: Capability not available + * -EBUSY: Legacy VGA I/O cannot be retargeted at this time + */ +int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus) +{ + return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET, + (u64)decode, (u64)domain, (u64)bus, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); + #ifdef CONFIG_EFI void uv_bios_init(void) @@ -185,4 +213,3 @@ void uv_bios_init(void) void uv_bios_init(void) { } #endif - diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c index 30f25a75fe2..5de7f4c5697 100644 --- a/arch/x86/kernel/bootflag.c +++ b/arch/x86/kernel/bootflag.c @@ -5,7 +5,6 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/string.h> -#include <linux/slab.h> #include <linux/spinlock.h> #include <linux/acpi.h> #include <asm/io.h> diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 1d2cb383410..c202b62f367 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -19,8 +19,6 @@ obj-y += vmware.o hypervisor.o sched.o obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += bugs_64.o -obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o - obj-$(CONFIG_CPU_SUP_INTEL) += intel.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 468489b57aa..97ad79cdf68 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -32,6 +32,10 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, + { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a }, + { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a }, + { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a }, + { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a }, { 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c deleted file mode 100644 index b368cd86299..00000000000 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ /dev/null @@ -1,688 +0,0 @@ -/* - * CPU x86 architecture debug code - * - * Copyright(C) 2009 Jaswinder Singh Rajput - * - * For licencing details see kernel-base/COPYING - */ - -#include <linux/interrupt.h> -#include <linux/compiler.h> -#include <linux/seq_file.h> -#include <linux/debugfs.h> -#include <linux/kprobes.h> -#include <linux/uaccess.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/percpu.h> -#include <linux/signal.h> -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/smp.h> - -#include <asm/cpu_debug.h> -#include <asm/paravirt.h> -#include <asm/system.h> -#include <asm/traps.h> -#include <asm/apic.h> -#include <asm/desc.h> - -static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpud_arr); -static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], cpud_priv_arr); -static DEFINE_PER_CPU(int, cpud_priv_count); - -static DEFINE_MUTEX(cpu_debug_lock); - -static struct dentry *cpu_debugfs_dir; - -static struct cpu_debug_base cpu_base[] = { - { "mc", CPU_MC, 0 }, - { "monitor", CPU_MONITOR, 0 }, - { "time", CPU_TIME, 0 }, - { "pmc", CPU_PMC, 1 }, - { "platform", CPU_PLATFORM, 0 }, - { "apic", CPU_APIC, 0 }, - { "poweron", CPU_POWERON, 0 }, - { "control", CPU_CONTROL, 0 }, - { "features", CPU_FEATURES, 0 }, - { "lastbranch", CPU_LBRANCH, 0 }, - { "bios", CPU_BIOS, 0 }, - { "freq", CPU_FREQ, 0 }, - { "mtrr", CPU_MTRR, 0 }, - { "perf", CPU_PERF, 0 }, - { "cache", CPU_CACHE, 0 }, - { "sysenter", CPU_SYSENTER, 0 }, - { "therm", CPU_THERM, 0 }, - { "misc", CPU_MISC, 0 }, - { "debug", CPU_DEBUG, 0 }, - { "pat", CPU_PAT, 0 }, - { "vmx", CPU_VMX, 0 }, - { "call", CPU_CALL, 0 }, - { "base", CPU_BASE, 0 }, - { "ver", CPU_VER, 0 }, - { "conf", CPU_CONF, 0 }, - { "smm", CPU_SMM, 0 }, - { "svm", CPU_SVM, 0 }, - { "osvm", CPU_OSVM, 0 }, - { "tss", CPU_TSS, 0 }, - { "cr", CPU_CR, 0 }, - { "dt", CPU_DT, 0 }, - { "registers", CPU_REG_ALL, 0 }, -}; - -static struct cpu_file_base cpu_file[] = { - { "index", CPU_REG_ALL, 0 }, - { "value", CPU_REG_ALL, 1 }, -}; - -/* CPU Registers Range */ -static struct cpu_debug_range cpu_reg_range[] = { - { 0x00000000, 0x00000001, CPU_MC, }, - { 0x00000006, 0x00000007, CPU_MONITOR, }, - { 0x00000010, 0x00000010, CPU_TIME, }, - { 0x00000011, 0x00000013, CPU_PMC, }, - { 0x00000017, 0x00000017, CPU_PLATFORM, }, - { 0x0000001B, 0x0000001B, CPU_APIC, }, - { 0x0000002A, 0x0000002B, CPU_POWERON, }, - { 0x0000002C, 0x0000002C, CPU_FREQ, }, - { 0x0000003A, 0x0000003A, CPU_CONTROL, }, - { 0x00000040, 0x00000047, CPU_LBRANCH, }, - { 0x00000060, 0x00000067, CPU_LBRANCH, }, - { 0x00000079, 0x00000079, CPU_BIOS, }, - { 0x00000088, 0x0000008A, CPU_CACHE, }, - { 0x0000008B, 0x0000008B, CPU_BIOS, }, - { 0x0000009B, 0x0000009B, CPU_MONITOR, }, - { 0x000000C1, 0x000000C4, CPU_PMC, }, - { 0x000000CD, 0x000000CD, CPU_FREQ, }, - { 0x000000E7, 0x000000E8, CPU_PERF, }, - { 0x000000FE, 0x000000FE, CPU_MTRR, }, - - { 0x00000116, 0x0000011E, CPU_CACHE, }, - { 0x00000174, 0x00000176, CPU_SYSENTER, }, - { 0x00000179, 0x0000017B, CPU_MC, }, - { 0x00000186, 0x00000189, CPU_PMC, }, - { 0x00000198, 0x00000199, CPU_PERF, }, - { 0x0000019A, 0x0000019A, CPU_TIME, }, - { 0x0000019B, 0x0000019D, CPU_THERM, }, - { 0x000001A0, 0x000001A0, CPU_MISC, }, - { 0x000001C9, 0x000001C9, CPU_LBRANCH, }, - { 0x000001D7, 0x000001D8, CPU_LBRANCH, }, - { 0x000001D9, 0x000001D9, CPU_DEBUG, }, - { 0x000001DA, 0x000001E0, CPU_LBRANCH, }, - - { 0x00000200, 0x0000020F, CPU_MTRR, }, - { 0x00000250, 0x00000250, CPU_MTRR, }, - { 0x00000258, 0x00000259, CPU_MTRR, }, - { 0x00000268, 0x0000026F, CPU_MTRR, }, - { 0x00000277, 0x00000277, CPU_PAT, }, - { 0x000002FF, 0x000002FF, CPU_MTRR, }, - - { 0x00000300, 0x00000311, CPU_PMC, }, - { 0x00000345, 0x00000345, CPU_PMC, }, - { 0x00000360, 0x00000371, CPU_PMC, }, - { 0x0000038D, 0x00000390, CPU_PMC, }, - { 0x000003A0, 0x000003BE, CPU_PMC, }, - { 0x000003C0, 0x000003CD, CPU_PMC, }, - { 0x000003E0, 0x000003E1, CPU_PMC, }, - { 0x000003F0, 0x000003F2, CPU_PMC, }, - - { 0x00000400, 0x00000417, CPU_MC, }, - { 0x00000480, 0x0000048B, CPU_VMX, }, - - { 0x00000600, 0x00000600, CPU_DEBUG, }, - { 0x00000680, 0x0000068F, CPU_LBRANCH, }, - { 0x000006C0, 0x000006CF, CPU_LBRANCH, }, - - { 0x000107CC, 0x000107D3, CPU_PMC, }, - - { 0xC0000080, 0xC0000080, CPU_FEATURES, }, - { 0xC0000081, 0xC0000084, CPU_CALL, }, - { 0xC0000100, 0xC0000102, CPU_BASE, }, - { 0xC0000103, 0xC0000103, CPU_TIME, }, - - { 0xC0010000, 0xC0010007, CPU_PMC, }, - { 0xC0010010, 0xC0010010, CPU_CONF, }, - { 0xC0010015, 0xC0010015, CPU_CONF, }, - { 0xC0010016, 0xC001001A, CPU_MTRR, }, - { 0xC001001D, 0xC001001D, CPU_MTRR, }, - { 0xC001001F, 0xC001001F, CPU_CONF, }, - { 0xC0010030, 0xC0010035, CPU_BIOS, }, - { 0xC0010044, 0xC0010048, CPU_MC, }, - { 0xC0010050, 0xC0010056, CPU_SMM, }, - { 0xC0010058, 0xC0010058, CPU_CONF, }, - { 0xC0010060, 0xC0010060, CPU_CACHE, }, - { 0xC0010061, 0xC0010068, CPU_SMM, }, - { 0xC0010069, 0xC001006B, CPU_SMM, }, - { 0xC0010070, 0xC0010071, CPU_SMM, }, - { 0xC0010111, 0xC0010113, CPU_SMM, }, - { 0xC0010114, 0xC0010118, CPU_SVM, }, - { 0xC0010140, 0xC0010141, CPU_OSVM, }, - { 0xC0011022, 0xC0011023, CPU_CONF, }, -}; - -static int is_typeflag_valid(unsigned cpu, unsigned flag) -{ - int i; - - /* Standard Registers should be always valid */ - if (flag >= CPU_TSS) - return 1; - - for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { - if (cpu_reg_range[i].flag == flag) - return 1; - } - - /* Invalid */ - return 0; -} - -static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, - int index, unsigned flag) -{ - if (cpu_reg_range[index].flag == flag) { - *min = cpu_reg_range[index].min; - *max = cpu_reg_range[index].max; - } else - *max = 0; - - return *max; -} - -/* This function can also be called with seq = NULL for printk */ -static void print_cpu_data(struct seq_file *seq, unsigned type, - u32 low, u32 high) -{ - struct cpu_private *priv; - u64 val = high; - - if (seq) { - priv = seq->private; - if (priv->file) { - val = (val << 32) | low; - seq_printf(seq, "0x%llx\n", val); - } else - seq_printf(seq, " %08x: %08x_%08x\n", - type, high, low); - } else - printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low); -} - -/* This function can also be called with seq = NULL for printk */ -static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag) -{ - unsigned msr, msr_min, msr_max; - struct cpu_private *priv; - u32 low, high; - int i; - - if (seq) { - priv = seq->private; - if (priv->file) { - if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg, - &low, &high)) - print_cpu_data(seq, priv->reg, low, high); - return; - } - } - - for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { - if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag)) - continue; - - for (msr = msr_min; msr <= msr_max; msr++) { - if (rdmsr_safe_on_cpu(cpu, msr, &low, &high)) - continue; - print_cpu_data(seq, msr, low, high); - } - } -} - -static void print_tss(void *arg) -{ - struct pt_regs *regs = task_pt_regs(current); - struct seq_file *seq = arg; - unsigned int seg; - - seq_printf(seq, " RAX\t: %016lx\n", regs->ax); - seq_printf(seq, " RBX\t: %016lx\n", regs->bx); - seq_printf(seq, " RCX\t: %016lx\n", regs->cx); - seq_printf(seq, " RDX\t: %016lx\n", regs->dx); - - seq_printf(seq, " RSI\t: %016lx\n", regs->si); - seq_printf(seq, " RDI\t: %016lx\n", regs->di); - seq_printf(seq, " RBP\t: %016lx\n", regs->bp); - seq_printf(seq, " ESP\t: %016lx\n", regs->sp); - -#ifdef CONFIG_X86_64 - seq_printf(seq, " R08\t: %016lx\n", regs->r8); - seq_printf(seq, " R09\t: %016lx\n", regs->r9); - seq_printf(seq, " R10\t: %016lx\n", regs->r10); - seq_printf(seq, " R11\t: %016lx\n", regs->r11); - seq_printf(seq, " R12\t: %016lx\n", regs->r12); - seq_printf(seq, " R13\t: %016lx\n", regs->r13); - seq_printf(seq, " R14\t: %016lx\n", regs->r14); - seq_printf(seq, " R15\t: %016lx\n", regs->r15); -#endif - - asm("movl %%cs,%0" : "=r" (seg)); - seq_printf(seq, " CS\t: %04x\n", seg); - asm("movl %%ds,%0" : "=r" (seg)); - seq_printf(seq, " DS\t: %04x\n", seg); - seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff); - asm("movl %%es,%0" : "=r" (seg)); - seq_printf(seq, " ES\t: %04x\n", seg); - asm("movl %%fs,%0" : "=r" (seg)); - seq_printf(seq, " FS\t: %04x\n", seg); - asm("movl %%gs,%0" : "=r" (seg)); - seq_printf(seq, " GS\t: %04x\n", seg); - - seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags); - - seq_printf(seq, " EIP\t: %016lx\n", regs->ip); -} - -static void print_cr(void *arg) -{ - struct seq_file *seq = arg; - - seq_printf(seq, " cr0\t: %016lx\n", read_cr0()); - seq_printf(seq, " cr2\t: %016lx\n", read_cr2()); - seq_printf(seq, " cr3\t: %016lx\n", read_cr3()); - seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe()); -#ifdef CONFIG_X86_64 - seq_printf(seq, " cr8\t: %016lx\n", read_cr8()); -#endif -} - -static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt) -{ - seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size)); -} - -static void print_dt(void *seq) -{ - struct desc_ptr dt; - unsigned long ldt; - - /* IDT */ - store_idt((struct desc_ptr *)&dt); - print_desc_ptr("IDT", seq, dt); - - /* GDT */ - store_gdt((struct desc_ptr *)&dt); - print_desc_ptr("GDT", seq, dt); - - /* LDT */ - store_ldt(ldt); - seq_printf(seq, " LDT\t: %016lx\n", ldt); - - /* TR */ - store_tr(ldt); - seq_printf(seq, " TR\t: %016lx\n", ldt); -} - -static void print_dr(void *arg) -{ - struct seq_file *seq = arg; - unsigned long dr; - int i; - - for (i = 0; i < 8; i++) { - /* Ignore db4, db5 */ - if ((i == 4) || (i == 5)) - continue; - get_debugreg(dr, i); - seq_printf(seq, " dr%d\t: %016lx\n", i, dr); - } - - seq_printf(seq, "\n MSR\t:\n"); -} - -static void print_apic(void *arg) -{ - struct seq_file *seq = arg; - -#ifdef CONFIG_X86_LOCAL_APIC - seq_printf(seq, " LAPIC\t:\n"); - seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24); - seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR)); - seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI)); - seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI)); - seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI)); - seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR)); - seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR)); - seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV)); - seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR)); - seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR)); - seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR)); - seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2)); - seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT)); - seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR)); - seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC)); - seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0)); - seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1)); - seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR)); - seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT)); - seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT)); - seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR)); - if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { - unsigned int i, v, maxeilvt; - - v = apic_read(APIC_EFEAT); - maxeilvt = (v >> 16) & 0xff; - seq_printf(seq, " EFEAT\t\t: %08x\n", v); - seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL)); - - for (i = 0; i < maxeilvt; i++) { - v = apic_read(APIC_EILVTn(i)); - seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v); - } - } -#endif /* CONFIG_X86_LOCAL_APIC */ - seq_printf(seq, "\n MSR\t:\n"); -} - -static int cpu_seq_show(struct seq_file *seq, void *v) -{ - struct cpu_private *priv = seq->private; - - if (priv == NULL) - return -EINVAL; - - switch (cpu_base[priv->type].flag) { - case CPU_TSS: - smp_call_function_single(priv->cpu, print_tss, seq, 1); - break; - case CPU_CR: - smp_call_function_single(priv->cpu, print_cr, seq, 1); - break; - case CPU_DT: - smp_call_function_single(priv->cpu, print_dt, seq, 1); - break; - case CPU_DEBUG: - if (priv->file == CPU_INDEX_BIT) - smp_call_function_single(priv->cpu, print_dr, seq, 1); - print_msr(seq, priv->cpu, cpu_base[priv->type].flag); - break; - case CPU_APIC: - if (priv->file == CPU_INDEX_BIT) - smp_call_function_single(priv->cpu, print_apic, seq, 1); - print_msr(seq, priv->cpu, cpu_base[priv->type].flag); - break; - - default: - print_msr(seq, priv->cpu, cpu_base[priv->type].flag); - break; - } - seq_printf(seq, "\n"); - - return 0; -} - -static void *cpu_seq_start(struct seq_file *seq, loff_t *pos) -{ - if (*pos == 0) /* One time is enough ;-) */ - return seq; - - return NULL; -} - -static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - (*pos)++; - - return cpu_seq_start(seq, pos); -} - -static void cpu_seq_stop(struct seq_file *seq, void *v) -{ -} - -static const struct seq_operations cpu_seq_ops = { - .start = cpu_seq_start, - .next = cpu_seq_next, - .stop = cpu_seq_stop, - .show = cpu_seq_show, -}; - -static int cpu_seq_open(struct inode *inode, struct file *file) -{ - struct cpu_private *priv = inode->i_private; - struct seq_file *seq; - int err; - - err = seq_open(file, &cpu_seq_ops); - if (!err) { - seq = file->private_data; - seq->private = priv; - } - - return err; -} - -static int write_msr(struct cpu_private *priv, u64 val) -{ - u32 low, high; - - high = (val >> 32) & 0xffffffff; - low = val & 0xffffffff; - - if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high)) - return 0; - - return -EPERM; -} - -static int write_cpu_register(struct cpu_private *priv, const char *buf) -{ - int ret = -EPERM; - u64 val; - - ret = strict_strtoull(buf, 0, &val); - if (ret < 0) - return ret; - - /* Supporting only MSRs */ - if (priv->type < CPU_TSS_BIT) - return write_msr(priv, val); - - return ret; -} - -static ssize_t cpu_write(struct file *file, const char __user *ubuf, - size_t count, loff_t *off) -{ - struct seq_file *seq = file->private_data; - struct cpu_private *priv = seq->private; - char buf[19]; - - if ((priv == NULL) || (count >= sizeof(buf))) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, count)) - return -EFAULT; - - buf[count] = 0; - - if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write)) - if (!write_cpu_register(priv, buf)) - return count; - - return -EACCES; -} - -static const struct file_operations cpu_fops = { - .owner = THIS_MODULE, - .open = cpu_seq_open, - .read = seq_read, - .write = cpu_write, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg, - unsigned file, struct dentry *dentry) -{ - struct cpu_private *priv = NULL; - - /* Already intialized */ - if (file == CPU_INDEX_BIT) - if (per_cpu(cpud_arr[type].init, cpu)) - return 0; - - priv = kzalloc(sizeof(*priv), GFP_KERNEL); - if (priv == NULL) - return -ENOMEM; - - priv->cpu = cpu; - priv->type = type; - priv->reg = reg; - priv->file = file; - mutex_lock(&cpu_debug_lock); - per_cpu(cpud_priv_arr[type], cpu) = priv; - per_cpu(cpud_priv_count, cpu)++; - mutex_unlock(&cpu_debug_lock); - - if (file) - debugfs_create_file(cpu_file[file].name, S_IRUGO, - dentry, (void *)priv, &cpu_fops); - else { - debugfs_create_file(cpu_base[type].name, S_IRUGO, - per_cpu(cpud_arr[type].dentry, cpu), - (void *)priv, &cpu_fops); - mutex_lock(&cpu_debug_lock); - per_cpu(cpud_arr[type].init, cpu) = 1; - mutex_unlock(&cpu_debug_lock); - } - - return 0; -} - -static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg, - struct dentry *dentry) -{ - unsigned file; - int err = 0; - - for (file = 0; file < ARRAY_SIZE(cpu_file); file++) { - err = cpu_create_file(cpu, type, reg, file, dentry); - if (err) - return err; - } - - return err; -} - -static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry) -{ - struct dentry *cpu_dentry = NULL; - unsigned reg, reg_min, reg_max; - int i, err = 0; - char reg_dir[12]; - u32 low, high; - - for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { - if (!get_cpu_range(cpu, ®_min, ®_max, i, - cpu_base[type].flag)) - continue; - - for (reg = reg_min; reg <= reg_max; reg++) { - if (rdmsr_safe_on_cpu(cpu, reg, &low, &high)) - continue; - - sprintf(reg_dir, "0x%x", reg); - cpu_dentry = debugfs_create_dir(reg_dir, dentry); - err = cpu_init_regfiles(cpu, type, reg, cpu_dentry); - if (err) - return err; - } - } - - return err; -} - -static int cpu_init_allreg(unsigned cpu, struct dentry *dentry) -{ - struct dentry *cpu_dentry = NULL; - unsigned type; - int err = 0; - - for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) { - if (!is_typeflag_valid(cpu, cpu_base[type].flag)) - continue; - cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry); - per_cpu(cpud_arr[type].dentry, cpu) = cpu_dentry; - - if (type < CPU_TSS_BIT) - err = cpu_init_msr(cpu, type, cpu_dentry); - else - err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT, - cpu_dentry); - if (err) - return err; - } - - return err; -} - -static int cpu_init_cpu(void) -{ - struct dentry *cpu_dentry = NULL; - struct cpuinfo_x86 *cpui; - char cpu_dir[12]; - unsigned cpu; - int err = 0; - - for (cpu = 0; cpu < nr_cpu_ids; cpu++) { - cpui = &cpu_data(cpu); - if (!cpu_has(cpui, X86_FEATURE_MSR)) - continue; - - sprintf(cpu_dir, "cpu%d", cpu); - cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); - err = cpu_init_allreg(cpu, cpu_dentry); - - pr_info("cpu%d(%d) debug files %d\n", - cpu, nr_cpu_ids, per_cpu(cpud_priv_count, cpu)); - if (per_cpu(cpud_priv_count, cpu) > MAX_CPU_FILES) { - pr_err("Register files count %d exceeds limit %d\n", - per_cpu(cpud_priv_count, cpu), MAX_CPU_FILES); - per_cpu(cpud_priv_count, cpu) = MAX_CPU_FILES; - err = -ENFILE; - } - if (err) - return err; - } - - return err; -} - -static int __init cpu_debug_init(void) -{ - cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir); - - return cpu_init_cpu(); -} - -static void __exit cpu_debug_exit(void) -{ - int i, cpu; - - if (cpu_debugfs_dir) - debugfs_remove_recursive(cpu_debugfs_dir); - - for (cpu = 0; cpu < nr_cpu_ids; cpu++) - for (i = 0; i < per_cpu(cpud_priv_count, cpu); i++) - kfree(per_cpu(cpud_priv_arr[i], cpu)); -} - -module_init(cpu_debug_init); -module_exit(cpu_debug_exit); - -MODULE_AUTHOR("Jaswinder Singh Rajput"); -MODULE_DESCRIPTION("CPU Debug module"); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index f138c6c389b..870e6cc6ad2 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -10,6 +10,20 @@ if CPU_FREQ comment "CPUFreq processor drivers" +config X86_PCC_CPUFREQ + tristate "Processor Clocking Control interface driver" + depends on ACPI && ACPI_PROCESSOR + help + This driver adds support for the PCC interface. + + For details, take a look at: + <file:Documentation/cpu-freq/pcc-cpufreq.txt>. + + To compile this driver as a module, choose M here: the + module will be called pcc-cpufreq. + + If in doubt, say N. + config X86_ACPI_CPUFREQ tristate "ACPI Processor P-States driver" select CPU_FREQ_TABLE diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile index 509296df294..1840c0a5170 100644 --- a/arch/x86/kernel/cpu/cpufreq/Makefile +++ b/arch/x86/kernel/cpu/cpufreq/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o +obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o obj-$(CONFIG_X86_LONGHAUL) += longhaul.o diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 1b1920fa7c8..459168083b7 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,6 +33,7 @@ #include <linux/cpufreq.h> #include <linux/compiler.h> #include <linux/dmi.h> +#include <linux/slab.h> #include <trace/events/power.h> #include <linux/acpi.h> diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c index 006b278b0d5..c587db472a7 100644 --- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c +++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c @@ -20,7 +20,6 @@ #include <linux/module.h> #include <linux/init.h> -#include <linux/slab.h> #include <linux/delay.h> #include <linux/cpufreq.h> diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c index ac27ec2264d..16e3483be9e 100644 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c @@ -80,6 +80,7 @@ #include <linux/cpufreq.h> #include <linux/pci.h> #include <linux/errno.h> +#include <linux/slab.h> #include <asm/processor-cyrix.h> diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index da5f70fcb76..e7b559d74c5 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c @@ -9,7 +9,6 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> -#include <linux/slab.h> #include <linux/cpufreq.h> #include <linux/timex.h> diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 86961519372..7b8a8ba67b0 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -25,7 +25,6 @@ #include <linux/init.h> #include <linux/smp.h> #include <linux/cpufreq.h> -#include <linux/slab.h> #include <linux/cpumask.h> #include <linux/timex.h> diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c new file mode 100644 index 00000000000..ce7cde713e7 --- /dev/null +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -0,0 +1,621 @@ +/* + * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface + * + * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com> + * Copyright (C) 2009 Hewlett-Packard Development Company, L.P. + * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com> + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON + * INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 675 Mass Ave, Cambridge, MA 02139, USA. + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/sched.h> +#include <linux/cpufreq.h> +#include <linux/compiler.h> +#include <linux/slab.h> + +#include <linux/acpi.h> +#include <linux/io.h> +#include <linux/spinlock.h> +#include <linux/uaccess.h> + +#include <acpi/processor.h> + +#define PCC_VERSION "1.00.00" +#define POLL_LOOPS 300 + +#define CMD_COMPLETE 0x1 +#define CMD_GET_FREQ 0x0 +#define CMD_SET_FREQ 0x1 + +#define BUF_SZ 4 + +#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ + "pcc-cpufreq", msg) + +struct pcc_register_resource { + u8 descriptor; + u16 length; + u8 space_id; + u8 bit_width; + u8 bit_offset; + u8 access_size; + u64 address; +} __attribute__ ((packed)); + +struct pcc_memory_resource { + u8 descriptor; + u16 length; + u8 space_id; + u8 resource_usage; + u8 type_specific; + u64 granularity; + u64 minimum; + u64 maximum; + u64 translation_offset; + u64 address_length; +} __attribute__ ((packed)); + +static struct cpufreq_driver pcc_cpufreq_driver; + +struct pcc_header { + u32 signature; + u16 length; + u8 major; + u8 minor; + u32 features; + u16 command; + u16 status; + u32 latency; + u32 minimum_time; + u32 maximum_time; + u32 nominal; + u32 throttled_frequency; + u32 minimum_frequency; +}; + +static void __iomem *pcch_virt_addr; +static struct pcc_header __iomem *pcch_hdr; + +static DEFINE_SPINLOCK(pcc_lock); + +static struct acpi_generic_address doorbell; + +static u64 doorbell_preserve; +static u64 doorbell_write; + +static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f, + 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46}; + +struct pcc_cpu { + u32 input_offset; + u32 output_offset; +}; + +static struct pcc_cpu *pcc_cpu_info; + +static int pcc_cpufreq_verify(struct cpufreq_policy *policy) +{ + cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, + policy->cpuinfo.max_freq); + return 0; +} + +static inline void pcc_cmd(void) +{ + u64 doorbell_value; + int i; + + acpi_read(&doorbell_value, &doorbell); + acpi_write((doorbell_value & doorbell_preserve) | doorbell_write, + &doorbell); + + for (i = 0; i < POLL_LOOPS; i++) { + if (ioread16(&pcch_hdr->status) & CMD_COMPLETE) + break; + } +} + +static inline void pcc_clear_mapping(void) +{ + if (pcch_virt_addr) + iounmap(pcch_virt_addr); + pcch_virt_addr = NULL; +} + +static unsigned int pcc_get_freq(unsigned int cpu) +{ + struct pcc_cpu *pcc_cpu_data; + unsigned int curr_freq; + unsigned int freq_limit; + u16 status; + u32 input_buffer; + u32 output_buffer; + + spin_lock(&pcc_lock); + + dprintk("get: get_freq for CPU %d\n", cpu); + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + input_buffer = 0x1; + iowrite32(input_buffer, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + iowrite16(CMD_GET_FREQ, &pcch_hdr->command); + + pcc_cmd(); + + output_buffer = + ioread32(pcch_virt_addr + pcc_cpu_data->output_offset); + + /* Clear the input buffer - we are done with the current command */ + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + + status = ioread16(&pcch_hdr->status); + if (status != CMD_COMPLETE) { + dprintk("get: FAILED: for CPU %d, status is %d\n", + cpu, status); + goto cmd_incomplete; + } + iowrite16(0, &pcch_hdr->status); + curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff)) + / 100) * 1000); + + dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is " + "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n", + cpu, (pcch_virt_addr + pcc_cpu_data->output_offset), + output_buffer, curr_freq); + + freq_limit = (output_buffer >> 8) & 0xff; + if (freq_limit != 0xff) { + dprintk("get: frequency for cpu %d is being temporarily" + " capped at %d\n", cpu, curr_freq); + } + + spin_unlock(&pcc_lock); + return curr_freq; + +cmd_incomplete: + iowrite16(0, &pcch_hdr->status); + spin_unlock(&pcc_lock); + return -EINVAL; +} + +static int pcc_cpufreq_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + struct pcc_cpu *pcc_cpu_data; + struct cpufreq_freqs freqs; + u16 status; + u32 input_buffer; + int cpu; + + spin_lock(&pcc_lock); + cpu = policy->cpu; + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + dprintk("target: CPU %d should go to target freq: %d " + "(virtual) input_offset is 0x%x\n", + cpu, target_freq, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + + freqs.new = target_freq; + freqs.cpu = cpu; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + input_buffer = 0x1 | (((target_freq * 100) + / (ioread32(&pcch_hdr->nominal) * 1000)) << 8); + iowrite32(input_buffer, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + iowrite16(CMD_SET_FREQ, &pcch_hdr->command); + + pcc_cmd(); + + /* Clear the input buffer - we are done with the current command */ + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + + status = ioread16(&pcch_hdr->status); + if (status != CMD_COMPLETE) { + dprintk("target: FAILED for cpu %d, with status: 0x%x\n", + cpu, status); + goto cmd_incomplete; + } + iowrite16(0, &pcch_hdr->status); + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + dprintk("target: was SUCCESSFUL for cpu %d\n", cpu); + spin_unlock(&pcc_lock); + + return 0; + +cmd_incomplete: + iowrite16(0, &pcch_hdr->status); + spin_unlock(&pcc_lock); + return -EINVAL; +} + +static int pcc_get_offset(int cpu) +{ + acpi_status status; + struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object *pccp, *offset; + struct pcc_cpu *pcc_cpu_data; + struct acpi_processor *pr; + int ret = 0; + + pr = per_cpu(processors, cpu); + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer); + if (ACPI_FAILURE(status)) + return -ENODEV; + + pccp = buffer.pointer; + if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) { + ret = -ENODEV; + goto out_free; + }; + + offset = &(pccp->package.elements[0]); + if (!offset || offset->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto out_free; + } + + pcc_cpu_data->input_offset = offset->integer.value; + + offset = &(pccp->package.elements[1]); + if (!offset || offset->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto out_free; + } + + pcc_cpu_data->output_offset = offset->integer.value; + + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ); + + dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data " + "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n", + cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset); +out_free: + kfree(buffer.pointer); + return ret; +} + +static int __init pcc_cpufreq_do_osc(acpi_handle *handle) +{ + acpi_status status; + struct acpi_object_list input; + struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object in_params[4]; + union acpi_object *out_obj; + u32 capabilities[2]; + u32 errors; + u32 supported; + int ret = 0; + + input.count = 4; + input.pointer = in_params; + input.count = 4; + input.pointer = in_params; + in_params[0].type = ACPI_TYPE_BUFFER; + in_params[0].buffer.length = 16; + in_params[0].buffer.pointer = OSC_UUID; + in_params[1].type = ACPI_TYPE_INTEGER; + in_params[1].integer.value = 1; + in_params[2].type = ACPI_TYPE_INTEGER; + in_params[2].integer.value = 2; + in_params[3].type = ACPI_TYPE_BUFFER; + in_params[3].buffer.length = 8; + in_params[3].buffer.pointer = (u8 *)&capabilities; + + capabilities[0] = OSC_QUERY_ENABLE; + capabilities[1] = 0x1; + + status = acpi_evaluate_object(*handle, "_OSC", &input, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + if (!output.length) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); + if (errors) { + ret = -ENODEV; + goto out_free; + } + + supported = *((u32 *)(out_obj->buffer.pointer + 4)); + if (!(supported & 0x1)) { + ret = -ENODEV; + goto out_free; + } + + kfree(output.pointer); + capabilities[0] = 0x0; + capabilities[1] = 0x1; + + status = acpi_evaluate_object(*handle, "_OSC", &input, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + if (!output.length) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); + if (errors) { + ret = -ENODEV; + goto out_free; + } + + supported = *((u32 *)(out_obj->buffer.pointer + 4)); + if (!(supported & 0x1)) { + ret = -ENODEV; + goto out_free; + } + +out_free: + kfree(output.pointer); + return ret; +} + +static int __init pcc_cpufreq_probe(void) +{ + acpi_status status; + struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; + struct pcc_memory_resource *mem_resource; + struct pcc_register_resource *reg_resource; + union acpi_object *out_obj, *member; + acpi_handle handle, osc_handle; + int ret = 0; + + status = acpi_get_handle(NULL, "\\_SB", &handle); + if (ACPI_FAILURE(status)) + return -ENODEV; + + status = acpi_get_handle(handle, "_OSC", &osc_handle); + if (ACPI_SUCCESS(status)) { + ret = pcc_cpufreq_do_osc(&osc_handle); + if (ret) + dprintk("probe: _OSC evaluation did not succeed\n"); + /* Firmware's use of _OSC is optional */ + ret = 0; + } + + status = acpi_evaluate_object(handle, "PCCH", NULL, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_PACKAGE) { + ret = -ENODEV; + goto out_free; + } + + member = &out_obj->package.elements[0]; + if (member->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + mem_resource = (struct pcc_memory_resource *)member->buffer.pointer; + + dprintk("probe: mem_resource descriptor: 0x%x," + " length: %d, space_id: %d, resource_usage: %d," + " type_specific: %d, granularity: 0x%llx," + " minimum: 0x%llx, maximum: 0x%llx," + " translation_offset: 0x%llx, address_length: 0x%llx\n", + mem_resource->descriptor, mem_resource->length, + mem_resource->space_id, mem_resource->resource_usage, + mem_resource->type_specific, mem_resource->granularity, + mem_resource->minimum, mem_resource->maximum, + mem_resource->translation_offset, + mem_resource->address_length); + + if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) { + ret = -ENODEV; + goto out_free; + } + + pcch_virt_addr = ioremap_nocache(mem_resource->minimum, + mem_resource->address_length); + if (pcch_virt_addr == NULL) { + dprintk("probe: could not map shared mem region\n"); + goto out_free; + } + pcch_hdr = pcch_virt_addr; + + dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr); + dprintk("probe: PCCH header is at physical address: 0x%llx," + " signature: 0x%x, length: %d bytes, major: %d, minor: %d," + " supported features: 0x%x, command field: 0x%x," + " status field: 0x%x, nominal latency: %d us\n", + mem_resource->minimum, ioread32(&pcch_hdr->signature), + ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major), + ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features), + ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status), + ioread32(&pcch_hdr->latency)); + + dprintk("probe: min time between commands: %d us," + " max time between commands: %d us," + " nominal CPU frequency: %d MHz," + " minimum CPU frequency: %d MHz," + " minimum CPU frequency without throttling: %d MHz\n", + ioread32(&pcch_hdr->minimum_time), + ioread32(&pcch_hdr->maximum_time), + ioread32(&pcch_hdr->nominal), + ioread32(&pcch_hdr->throttled_frequency), + ioread32(&pcch_hdr->minimum_frequency)); + + member = &out_obj->package.elements[1]; + if (member->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto pcch_free; + } + + reg_resource = (struct pcc_register_resource *)member->buffer.pointer; + + doorbell.space_id = reg_resource->space_id; + doorbell.bit_width = reg_resource->bit_width; + doorbell.bit_offset = reg_resource->bit_offset; + doorbell.access_width = 64; + doorbell.address = reg_resource->address; + + dprintk("probe: doorbell: space_id is %d, bit_width is %d, " + "bit_offset is %d, access_width is %d, address is 0x%llx\n", + doorbell.space_id, doorbell.bit_width, doorbell.bit_offset, + doorbell.access_width, reg_resource->address); + + member = &out_obj->package.elements[2]; + if (member->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto pcch_free; + } + + doorbell_preserve = member->integer.value; + + member = &out_obj->package.elements[3]; + if (member->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto pcch_free; + } + + doorbell_write = member->integer.value; + + dprintk("probe: doorbell_preserve: 0x%llx," + " doorbell_write: 0x%llx\n", + doorbell_preserve, doorbell_write); + + pcc_cpu_info = alloc_percpu(struct pcc_cpu); + if (!pcc_cpu_info) { + ret = -ENOMEM; + goto pcch_free; + } + + printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency" + " limits: %d MHz, %d MHz\n", PCC_VERSION, + ioread32(&pcch_hdr->minimum_frequency), + ioread32(&pcch_hdr->nominal)); + kfree(output.pointer); + return ret; +pcch_free: + pcc_clear_mapping(); +out_free: + kfree(output.pointer); + return ret; +} + +static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int cpu = policy->cpu; + unsigned int result = 0; + + if (!pcch_virt_addr) { + result = -1; + goto pcch_null; + } + + result = pcc_get_offset(cpu); + if (result) { + dprintk("init: PCCP evaluation failed\n"); + goto free; + } + + policy->max = policy->cpuinfo.max_freq = + ioread32(&pcch_hdr->nominal) * 1000; + policy->min = policy->cpuinfo.min_freq = + ioread32(&pcch_hdr->minimum_frequency) * 1000; + policy->cur = pcc_get_freq(cpu); + + dprintk("init: policy->max is %d, policy->min is %d\n", + policy->max, policy->min); + + return 0; +free: + pcc_clear_mapping(); + free_percpu(pcc_cpu_info); +pcch_null: + return result; +} + +static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy) +{ + return 0; +} + +static struct cpufreq_driver pcc_cpufreq_driver = { + .flags = CPUFREQ_CONST_LOOPS, + .get = pcc_get_freq, + .verify = pcc_cpufreq_verify, + .target = pcc_cpufreq_target, + .init = pcc_cpufreq_cpu_init, + .exit = pcc_cpufreq_cpu_exit, + .name = "pcc-cpufreq", + .owner = THIS_MODULE, +}; + +static int __init pcc_cpufreq_init(void) +{ + int ret; + + if (acpi_disabled) + return 0; + + ret = pcc_cpufreq_probe(); + if (ret) { + dprintk("pcc_cpufreq_init: PCCH evaluation failed\n"); + return ret; + } + + ret = cpufreq_register_driver(&pcc_cpufreq_driver); + + return ret; +} + +static void __exit pcc_cpufreq_exit(void) +{ + cpufreq_unregister_driver(&pcc_cpufreq_driver); + + pcc_clear_mapping(); + + free_percpu(pcc_cpu_info); +} + +MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar"); +MODULE_VERSION(PCC_VERSION); +MODULE_DESCRIPTION("Processor Clocking Control interface driver"); +MODULE_LICENSE("GPL"); + +late_initcall(pcc_cpufreq_init); +module_exit(pcc_cpufreq_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c index cb01dac267d..b3379d6a5c5 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c @@ -13,7 +13,6 @@ #include <linux/init.h> #include <linux/cpufreq.h> #include <linux/ioport.h> -#include <linux/slab.h> #include <linux/timex.h> #include <linux/io.h> diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index f125e5c551c..d360b56e982 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -806,7 +806,7 @@ static int find_psb_table(struct powernow_k8_data *data) static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { - acpi_integer control; + u64 control; if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) return; @@ -824,7 +824,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { struct cpufreq_frequency_table *powernow_table; int ret_val = -ENODEV; - acpi_integer control, status; + u64 control, status; if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { dprintk("register performance failed: bad ACPI data\n"); @@ -948,7 +948,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, u32 fid; u32 vid; u32 freq, index; - acpi_integer status, control; + u64 status, control; if (data->exttype) { status = data->acpi_data.states[i].status; @@ -1356,6 +1356,7 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol) kfree(data->powernow_table); kfree(data); + per_cpu(powernow_data, pol->cpu) = NULL; return 0; } @@ -1375,7 +1376,7 @@ static unsigned int powernowk8_get(unsigned int cpu) int err; if (!data) - return -EINVAL; + return 0; smp_call_function_single(cpu, query_values_on_cpu, &err, true); if (err) diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 8d672ef162c..9b1ff37de46 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -20,6 +20,7 @@ #include <linux/sched.h> /* current */ #include <linux/delay.h> #include <linux/compiler.h> +#include <linux/gfp.h> #include <asm/msr.h> #include <asm/processor.h> diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 2ce8e0b5cc5..561758e9518 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -23,7 +23,6 @@ #include <linux/init.h> #include <linux/cpufreq.h> #include <linux/pci.h> -#include <linux/slab.h> #include <linux/sched.h> #include "speedstep-lib.h" diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c index ad0083abfa2..a94ec6be69f 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c @@ -13,7 +13,6 @@ #include <linux/moduleparam.h> #include <linux/init.h> #include <linux/cpufreq.h> -#include <linux/slab.h> #include <asm/msr.h> #include <asm/tsc.h> diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c index 04d73c114e4..8abd869baab 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c @@ -17,7 +17,6 @@ #include <linux/moduleparam.h> #include <linux/init.h> #include <linux/cpufreq.h> -#include <linux/slab.h> #include <linux/delay.h> #include <linux/io.h> #include <asm/ist.h> diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 879666f4d87..7e1cca13af3 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -70,7 +70,8 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - sched_clock_stable = 1; + if (!check_tsc_unstable()) + sched_clock_stable = 1; } /* diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index c2b722d5a72..b3eeb66c0a5 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -18,6 +18,7 @@ #include <asm/processor.h> #include <linux/smp.h> #include <asm/k8.h> +#include <asm/smp.h> #define LVL_1_INST 1 #define LVL_1_DATA 2 @@ -152,7 +153,8 @@ struct _cpuid4_info { union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; - unsigned long can_disable; + bool can_disable; + unsigned int l3_indices; DECLARE_BITMAP(shared_cpu_map, NR_CPUS); }; @@ -162,7 +164,8 @@ struct _cpuid4_info_regs { union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; - unsigned long can_disable; + bool can_disable; + unsigned int l3_indices; }; unsigned short num_cache_leaves; @@ -292,6 +295,36 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, (ebx->split.ways_of_associativity + 1) - 1; } +struct _cache_attr { + struct attribute attr; + ssize_t (*show)(struct _cpuid4_info *, char *); + ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); +}; + +#ifdef CONFIG_CPU_SUP_AMD +static unsigned int __cpuinit amd_calc_l3_indices(void) +{ + /* + * We're called over smp_call_function_single() and therefore + * are on the correct cpu. + */ + int cpu = smp_processor_id(); + int node = cpu_to_node(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned int sc0, sc1, sc2, sc3; + u32 val = 0; + + pci_read_config_dword(dev, 0x1C4, &val); + + /* calculate subcache sizes */ + sc0 = !(val & BIT(0)); + sc1 = !(val & BIT(4)); + sc2 = !(val & BIT(8)) + !(val & BIT(9)); + sc3 = !(val & BIT(12)) + !(val & BIT(13)); + + return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; +} + static void __cpuinit amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) { @@ -301,12 +334,103 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) if (boot_cpu_data.x86 == 0x11) return; - /* see erratum #382 */ - if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) + /* see errata #382 and #388 */ + if ((boot_cpu_data.x86 == 0x10) && + ((boot_cpu_data.x86_model < 0x8) || + (boot_cpu_data.x86_mask < 0x1))) return; - this_leaf->can_disable = 1; + this_leaf->can_disable = true; + this_leaf->l3_indices = amd_calc_l3_indices(); +} + +static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, + unsigned int index) +{ + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = amd_get_nb_id(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned int reg = 0; + + if (!this_leaf->can_disable) + return -EINVAL; + + if (!dev) + return -EINVAL; + + pci_read_config_dword(dev, 0x1BC + index * 4, ®); + return sprintf(buf, "0x%08x\n", reg); +} + +#define SHOW_CACHE_DISABLE(index) \ +static ssize_t \ +show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ +{ \ + return show_cache_disable(this_leaf, buf, index); \ +} +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) + +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, + const char *buf, size_t count, unsigned int index) +{ + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = amd_get_nb_id(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned long val = 0; + +#define SUBCACHE_MASK (3UL << 20) +#define SUBCACHE_INDEX 0xfff + + if (!this_leaf->can_disable) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!dev) + return -EINVAL; + + if (strict_strtoul(buf, 10, &val) < 0) + return -EINVAL; + + /* do not allow writes outside of allowed bits */ + if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || + ((val & SUBCACHE_INDEX) > this_leaf->l3_indices)) + return -EINVAL; + + val |= BIT(30); + pci_write_config_dword(dev, 0x1BC + index * 4, val); + /* + * We need to WBINVD on a core on the node containing the L3 cache which + * indices we disable therefore a simple wbinvd() is not sufficient. + */ + wbinvd_on_cpu(cpu); + pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31)); + return count; +} + +#define STORE_CACHE_DISABLE(index) \ +static ssize_t \ +store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ + const char *buf, size_t count) \ +{ \ + return store_cache_disable(this_leaf, buf, count, index); \ } +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1) + +static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, + show_cache_disable_0, store_cache_disable_0); +static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, + show_cache_disable_1, store_cache_disable_1); + +#else /* CONFIG_CPU_SUP_AMD */ +static void __cpuinit +amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) +{ +}; +#endif /* CONFIG_CPU_SUP_AMD */ static int __cpuinit cpuid4_cache_lookup_regs(int index, @@ -713,82 +837,6 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) #define to_object(k) container_of(k, struct _index_kobject, kobj) #define to_attr(a) container_of(a, struct _cache_attr, attr) -static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, - unsigned int index) -{ - int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - int node = cpu_to_node(cpu); - struct pci_dev *dev = node_to_k8_nb_misc(node); - unsigned int reg = 0; - - if (!this_leaf->can_disable) - return -EINVAL; - - if (!dev) - return -EINVAL; - - pci_read_config_dword(dev, 0x1BC + index * 4, ®); - return sprintf(buf, "%x\n", reg); -} - -#define SHOW_CACHE_DISABLE(index) \ -static ssize_t \ -show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ -{ \ - return show_cache_disable(this_leaf, buf, index); \ -} -SHOW_CACHE_DISABLE(0) -SHOW_CACHE_DISABLE(1) - -static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, - const char *buf, size_t count, unsigned int index) -{ - int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - int node = cpu_to_node(cpu); - struct pci_dev *dev = node_to_k8_nb_misc(node); - unsigned long val = 0; - unsigned int scrubber = 0; - - if (!this_leaf->can_disable) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!dev) - return -EINVAL; - - if (strict_strtoul(buf, 10, &val) < 0) - return -EINVAL; - - val |= 0xc0000000; - - pci_read_config_dword(dev, 0x58, &scrubber); - scrubber &= ~0x1f000000; - pci_write_config_dword(dev, 0x58, scrubber); - - pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); - wbinvd(); - pci_write_config_dword(dev, 0x1BC + index * 4, val); - return count; -} - -#define STORE_CACHE_DISABLE(index) \ -static ssize_t \ -store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ - const char *buf, size_t count) \ -{ \ - return store_cache_disable(this_leaf, buf, count, index); \ -} -STORE_CACHE_DISABLE(0) -STORE_CACHE_DISABLE(1) - -struct _cache_attr { - struct attribute attr; - ssize_t (*show)(struct _cpuid4_info *, char *); - ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); -}; - #define define_one_ro(_name) \ static struct _cache_attr _name = \ __ATTR(_name, 0444, show_##_name, NULL) @@ -803,23 +851,28 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); -static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, - show_cache_disable_0, store_cache_disable_0); -static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, - show_cache_disable_1, store_cache_disable_1); +#define DEFAULT_SYSFS_CACHE_ATTRS \ + &type.attr, \ + &level.attr, \ + &coherency_line_size.attr, \ + &physical_line_partition.attr, \ + &ways_of_associativity.attr, \ + &number_of_sets.attr, \ + &size.attr, \ + &shared_cpu_map.attr, \ + &shared_cpu_list.attr static struct attribute *default_attrs[] = { - &type.attr, - &level.attr, - &coherency_line_size.attr, - &physical_line_partition.attr, - &ways_of_associativity.attr, - &number_of_sets.attr, - &size.attr, - &shared_cpu_map.attr, - &shared_cpu_list.attr, + DEFAULT_SYSFS_CACHE_ATTRS, + NULL +}; + +static struct attribute *default_l3_attrs[] = { + DEFAULT_SYSFS_CACHE_ATTRS, +#ifdef CONFIG_CPU_SUP_AMD &cache_disable_0.attr, &cache_disable_1.attr, +#endif NULL }; @@ -850,7 +903,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, return ret; } -static struct sysfs_ops sysfs_ops = { +static const struct sysfs_ops sysfs_ops = { .show = show, .store = store, }; @@ -910,6 +963,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) unsigned int cpu = sys_dev->id; unsigned long i, j; struct _index_kobject *this_object; + struct _cpuid4_info *this_leaf; int retval; retval = cpuid4_cache_sysfs_init(cpu); @@ -928,6 +982,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) this_object = INDEX_KOBJECT_PTR(cpu, i); this_object->cpu = cpu; this_object->index = i; + + this_leaf = CPUID4_INFO_IDX(cpu, i); + + if (this_leaf->can_disable) + ktype_cache.default_attrs = default_l3_attrs; + else + ktype_cache.default_attrs = default_attrs; + retval = kobject_init_and_add(&(this_object->kobj), &ktype_cache, per_cpu(ici_cache_kobject, cpu), diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 73734baa50f..e7dbde7bfed 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -22,6 +22,7 @@ #include <linux/kdebug.h> #include <linux/cpu.h> #include <linux/sched.h> +#include <linux/gfp.h> #include <asm/mce.h> #include <asm/apic.h> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a8aacd4b513..7a355ddcc64 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -26,6 +26,7 @@ #include <linux/sched.h> #include <linux/sysfs.h> #include <linux/types.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/kmod.h> #include <linux/poll.h> @@ -46,6 +47,13 @@ #include "mce-internal.h" +static DEFINE_MUTEX(mce_read_mutex); + +#define rcu_dereference_check_mce(p) \ + rcu_dereference_check((p), \ + rcu_read_lock_sched_held() || \ + lockdep_is_held(&mce_read_mutex)) + #define CREATE_TRACE_POINTS #include <trace/events/mce.h> @@ -158,7 +166,7 @@ void mce_log(struct mce *mce) mce->finished = 0; wmb(); for (;;) { - entry = rcu_dereference(mcelog.next); + entry = rcu_dereference_check_mce(mcelog.next); for (;;) { /* * When the buffer fills up discard new entries. @@ -531,7 +539,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) struct mce m; int i; - __get_cpu_var(mce_poll_count)++; + percpu_inc(mce_poll_count); mce_setup(&m); @@ -926,7 +934,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) atomic_inc(&mce_entry); - __get_cpu_var(mce_exception_count)++; + percpu_inc(mce_exception_count); if (notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL) == NOTIFY_STOP) @@ -1485,8 +1493,6 @@ static void collect_tscs(void *data) rdtscll(cpu_tsc[smp_processor_id()]); } -static DEFINE_MUTEX(mce_read_mutex); - static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) { @@ -1500,7 +1506,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, return -ENOMEM; mutex_lock(&mce_read_mutex); - next = rcu_dereference(mcelog.next); + next = rcu_dereference_check_mce(mcelog.next); /* Only supports full reads right now */ if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { @@ -1565,7 +1571,7 @@ timeout: static unsigned int mce_poll(struct file *file, poll_table *wait) { poll_wait(file, &mce_wait, wait); - if (rcu_dereference(mcelog.next)) + if (rcu_dereference_check_mce(mcelog.next)) return POLLIN | POLLRDNORM; return 0; } @@ -2044,6 +2050,7 @@ static __init void mce_init_banks(void) struct mce_bank *b = &mce_banks[i]; struct sysdev_attribute *a = &b->attr; + sysfs_attr_init(&a->attr); a->attr.name = b->attrname; snprintf(b->attrname, ATTR_LEN, "bank%d", i); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 83a3d1f4efc..224392d8fe8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -21,6 +21,7 @@ #include <linux/errno.h> #include <linux/sched.h> #include <linux/sysfs.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/cpu.h> #include <linux/smp.h> @@ -388,7 +389,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, return ret; } -static struct sysfs_ops threshold_ops = { +static const struct sysfs_ops threshold_ops = { .show = show, .store = store, }; diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 7c785634af2..62b48e40920 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -5,6 +5,7 @@ * Author: Andi Kleen */ +#include <linux/gfp.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/percpu.h> @@ -95,7 +96,7 @@ static void cmci_discover(int banks, int boot) /* Already owned by someone else? */ if (val & CMCI_EN) { - if (test_and_clear_bit(i, owned) || boot) + if (test_and_clear_bit(i, owned) && !boot) print_update("SHD", &hdr, i); __clear_bit(i, __get_cpu_var(mce_poll_banks)); continue; @@ -107,7 +108,7 @@ static void cmci_discover(int banks, int boot) /* Did the enable bit stick? -- the bank supports CMCI */ if (val & CMCI_EN) { - if (!test_and_set_bit(i, owned) || boot) + if (!test_and_set_bit(i, owned) && !boot) print_update("CMCI", &hdr, i); __clear_bit(i, __get_cpu_var(mce_poll_banks)); } else { diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile index f4361b56f8e..ad9e5ed8118 100644 --- a/arch/x86/kernel/cpu/mtrr/Makefile +++ b/arch/x86/kernel/cpu/mtrr/Makefile @@ -1,3 +1,3 @@ -obj-y := main.o if.o generic.o state.o cleanup.o +obj-y := main.o if.o generic.o cleanup.o obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c index 33af14110df..92ba9cd31c9 100644 --- a/arch/x86/kernel/cpu/mtrr/amd.c +++ b/arch/x86/kernel/cpu/mtrr/amd.c @@ -108,7 +108,7 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) return 0; } -static struct mtrr_ops amd_mtrr_ops = { +static const struct mtrr_ops amd_mtrr_ops = { .vendor = X86_VENDOR_AMD, .set = amd_set_mtrr, .get = amd_get_mtrr, diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c index de89f14eff3..316fe3e60a9 100644 --- a/arch/x86/kernel/cpu/mtrr/centaur.c +++ b/arch/x86/kernel/cpu/mtrr/centaur.c @@ -110,7 +110,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t return 0; } -static struct mtrr_ops centaur_mtrr_ops = { +static const struct mtrr_ops centaur_mtrr_ops = { .vendor = X86_VENDOR_CENTAUR, .set = centaur_set_mcr, .get = centaur_get_mcr, diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 09b1698e046..06130b52f01 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -22,10 +22,10 @@ #include <linux/pci.h> #include <linux/smp.h> #include <linux/cpu.h> -#include <linux/sort.h> #include <linux/mutex.h> #include <linux/uaccess.h> #include <linux/kvm_para.h> +#include <linux/range.h> #include <asm/processor.h> #include <asm/e820.h> @@ -34,11 +34,6 @@ #include "mtrr.h" -struct res_range { - unsigned long start; - unsigned long end; -}; - struct var_mtrr_range_state { unsigned long base_pfn; unsigned long size_pfn; @@ -56,7 +51,7 @@ struct var_mtrr_state { /* Should be related to MTRR_VAR_RANGES nums */ #define RANGE_NUM 256 -static struct res_range __initdata range[RANGE_NUM]; +static struct range __initdata range[RANGE_NUM]; static int __initdata nr_range; static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; @@ -64,152 +59,11 @@ static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; static int __initdata debug_print; #define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) - -static int __init -add_range(struct res_range *range, int nr_range, - unsigned long start, unsigned long end) -{ - /* Out of slots: */ - if (nr_range >= RANGE_NUM) - return nr_range; - - range[nr_range].start = start; - range[nr_range].end = end; - - nr_range++; - - return nr_range; -} - -static int __init -add_range_with_merge(struct res_range *range, int nr_range, - unsigned long start, unsigned long end) -{ - int i; - - /* Try to merge it with old one: */ - for (i = 0; i < nr_range; i++) { - unsigned long final_start, final_end; - unsigned long common_start, common_end; - - if (!range[i].end) - continue; - - common_start = max(range[i].start, start); - common_end = min(range[i].end, end); - if (common_start > common_end + 1) - continue; - - final_start = min(range[i].start, start); - final_end = max(range[i].end, end); - - range[i].start = final_start; - range[i].end = final_end; - return nr_range; - } - - /* Need to add it: */ - return add_range(range, nr_range, start, end); -} - -static void __init -subtract_range(struct res_range *range, unsigned long start, unsigned long end) -{ - int i, j; - - for (j = 0; j < RANGE_NUM; j++) { - if (!range[j].end) - continue; - - if (start <= range[j].start && end >= range[j].end) { - range[j].start = 0; - range[j].end = 0; - continue; - } - - if (start <= range[j].start && end < range[j].end && - range[j].start < end + 1) { - range[j].start = end + 1; - continue; - } - - - if (start > range[j].start && end >= range[j].end && - range[j].end > start - 1) { - range[j].end = start - 1; - continue; - } - - if (start > range[j].start && end < range[j].end) { - /* Find the new spare: */ - for (i = 0; i < RANGE_NUM; i++) { - if (range[i].end == 0) - break; - } - if (i < RANGE_NUM) { - range[i].end = range[j].end; - range[i].start = end + 1; - } else { - printk(KERN_ERR "run of slot in ranges\n"); - } - range[j].end = start - 1; - continue; - } - } -} - -static int __init cmp_range(const void *x1, const void *x2) -{ - const struct res_range *r1 = x1; - const struct res_range *r2 = x2; - long start1, start2; - - start1 = r1->start; - start2 = r2->start; - - return start1 - start2; -} - -static int __init clean_sort_range(struct res_range *range, int az) -{ - int i, j, k = az - 1, nr_range = 0; - - for (i = 0; i < k; i++) { - if (range[i].end) - continue; - for (j = k; j > i; j--) { - if (range[j].end) { - k = j; - break; - } - } - if (j == i) - break; - range[i].start = range[k].start; - range[i].end = range[k].end; - range[k].start = 0; - range[k].end = 0; - k--; - } - /* count it */ - for (i = 0; i < az; i++) { - if (!range[i].end) { - nr_range = i; - break; - } - } - - /* sort them */ - sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); - - return nr_range; -} - #define BIOS_BUG_MSG KERN_WARNING \ "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" static int __init -x86_get_mtrr_mem_range(struct res_range *range, int nr_range, +x86_get_mtrr_mem_range(struct range *range, int nr_range, unsigned long extra_remove_base, unsigned long extra_remove_size) { @@ -223,14 +77,14 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, continue; base = range_state[i].base_pfn; size = range_state[i].size_pfn; - nr_range = add_range_with_merge(range, nr_range, base, - base + size - 1); + nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, + base, base + size); } if (debug_print) { printk(KERN_DEBUG "After WB checking\n"); for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", - range[i].start, range[i].end + 1); + printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", + range[i].start, range[i].end); } /* Take out UC ranges: */ @@ -252,19 +106,19 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, size -= (1<<(20-PAGE_SHIFT)) - base; base = 1<<(20-PAGE_SHIFT); } - subtract_range(range, base, base + size - 1); + subtract_range(range, RANGE_NUM, base, base + size); } if (extra_remove_size) - subtract_range(range, extra_remove_base, - extra_remove_base + extra_remove_size - 1); + subtract_range(range, RANGE_NUM, extra_remove_base, + extra_remove_base + extra_remove_size); if (debug_print) { printk(KERN_DEBUG "After UC checking\n"); for (i = 0; i < RANGE_NUM; i++) { if (!range[i].end) continue; - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", - range[i].start, range[i].end + 1); + printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", + range[i].start, range[i].end); } } @@ -273,26 +127,22 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, if (debug_print) { printk(KERN_DEBUG "After sorting\n"); for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", - range[i].start, range[i].end + 1); + printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", + range[i].start, range[i].end); } - /* clear those is not used */ - for (i = nr_range; i < RANGE_NUM; i++) - memset(&range[i], 0, sizeof(range[i])); - return nr_range; } #ifdef CONFIG_MTRR_SANITIZER -static unsigned long __init sum_ranges(struct res_range *range, int nr_range) +static unsigned long __init sum_ranges(struct range *range, int nr_range) { unsigned long sum = 0; int i; for (i = 0; i < nr_range; i++) - sum += range[i].end + 1 - range[i].start; + sum += range[i].end - range[i].start; return sum; } @@ -621,7 +471,7 @@ static int __init parse_mtrr_spare_reg(char *arg) early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); static int __init -x86_setup_var_mtrrs(struct res_range *range, int nr_range, +x86_setup_var_mtrrs(struct range *range, int nr_range, u64 chunk_size, u64 gran_size) { struct var_mtrr_state var_state; @@ -639,7 +489,7 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range, /* Write the range: */ for (i = 0; i < nr_range; i++) { set_var_mtrr_range(&var_state, range[i].start, - range[i].end - range[i].start + 1); + range[i].end - range[i].start); } /* Write the last range: */ @@ -742,7 +592,7 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size, unsigned long x_remove_base, unsigned long x_remove_size, int i) { - static struct res_range range_new[RANGE_NUM]; + static struct range range_new[RANGE_NUM]; unsigned long range_sums_new; static int nr_range_new; int num_reg; @@ -869,10 +719,10 @@ int __init mtrr_cleanup(unsigned address_bits) * [0, 1M) should always be covered by var mtrr with WB * and fixed mtrrs should take effect before var mtrr for it: */ - nr_range = add_range_with_merge(range, nr_range, 0, - (1ULL<<(20 - PAGE_SHIFT)) - 1); + nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0, + 1ULL<<(20 - PAGE_SHIFT)); /* Sort the ranges: */ - sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + sort_range(range, nr_range); range_sums = sum_ranges(range, nr_range); printk(KERN_INFO "total RAM covered: %ldM\n", @@ -1089,9 +939,9 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) nr_range = 0; if (mtrr_tom2) { range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); - range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1; - if (highest_pfn < range[nr_range].end + 1) - highest_pfn = range[nr_range].end + 1; + range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT; + if (highest_pfn < range[nr_range].end) + highest_pfn = range[nr_range].end; nr_range++; } nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); @@ -1103,15 +953,15 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) /* Check the holes: */ for (i = 0; i < nr_range - 1; i++) { - if (range[i].end + 1 < range[i+1].start) - total_trim_size += real_trim_memory(range[i].end + 1, + if (range[i].end < range[i+1].start) + total_trim_size += real_trim_memory(range[i].end, range[i+1].start); } /* Check the top: */ i = nr_range - 1; - if (range[i].end + 1 < end_pfn) - total_trim_size += real_trim_memory(range[i].end + 1, + if (range[i].end < end_pfn) + total_trim_size += real_trim_memory(range[i].end, end_pfn); if (total_trim_size) { diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 228d982ce09..68a3343e579 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -265,7 +265,7 @@ static void cyrix_set_all(void) post_set(); } -static struct mtrr_ops cyrix_mtrr_ops = { +static const struct mtrr_ops cyrix_mtrr_ops = { .vendor = X86_VENDOR_CYRIX, .set_all = cyrix_set_all, .set = cyrix_set_arr, diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 55da0c5f68d..fd31a441c61 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -6,7 +6,6 @@ #include <linux/module.h> #include <linux/init.h> -#include <linux/slab.h> #include <linux/io.h> #include <linux/mm.h> @@ -464,7 +463,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, tmp |= ~((1<<(hi - 1)) - 1); if (tmp != mask_lo) { - WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); + printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); mask_lo = tmp; } } @@ -570,7 +569,7 @@ static unsigned long set_mtrr_state(void) static unsigned long cr4; -static DEFINE_SPINLOCK(set_atomicity_lock); +static DEFINE_RAW_SPINLOCK(set_atomicity_lock); /* * Since we are disabling the cache don't allow any interrupts, @@ -590,7 +589,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) * changes to the way the kernel boots */ - spin_lock(&set_atomicity_lock); + raw_spin_lock(&set_atomicity_lock); /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ cr0 = read_cr0() | X86_CR0_CD; @@ -627,7 +626,7 @@ static void post_set(void) __releases(set_atomicity_lock) /* Restore value of CR4 */ if (cpu_has_pge) write_cr4(cr4); - spin_unlock(&set_atomicity_lock); + raw_spin_unlock(&set_atomicity_lock); } static void generic_set_all(void) @@ -752,7 +751,7 @@ int positive_have_wrcomb(void) /* * Generic structure... */ -struct mtrr_ops generic_mtrr_ops = { +const struct mtrr_ops generic_mtrr_ops = { .use_intel_if = 1, .set_all = generic_set_all, .get = generic_get_mtrr, diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index e006e56f699..79289632cb2 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -5,6 +5,7 @@ #include <linux/module.h> #include <linux/ctype.h> #include <linux/string.h> +#include <linux/slab.h> #include <linux/init.h> #define LINE_SIZE 80 diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 84e83de5457..79556bd9b60 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -60,14 +60,14 @@ static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; static bool mtrr_aps_delayed_init; -static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; +static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; -struct mtrr_ops *mtrr_if; +const struct mtrr_ops *mtrr_if; static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); -void set_mtrr_ops(struct mtrr_ops *ops) +void set_mtrr_ops(const struct mtrr_ops *ops) { if (ops->vendor && ops->vendor < X86_VENDOR_NUM) mtrr_ops[ops->vendor] = ops; @@ -145,6 +145,7 @@ struct set_mtrr_data { /** * ipi_handler - Synchronisation handler. Executed by "other" CPUs. + * @info: pointer to mtrr configuration data * * Returns nothing. */ diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index a501dee9a87..df5e41f31a2 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -32,7 +32,7 @@ extern int generic_get_free_region(unsigned long base, unsigned long size, extern int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type); -extern struct mtrr_ops generic_mtrr_ops; +extern const struct mtrr_ops generic_mtrr_ops; extern int positive_have_wrcomb(void); @@ -53,10 +53,10 @@ void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); void get_mtrr_state(void); -extern void set_mtrr_ops(struct mtrr_ops *ops); +extern void set_mtrr_ops(const struct mtrr_ops *ops); extern u64 size_or_mask, size_and_mask; -extern struct mtrr_ops *mtrr_if; +extern const struct mtrr_ops *mtrr_if; #define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) #define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c deleted file mode 100644 index dfc80b4e6b0..00000000000 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ /dev/null @@ -1,94 +0,0 @@ -#include <linux/init.h> -#include <linux/io.h> -#include <linux/mm.h> - -#include <asm/processor-cyrix.h> -#include <asm/processor-flags.h> -#include <asm/mtrr.h> -#include <asm/msr.h> - -#include "mtrr.h" - -/* Put the processor into a state where MTRRs can be safely set */ -void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) -{ - unsigned int cr0; - - /* Disable interrupts locally */ - local_irq_save(ctxt->flags); - - if (use_intel() || is_cpu(CYRIX)) { - - /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if (cpu_has_pge) { - ctxt->cr4val = read_cr4(); - write_cr4(ctxt->cr4val & ~X86_CR4_PGE); - } - - /* - * Disable and flush caches. Note that wbinvd flushes the TLBs - * as a side-effect - */ - cr0 = read_cr0() | X86_CR0_CD; - wbinvd(); - write_cr0(cr0); - wbinvd(); - - if (use_intel()) { - /* Save MTRR state */ - rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); - } else { - /* - * Cyrix ARRs - - * everything else were excluded at the top - */ - ctxt->ccr3 = getCx86(CX86_CCR3); - } - } -} - -void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) -{ - if (use_intel()) { - /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, - ctxt->deftype_hi); - } else { - if (is_cpu(CYRIX)) { - /* Cyrix ARRs - everything else were excluded at the top */ - setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10); - } - } -} - -/* Restore the processor after a set_mtrr_prepare */ -void set_mtrr_done(struct set_mtrr_context *ctxt) -{ - if (use_intel() || is_cpu(CYRIX)) { - - /* Flush caches and TLBs */ - wbinvd(); - - /* Restore MTRRdefType */ - if (use_intel()) { - /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, - ctxt->deftype_hi); - } else { - /* - * Cyrix ARRs - - * everything else was excluded at the top - */ - setCx86(CX86_CCR3, ctxt->ccr3); - } - - /* Enable caches */ - write_cr0(read_cr0() & 0xbfffffff); - - /* Restore value of CR4 */ - if (cpu_has_pge) - write_cr4(ctxt->cr4val); - } - /* Re-enable interrupts locally (if enabled previously) */ - local_irq_restore(ctxt->flags); -} diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c223b7e895d..db5bdc8addf 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -7,6 +7,7 @@ * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> + * Copyright (C) 2009 Google, Inc., Stephane Eranian * * For licencing details see kernel-base/COPYING */ @@ -20,12 +21,15 @@ #include <linux/kdebug.h> #include <linux/sched.h> #include <linux/uaccess.h> +#include <linux/slab.h> #include <linux/highmem.h> #include <linux/cpu.h> +#include <linux/bitops.h> #include <asm/apic.h> #include <asm/stacktrace.h> #include <asm/nmi.h> +#include <asm/compat.h> static u64 perf_event_mask __read_mostly; @@ -68,26 +72,59 @@ struct debug_store { u64 pebs_event_reset[MAX_PEBS_EVENTS]; }; +struct event_constraint { + union { + unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + u64 idxmsk64; + }; + u64 code; + u64 cmask; + int weight; +}; + +struct amd_nb { + int nb_id; /* NorthBridge id */ + int refcnt; /* reference count */ + struct perf_event *owners[X86_PMC_IDX_MAX]; + struct event_constraint event_constraints[X86_PMC_IDX_MAX]; +}; + struct cpu_hw_events { - struct perf_event *events[X86_PMC_IDX_MAX]; - unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; int enabled; struct debug_store *ds; -}; -struct event_constraint { - unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - int code; + int n_events; + int n_added; + int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ + u64 tags[X86_PMC_IDX_MAX]; + struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ + struct amd_nb *amd_nb; }; -#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) } -#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 } +#define __EVENT_CONSTRAINT(c, n, m, w) {\ + { .idxmsk64 = (n) }, \ + .code = (c), \ + .cmask = (m), \ + .weight = (w), \ +} + +#define EVENT_CONSTRAINT(c, n, m) \ + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) -#define for_each_event_constraint(e, c) \ - for ((e) = (c); (e)->idxmsk[0]; (e)++) +#define INTEL_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK) +#define FIXED_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK) + +#define EVENT_CONSTRAINT_END \ + EVENT_CONSTRAINT(0, 0, 0) + +#define for_each_event_constraint(e, c) \ + for ((e) = (c); (e)->cmask; (e)++) /* * struct x86_pmu - generic x86 pmu @@ -98,8 +135,8 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *); void (*disable_all)(void); void (*enable_all)(void); - void (*enable)(struct hw_perf_event *, int); - void (*disable)(struct hw_perf_event *, int); + void (*enable)(struct perf_event *); + void (*disable)(struct perf_event *); unsigned eventsel; unsigned perfctr; u64 (*event_map)(int); @@ -114,121 +151,28 @@ struct x86_pmu { u64 intel_ctrl; void (*enable_bts)(u64 config); void (*disable_bts)(void); - int (*get_event_idx)(struct cpu_hw_events *cpuc, - struct hw_perf_event *hwc); -}; -static struct x86_pmu x86_pmu __read_mostly; + struct event_constraint * + (*get_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event); -static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { - .enabled = 1, -}; + void (*put_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event); + struct event_constraint *event_constraints; -static const struct event_constraint *event_constraints; - -/* - * Not sure about some of these - */ -static const u64 p6_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, -}; - -static u64 p6_pmu_event_map(int hw_event) -{ - return p6_perfmon_event_map[hw_event]; -} - -/* - * Event setting that is specified not to count anything. - * We use this to effectively disable a counter. - * - * L2_RQSTS with 0 MESI unit mask. - */ -#define P6_NOP_EVENT 0x0000002EULL - -static u64 p6_pmu_raw_event(u64 hw_event) -{ -#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL -#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL -#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL -#define P6_EVNTSEL_INV_MASK 0x00800000ULL -#define P6_EVNTSEL_REG_MASK 0xFF000000ULL - -#define P6_EVNTSEL_MASK \ - (P6_EVNTSEL_EVENT_MASK | \ - P6_EVNTSEL_UNIT_MASK | \ - P6_EVNTSEL_EDGE_MASK | \ - P6_EVNTSEL_INV_MASK | \ - P6_EVNTSEL_REG_MASK) - - return hw_event & P6_EVNTSEL_MASK; -} - -static const struct event_constraint intel_p6_event_constraints[] = -{ - EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ - EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ - EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ - EVENT_CONSTRAINT_END -}; - -/* - * Intel PerfMon v3. Used on Core2 and later. - */ -static const u64 intel_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, + int (*cpu_prepare)(int cpu); + void (*cpu_starting)(int cpu); + void (*cpu_dying)(int cpu); + void (*cpu_dead)(int cpu); }; -static const struct event_constraint intel_core_event_constraints[] = -{ - EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ - EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ - EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ - EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ - EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ - EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ - EVENT_CONSTRAINT_END -}; +static struct x86_pmu x86_pmu __read_mostly; -static const struct event_constraint intel_nehalem_event_constraints[] = -{ - EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ - EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ - EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ - EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ - EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ - EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */ - EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ - EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */ - EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */ - EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */ - EVENT_CONSTRAINT_END +static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { + .enabled = 1, }; -static u64 intel_pmu_event_map(int hw_event) -{ - return intel_perfmon_event_map[hw_event]; -} +static int x86_perf_event_set_period(struct perf_event *event); /* * Generalized hw caching related hw_event table, filled @@ -245,435 +189,18 @@ static u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; -static __initconst u64 nehalem_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ - [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ - [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ - [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static __initconst u64 core2_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static __initconst u64 atom_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static u64 intel_pmu_raw_event(u64 hw_event) -{ -#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL -#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL -#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL -#define CORE_EVNTSEL_INV_MASK 0x00800000ULL -#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL - -#define CORE_EVNTSEL_MASK \ - (CORE_EVNTSEL_EVENT_MASK | \ - CORE_EVNTSEL_UNIT_MASK | \ - CORE_EVNTSEL_EDGE_MASK | \ - CORE_EVNTSEL_INV_MASK | \ - CORE_EVNTSEL_REG_MASK) - - return hw_event & CORE_EVNTSEL_MASK; -} - -static __initconst u64 amd_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ - [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ - [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ - [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ - [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ - [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -/* - * AMD Performance Monitor K7 and later. - */ -static const u64 amd_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, -}; - -static u64 amd_pmu_event_map(int hw_event) -{ - return amd_perfmon_event_map[hw_event]; -} - -static u64 amd_pmu_raw_event(u64 hw_event) -{ -#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL -#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL -#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL -#define K7_EVNTSEL_INV_MASK 0x000800000ULL -#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL - -#define K7_EVNTSEL_MASK \ - (K7_EVNTSEL_EVENT_MASK | \ - K7_EVNTSEL_UNIT_MASK | \ - K7_EVNTSEL_EDGE_MASK | \ - K7_EVNTSEL_INV_MASK | \ - K7_EVNTSEL_REG_MASK) - - return hw_event & K7_EVNTSEL_MASK; -} - /* * Propagate event elapsed time into the generic event. * Can only be executed on the CPU where the event is active. * Returns the delta events processed. */ static u64 -x86_perf_event_update(struct perf_event *event, - struct hw_perf_event *hwc, int idx) +x86_perf_event_update(struct perf_event *event) { + struct hw_perf_event *hwc = &event->hw; int shift = 64 - x86_pmu.event_bits; u64 prev_raw_count, new_raw_count; + int idx = hwc->idx; s64 delta; if (idx == X86_PMC_IDX_FIXED_BTS) @@ -773,7 +300,7 @@ static inline bool bts_available(void) return x86_pmu.enable_bts != NULL; } -static inline void init_debug_store_on_cpu(int cpu) +static void init_debug_store_on_cpu(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -785,7 +312,7 @@ static inline void init_debug_store_on_cpu(int cpu) (u32)((u64)(unsigned long)ds >> 32)); } -static inline void fini_debug_store_on_cpu(int cpu) +static void fini_debug_store_on_cpu(int cpu) { if (!per_cpu(cpu_hw_events, cpu).ds) return; @@ -914,42 +441,6 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) return 0; } -static void intel_pmu_enable_bts(u64 config) -{ - unsigned long debugctlmsr; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr |= X86_DEBUGCTL_TR; - debugctlmsr |= X86_DEBUGCTL_BTS; - debugctlmsr |= X86_DEBUGCTL_BTINT; - - if (!(config & ARCH_PERFMON_EVENTSEL_OS)) - debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; - - if (!(config & ARCH_PERFMON_EVENTSEL_USR)) - debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; - - update_debugctlmsr(debugctlmsr); -} - -static void intel_pmu_disable_bts(void) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - unsigned long debugctlmsr; - - if (!cpuc->ds) - return; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr &= - ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | - X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); - - update_debugctlmsr(debugctlmsr); -} - /* * Setup the hardware configuration for a given attr_type */ @@ -988,6 +479,8 @@ static int __hw_perf_event_init(struct perf_event *event) hwc->config = ARCH_PERFMON_EVENTSEL_INT; hwc->idx = -1; + hwc->last_cpu = -1; + hwc->last_tag = ~0ULL; /* * Count user and OS events unless requested not to. @@ -1017,6 +510,9 @@ static int __hw_perf_event_init(struct perf_event *event) */ if (attr->type == PERF_TYPE_RAW) { hwc->config |= x86_pmu.raw_event(attr->config); + if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) && + perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return -EACCES; return 0; } @@ -1056,216 +552,314 @@ static int __hw_perf_event_init(struct perf_event *event) return 0; } -static void p6_pmu_disable_all(void) +static void x86_pmu_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - u64 val; - - if (!cpuc->enabled) - return; + int idx; - cpuc->enabled = 0; - barrier(); + for (idx = 0; idx < x86_pmu.num_events; idx++) { + u64 val; - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); + if (!test_bit(idx, cpuc->active_mask)) + continue; + rdmsrl(x86_pmu.eventsel + idx, val); + if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) + continue; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(x86_pmu.eventsel + idx, val); + } } -static void intel_pmu_disable_all(void) +void hw_perf_disable(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + if (!x86_pmu_initialized()) + return; + if (!cpuc->enabled) return; + cpuc->n_added = 0; cpuc->enabled = 0; barrier(); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) - intel_pmu_disable_bts(); + x86_pmu.disable_all(); } -static void amd_pmu_disable_all(void) +static void x86_pmu_enable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; - if (!cpuc->enabled) - return; - - cpuc->enabled = 0; - /* - * ensure we write the disable before we start disabling the - * events proper, so that amd_pmu_enable_event() does the - * right thing. - */ - barrier(); - for (idx = 0; idx < x86_pmu.num_events; idx++) { + struct perf_event *event = cpuc->events[idx]; u64 val; if (!test_bit(idx, cpuc->active_mask)) continue; - rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) - continue; - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + + val = event->hw.config; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(x86_pmu.eventsel + idx, val); } } -void hw_perf_disable(void) +static const struct pmu pmu; + +static inline int is_x86_event(struct perf_event *event) { - if (!x86_pmu_initialized()) - return; - return x86_pmu.disable_all(); + return event->pmu == &pmu; } -static void p6_pmu_enable_all(void) +static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - unsigned long val; + struct event_constraint *c, *constraints[X86_PMC_IDX_MAX]; + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + int i, j, w, wmax, num = 0; + struct hw_perf_event *hwc; - if (cpuc->enabled) - return; + bitmap_zero(used_mask, X86_PMC_IDX_MAX); - cpuc->enabled = 1; - barrier(); + for (i = 0; i < n; i++) { + c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); + constraints[i] = c; + } - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); -} + /* + * fastpath, try to reuse previous register + */ + for (i = 0; i < n; i++) { + hwc = &cpuc->event_list[i]->hw; + c = constraints[i]; -static void intel_pmu_enable_all(void) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + /* never assigned */ + if (hwc->idx == -1) + break; - if (cpuc->enabled) - return; + /* constraint still honored */ + if (!test_bit(hwc->idx, c->idxmsk)) + break; - cpuc->enabled = 1; - barrier(); + /* not already used */ + if (test_bit(hwc->idx, used_mask)) + break; - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + __set_bit(hwc->idx, used_mask); + if (assign) + assign[i] = hwc->idx; + } + if (i == n) + goto done; - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { - struct perf_event *event = - cpuc->events[X86_PMC_IDX_FIXED_BTS]; + /* + * begin slow path + */ - if (WARN_ON_ONCE(!event)) - return; + bitmap_zero(used_mask, X86_PMC_IDX_MAX); - intel_pmu_enable_bts(event->hw.config); - } -} + /* + * weight = number of possible counters + * + * 1 = most constrained, only works on one counter + * wmax = least constrained, works on any counter + * + * assign events to counters starting with most + * constrained events. + */ + wmax = x86_pmu.num_events; -static void amd_pmu_enable_all(void) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - int idx; + /* + * when fixed event counters are present, + * wmax is incremented by 1 to account + * for one more choice + */ + if (x86_pmu.num_events_fixed) + wmax++; - if (cpuc->enabled) - return; + for (w = 1, num = n; num && w <= wmax; w++) { + /* for each event */ + for (i = 0; num && i < n; i++) { + c = constraints[i]; + hwc = &cpuc->event_list[i]->hw; - cpuc->enabled = 1; - barrier(); + if (c->weight != w) + continue; - for (idx = 0; idx < x86_pmu.num_events; idx++) { - struct perf_event *event = cpuc->events[idx]; - u64 val; + for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { + if (!test_bit(j, used_mask)) + break; + } - if (!test_bit(idx, cpuc->active_mask)) - continue; + if (j == X86_PMC_IDX_MAX) + break; - val = event->hw.config; - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + __set_bit(j, used_mask); + + if (assign) + assign[i] = j; + num--; + } } +done: + /* + * scheduling failed or is just a simulation, + * free resources if necessary + */ + if (!assign || num) { + for (i = 0; i < n; i++) { + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]); + } + } + return num ? -ENOSPC : 0; } -void hw_perf_enable(void) +/* + * dogrp: true if must collect siblings events (group) + * returns total number of events and error code + */ +static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) { - if (!x86_pmu_initialized()) - return; - x86_pmu.enable_all(); -} + struct perf_event *event; + int n, max_count; -static inline u64 intel_pmu_get_status(void) -{ - u64 status; + max_count = x86_pmu.num_events + x86_pmu.num_events_fixed; - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + /* current number of events already accepted */ + n = cpuc->n_events; - return status; -} + if (is_x86_event(leader)) { + if (n >= max_count) + return -ENOSPC; + cpuc->event_list[n] = leader; + n++; + } + if (!dogrp) + return n; -static inline void intel_pmu_ack_status(u64 ack) -{ - wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); -} + list_for_each_entry(event, &leader->sibling_list, group_entry) { + if (!is_x86_event(event) || + event->state <= PERF_EVENT_STATE_OFF) + continue; -static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) -{ - (void)checking_wrmsrl(hwc->config_base + idx, - hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); -} + if (n >= max_count) + return -ENOSPC; -static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) -{ - (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); + cpuc->event_list[n] = event; + n++; + } + return n; } -static inline void -intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) +static inline void x86_assign_hw_event(struct perf_event *event, + struct cpu_hw_events *cpuc, int i) { - int idx = __idx - X86_PMC_IDX_FIXED; - u64 ctrl_val, mask; + struct hw_perf_event *hwc = &event->hw; - mask = 0xfULL << (idx * 4); + hwc->idx = cpuc->assign[i]; + hwc->last_cpu = smp_processor_id(); + hwc->last_tag = ++cpuc->tags[i]; - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - (void)checking_wrmsrl(hwc->config_base, ctrl_val); + if (hwc->idx == X86_PMC_IDX_FIXED_BTS) { + hwc->config_base = 0; + hwc->event_base = 0; + } else if (hwc->idx >= X86_PMC_IDX_FIXED) { + hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + /* + * We set it so that event_base + idx in wrmsr/rdmsr maps to + * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: + */ + hwc->event_base = + MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; + } else { + hwc->config_base = x86_pmu.eventsel; + hwc->event_base = x86_pmu.perfctr; + } } -static inline void -p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) +static inline int match_prev_assignment(struct hw_perf_event *hwc, + struct cpu_hw_events *cpuc, + int i) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - u64 val = P6_NOP_EVENT; - - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - - (void)checking_wrmsrl(hwc->config_base + idx, val); + return hwc->idx == cpuc->assign[i] && + hwc->last_cpu == smp_processor_id() && + hwc->last_tag == cpuc->tags[i]; } -static inline void -intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) +static int x86_pmu_start(struct perf_event *event); +static void x86_pmu_stop(struct perf_event *event); + +void hw_perf_enable(void) { - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - intel_pmu_disable_bts(); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct perf_event *event; + struct hw_perf_event *hwc; + int i; + + if (!x86_pmu_initialized()) return; - } - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_disable_fixed(hwc, idx); + if (cpuc->enabled) return; + + if (cpuc->n_added) { + int n_running = cpuc->n_events - cpuc->n_added; + /* + * apply assignment obtained either from + * hw_perf_group_sched_in() or x86_pmu_enable() + * + * step1: save events moving to new counters + * step2: reprogram moved events into new counters + */ + for (i = 0; i < n_running; i++) { + event = cpuc->event_list[i]; + hwc = &event->hw; + + /* + * we can avoid reprogramming counter if: + * - assigned same counter as last time + * - running on same CPU as last time + * - no other event has used the counter since + */ + if (hwc->idx == -1 || + match_prev_assignment(hwc, cpuc, i)) + continue; + + x86_pmu_stop(event); + } + + for (i = 0; i < cpuc->n_events; i++) { + event = cpuc->event_list[i]; + hwc = &event->hw; + + if (!match_prev_assignment(hwc, cpuc, i)) + x86_assign_hw_event(event, cpuc, i); + else if (i < n_running) + continue; + + x86_pmu_start(event); + } + cpuc->n_added = 0; + perf_events_lapic_init(); } - x86_pmu_disable_event(hwc, idx); + cpuc->enabled = 1; + barrier(); + + x86_pmu.enable_all(); } -static inline void -amd_pmu_disable_event(struct hw_perf_event *hwc, int idx) +static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc) { - x86_pmu_disable_event(hwc, idx); + (void)checking_wrmsrl(hwc->config_base + hwc->idx, + hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE); +} + +static inline void x86_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config); } static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); @@ -1275,12 +869,12 @@ static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); * To be called with the event disabled in hw: */ static int -x86_perf_event_set_period(struct perf_event *event, - struct hw_perf_event *hwc, int idx) +x86_perf_event_set_period(struct perf_event *event) { + struct hw_perf_event *hwc = &event->hw; s64 left = atomic64_read(&hwc->period_left); s64 period = hwc->sample_period; - int err, ret = 0; + int err, ret = 0, idx = hwc->idx; if (idx == X86_PMC_IDX_FIXED_BTS) return 0; @@ -1326,212 +920,63 @@ x86_perf_event_set_period(struct perf_event *event, return ret; } -static inline void -intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) -{ - int idx = __idx - X86_PMC_IDX_FIXED; - u64 ctrl_val, bits, mask; - int err; - - /* - * Enable IRQ generation (0x8), - * and enable ring-3 counting (0x2) and ring-0 counting (0x1) - * if requested: - */ - bits = 0x8ULL; - if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) - bits |= 0x2; - if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) - bits |= 0x1; - bits <<= (idx * 4); - mask = 0xfULL << (idx * 4); - - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - ctrl_val |= bits; - err = checking_wrmsrl(hwc->config_base, ctrl_val); -} - -static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static void x86_pmu_enable_event(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - u64 val; - - val = hwc->config; if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - - (void)checking_wrmsrl(hwc->config_base + idx, val); + __x86_pmu_enable_event(&event->hw); } - -static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) -{ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - if (!__get_cpu_var(cpu_hw_events).enabled) - return; - - intel_pmu_enable_bts(hwc->config); - return; - } - - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_enable_fixed(hwc, idx); - return; - } - - x86_pmu_enable_event(hwc, idx); -} - -static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx) +/* + * activate a single event + * + * The event is added to the group of enabled events + * but only if it can be scehduled with existing events. + * + * Called with PMU disabled. If successful and return value 1, + * then guaranteed to call perf_enable() and hw_perf_enable() + */ +static int x86_pmu_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc; + int assign[X86_PMC_IDX_MAX]; + int n, n0, ret; - if (cpuc->enabled) - x86_pmu_enable_event(hwc, idx); -} - -static int fixed_mode_idx(struct hw_perf_event *hwc) -{ - unsigned int hw_event; - - hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK; - - if (unlikely((hw_event == - x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && - (hwc->sample_period == 1))) - return X86_PMC_IDX_FIXED_BTS; + hwc = &event->hw; - if (!x86_pmu.num_events_fixed) - return -1; + n0 = cpuc->n_events; + n = collect_events(cpuc, event, false); + if (n < 0) + return n; + ret = x86_schedule_events(cpuc, n, assign); + if (ret) + return ret; /* - * fixed counters do not take all possible filters + * copy new assignment, now we know it is possible + * will be used by hw_perf_enable() */ - if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK) - return -1; - - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) - return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) - return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) - return X86_PMC_IDX_FIXED_BUS_CYCLES; - - return -1; -} - -/* - * generic counter allocator: get next free counter - */ -static int -gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) -{ - int idx; - - idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events); - return idx == x86_pmu.num_events ? -1 : idx; -} + memcpy(cpuc->assign, assign, n*sizeof(int)); -/* - * intel-specific counter allocator: check event constraints - */ -static int -intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) -{ - const struct event_constraint *event_constraint; - int i, code; + cpuc->n_events = n; + cpuc->n_added += n - n0; - if (!event_constraints) - goto skip; - - code = hwc->config & CORE_EVNTSEL_EVENT_MASK; - - for_each_event_constraint(event_constraint, event_constraints) { - if (code == event_constraint->code) { - for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) { - if (!test_and_set_bit(i, cpuc->used_mask)) - return i; - } - return -1; - } - } -skip: - return gen_get_event_idx(cpuc, hwc); -} - -static int -x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) -{ - int idx; - - idx = fixed_mode_idx(hwc); - if (idx == X86_PMC_IDX_FIXED_BTS) { - /* BTS is already occupied. */ - if (test_and_set_bit(idx, cpuc->used_mask)) - return -EAGAIN; - - hwc->config_base = 0; - hwc->event_base = 0; - hwc->idx = idx; - } else if (idx >= 0) { - /* - * Try to get the fixed event, if that is already taken - * then try to get a generic event: - */ - if (test_and_set_bit(idx, cpuc->used_mask)) - goto try_generic; - - hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - /* - * We set it so that event_base + idx in wrmsr/rdmsr maps to - * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: - */ - hwc->event_base = - MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; - hwc->idx = idx; - } else { - idx = hwc->idx; - /* Try to get the previous generic event again */ - if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) { -try_generic: - idx = x86_pmu.get_event_idx(cpuc, hwc); - if (idx == -1) - return -EAGAIN; - - set_bit(idx, cpuc->used_mask); - hwc->idx = idx; - } - hwc->config_base = x86_pmu.eventsel; - hwc->event_base = x86_pmu.perfctr; - } - - return idx; + return 0; } -/* - * Find a PMC slot for the freshly enabled / scheduled in event: - */ -static int x86_pmu_enable(struct perf_event *event) +static int x86_pmu_start(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - int idx; - - idx = x86_schedule_event(cpuc, hwc); - if (idx < 0) - return idx; - - perf_events_lapic_init(); + int idx = event->hw.idx; - x86_pmu.disable(hwc, idx); + if (idx == -1) + return -EAGAIN; + x86_perf_event_set_period(event); cpuc->events[idx] = event; - set_bit(idx, cpuc->active_mask); - - x86_perf_event_set_period(event, hwc, idx); - x86_pmu.enable(hwc, idx); - + __set_bit(idx, cpuc->active_mask); + x86_pmu.enable(event); perf_event_update_userpage(event); return 0; @@ -1539,14 +984,8 @@ static int x86_pmu_enable(struct perf_event *event) static void x86_pmu_unthrottle(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - - if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || - cpuc->events[hwc->idx] != event)) - return; - - x86_pmu.enable(hwc, hwc->idx); + int ret = x86_pmu_start(event); + WARN_ON_ONCE(ret); } void perf_event_print_debug(void) @@ -1576,7 +1015,7 @@ void perf_event_print_debug(void) pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); } - pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); + pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); for (idx = 0; idx < x86_pmu.num_events; idx++) { rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); @@ -1600,257 +1039,50 @@ void perf_event_print_debug(void) local_irq_restore(flags); } -static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) -{ - struct debug_store *ds = cpuc->ds; - struct bts_record { - u64 from; - u64 to; - u64 flags; - }; - struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; - struct bts_record *at, *top; - struct perf_output_handle handle; - struct perf_event_header header; - struct perf_sample_data data; - struct pt_regs regs; - - if (!event) - return; - - if (!ds) - return; - - at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; - top = (struct bts_record *)(unsigned long)ds->bts_index; - - if (top <= at) - return; - - ds->bts_index = ds->bts_buffer_base; - - - data.period = event->hw.last_period; - data.addr = 0; - data.raw = NULL; - regs.ip = 0; - - /* - * Prepare a generic sample, i.e. fill in the invariant fields. - * We will overwrite the from and to address before we output - * the sample. - */ - perf_prepare_sample(&header, &data, event, ®s); - - if (perf_output_begin(&handle, event, - header.size * (top - at), 1, 1)) - return; - - for (; at < top; at++) { - data.ip = at->from; - data.addr = at->to; - - perf_output_sample(&handle, &header, &data, event); - } - - perf_output_end(&handle); - - /* There's new data available. */ - event->hw.interrupts++; - event->pending_kill = POLL_IN; -} - -static void x86_pmu_disable(struct perf_event *event) +static void x86_pmu_stop(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; - /* - * Must be done before we disable, otherwise the nmi handler - * could reenable again: - */ - clear_bit(idx, cpuc->active_mask); - x86_pmu.disable(hwc, idx); + if (!__test_and_clear_bit(idx, cpuc->active_mask)) + return; - /* - * Make sure the cleared pointer becomes visible before we - * (potentially) free the event: - */ - barrier(); + x86_pmu.disable(event); /* * Drain the remaining delta count out of a event * that we are disabling: */ - x86_perf_event_update(event, hwc, idx); - - /* Drain the remaining BTS records. */ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) - intel_pmu_drain_bts_buffer(cpuc); + x86_perf_event_update(event); cpuc->events[idx] = NULL; - clear_bit(idx, cpuc->used_mask); - - perf_event_update_userpage(event); -} - -/* - * Save and restart an expired event. Called by NMI contexts, - * so it has to be careful about preempting normal event ops: - */ -static int intel_pmu_save_and_restart(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - int ret; - - x86_perf_event_update(event, hwc, idx); - ret = x86_perf_event_set_period(event, hwc, idx); - - if (event->state == PERF_EVENT_STATE_ACTIVE) - intel_pmu_enable_event(hwc, idx); - - return ret; -} - -static void intel_pmu_reset(void) -{ - struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; - unsigned long flags; - int idx; - - if (!x86_pmu.num_events) - return; - - local_irq_save(flags); - - printk("clearing PMU state on CPU#%d\n", smp_processor_id()); - - for (idx = 0; idx < x86_pmu.num_events; idx++) { - checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); - checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); - } - for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { - checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); - } - if (ds) - ds->bts_index = ds->bts_buffer_base; - - local_irq_restore(flags); -} - -static int p6_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - struct perf_event *event; - struct hw_perf_event *hwc; - int idx, handled = 0; - u64 val; - - data.addr = 0; - data.raw = NULL; - - cpuc = &__get_cpu_var(cpu_hw_events); - - for (idx = 0; idx < x86_pmu.num_events; idx++) { - if (!test_bit(idx, cpuc->active_mask)) - continue; - - event = cpuc->events[idx]; - hwc = &event->hw; - - val = x86_perf_event_update(event, hwc, idx); - if (val & (1ULL << (x86_pmu.event_bits - 1))) - continue; - - /* - * event overflow - */ - handled = 1; - data.period = event->hw.last_period; - - if (!x86_perf_event_set_period(event, hwc, idx)) - continue; - - if (perf_event_overflow(event, 1, &data, regs)) - p6_pmu_disable_event(hwc, idx); - } - - if (handled) - inc_irq_stat(apic_perf_irqs); - - return handled; } -/* - * This handler is triggered by the local APIC, so the APIC IRQ handling - * rules apply: - */ -static int intel_pmu_handle_irq(struct pt_regs *regs) +static void x86_pmu_disable(struct perf_event *event) { - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - int bit, loops; - u64 ack, status; - - data.addr = 0; - data.raw = NULL; - - cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int i; - perf_disable(); - intel_pmu_drain_bts_buffer(cpuc); - status = intel_pmu_get_status(); - if (!status) { - perf_enable(); - return 0; - } + x86_pmu_stop(event); - loops = 0; -again: - if (++loops > 100) { - WARN_ONCE(1, "perfevents: irq loop stuck!\n"); - perf_event_print_debug(); - intel_pmu_reset(); - perf_enable(); - return 1; - } - - inc_irq_stat(apic_perf_irqs); - ack = status; - for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { - struct perf_event *event = cpuc->events[bit]; + for (i = 0; i < cpuc->n_events; i++) { + if (event == cpuc->event_list[i]) { - clear_bit(bit, (unsigned long *) &status); - if (!test_bit(bit, cpuc->active_mask)) - continue; + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(cpuc, event); - if (!intel_pmu_save_and_restart(event)) - continue; + while (++i < cpuc->n_events) + cpuc->event_list[i-1] = cpuc->event_list[i]; - data.period = event->hw.last_period; - - if (perf_event_overflow(event, 1, &data, regs)) - intel_pmu_disable_event(&event->hw, bit); + --cpuc->n_events; + break; + } } - - intel_pmu_ack_status(ack); - - /* - * Repeat if there is more work to be done: - */ - status = intel_pmu_get_status(); - if (status) - goto again; - - perf_enable(); - - return 1; + perf_event_update_userpage(event); } -static int amd_pmu_handle_irq(struct pt_regs *regs) +static int x86_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; struct cpu_hw_events *cpuc; @@ -1859,8 +1091,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - data.addr = 0; - data.raw = NULL; + perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); @@ -1871,7 +1102,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) event = cpuc->events[idx]; hwc = &event->hw; - val = x86_perf_event_update(event, hwc, idx); + val = x86_perf_event_update(event); if (val & (1ULL << (x86_pmu.event_bits - 1))) continue; @@ -1881,11 +1112,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) handled = 1; data.period = event->hw.last_period; - if (!x86_perf_event_set_period(event, hwc, idx)) + if (!x86_perf_event_set_period(event)) continue; if (perf_event_overflow(event, 1, &data, regs)) - amd_pmu_disable_event(hwc, idx); + x86_pmu_stop(event); } if (handled) @@ -1968,193 +1199,171 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = { .priority = 1 }; -static __initconst struct x86_pmu p6_pmu = { - .name = "p6", - .handle_irq = p6_pmu_handle_irq, - .disable_all = p6_pmu_disable_all, - .enable_all = p6_pmu_enable_all, - .enable = p6_pmu_enable_event, - .disable = p6_pmu_disable_event, - .eventsel = MSR_P6_EVNTSEL0, - .perfctr = MSR_P6_PERFCTR0, - .event_map = p6_pmu_event_map, - .raw_event = p6_pmu_raw_event, - .max_events = ARRAY_SIZE(p6_perfmon_event_map), - .apic = 1, - .max_period = (1ULL << 31) - 1, - .version = 0, - .num_events = 2, - /* - * Events have 40 bits implemented. However they are designed such - * that bits [32-39] are sign extensions of bit 31. As such the - * effective width of a event for P6-like PMU is 32 bits only. - * - * See IA-32 Intel Architecture Software developer manual Vol 3B - */ - .event_bits = 32, - .event_mask = (1ULL << 32) - 1, - .get_event_idx = intel_get_event_idx, -}; +static struct event_constraint unconstrained; +static struct event_constraint emptyconstraint; -static __initconst struct x86_pmu intel_pmu = { - .name = "Intel", - .handle_irq = intel_pmu_handle_irq, - .disable_all = intel_pmu_disable_all, - .enable_all = intel_pmu_enable_all, - .enable = intel_pmu_enable_event, - .disable = intel_pmu_disable_event, - .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = intel_pmu_event_map, - .raw_event = intel_pmu_raw_event, - .max_events = ARRAY_SIZE(intel_perfmon_event_map), - .apic = 1, - /* - * Intel PMCs cannot be accessed sanely above 32 bit width, - * so we install an artificial 1<<31 period regardless of - * the generic event period: - */ - .max_period = (1ULL << 31) - 1, - .enable_bts = intel_pmu_enable_bts, - .disable_bts = intel_pmu_disable_bts, - .get_event_idx = intel_get_event_idx, -}; +static struct event_constraint * +x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct event_constraint *c; -static __initconst struct x86_pmu amd_pmu = { - .name = "AMD", - .handle_irq = amd_pmu_handle_irq, - .disable_all = amd_pmu_disable_all, - .enable_all = amd_pmu_enable_all, - .enable = amd_pmu_enable_event, - .disable = amd_pmu_disable_event, - .eventsel = MSR_K7_EVNTSEL0, - .perfctr = MSR_K7_PERFCTR0, - .event_map = amd_pmu_event_map, - .raw_event = amd_pmu_raw_event, - .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_events = 4, - .event_bits = 48, - .event_mask = (1ULL << 48) - 1, - .apic = 1, - /* use highest bit to detect overflow */ - .max_period = (1ULL << 47) - 1, - .get_event_idx = gen_get_event_idx, -}; + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + if ((event->hw.config & c->cmask) == c->code) + return c; + } + } + + return &unconstrained; +} -static __init int p6_pmu_init(void) +static int x86_event_sched_in(struct perf_event *event, + struct perf_cpu_context *cpuctx) { - switch (boot_cpu_data.x86_model) { - case 1: - case 3: /* Pentium Pro */ - case 5: - case 6: /* Pentium II */ - case 7: - case 8: - case 11: /* Pentium III */ - event_constraints = intel_p6_event_constraints; - break; - case 9: - case 13: - /* Pentium M */ - event_constraints = intel_p6_event_constraints; - break; - default: - pr_cont("unsupported p6 CPU model %d ", - boot_cpu_data.x86_model); - return -ENODEV; - } + int ret = 0; - x86_pmu = p6_pmu; + event->state = PERF_EVENT_STATE_ACTIVE; + event->oncpu = smp_processor_id(); + event->tstamp_running += event->ctx->time - event->tstamp_stopped; - return 0; + if (!is_x86_event(event)) + ret = event->pmu->enable(event); + + if (!ret && !is_software_event(event)) + cpuctx->active_oncpu++; + + if (!ret && event->attr.exclusive) + cpuctx->exclusive = 1; + + return ret; } -static __init int intel_pmu_init(void) +static void x86_event_sched_out(struct perf_event *event, + struct perf_cpu_context *cpuctx) { - union cpuid10_edx edx; - union cpuid10_eax eax; - unsigned int unused; - unsigned int ebx; - int version; - - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - /* check for P6 processor family */ - if (boot_cpu_data.x86 == 6) { - return p6_pmu_init(); - } else { - return -ENODEV; - } - } + event->state = PERF_EVENT_STATE_INACTIVE; + event->oncpu = -1; - /* - * Check whether the Architectural PerfMon supports - * Branch Misses Retired hw_event or not. - */ - cpuid(10, &eax.full, &ebx, &unused, &edx.full); - if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) - return -ENODEV; + if (!is_x86_event(event)) + event->pmu->disable(event); - version = eax.split.version_id; - if (version < 2) - return -ENODEV; + event->tstamp_running -= event->ctx->time - event->tstamp_stopped; - x86_pmu = intel_pmu; - x86_pmu.version = version; - x86_pmu.num_events = eax.split.num_events; - x86_pmu.event_bits = eax.split.bit_width; - x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; + if (!is_software_event(event)) + cpuctx->active_oncpu--; + if (event->attr.exclusive || !cpuctx->active_oncpu) + cpuctx->exclusive = 0; +} + +/* + * Called to enable a whole group of events. + * Returns 1 if the group was enabled, or -EAGAIN if it could not be. + * Assumes the caller has disabled interrupts and has + * frozen the PMU with hw_perf_save_disable. + * + * called with PMU disabled. If successful and return value 1, + * then guaranteed to call perf_enable() and hw_perf_enable() + */ +int hw_perf_group_sched_in(struct perf_event *leader, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct perf_event *sub; + int assign[X86_PMC_IDX_MAX]; + int n0, n1, ret; + + /* n0 = total number of events */ + n0 = collect_events(cpuc, leader, true); + if (n0 < 0) + return n0; + + ret = x86_schedule_events(cpuc, n0, assign); + if (ret) + return ret; + + ret = x86_event_sched_in(leader, cpuctx); + if (ret) + return ret; + + n1 = 1; + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + if (sub->state > PERF_EVENT_STATE_OFF) { + ret = x86_event_sched_in(sub, cpuctx); + if (ret) + goto undo; + ++n1; + } + } /* - * Quirk: v2 perfmon does not report fixed-purpose events, so - * assume at least 3 events: + * copy new assignment, now we know it is possible + * will be used by hw_perf_enable() */ - x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); + memcpy(cpuc->assign, assign, n0*sizeof(int)); + + cpuc->n_events = n0; + cpuc->n_added += n1; + ctx->nr_active += n1; /* - * Install the hw-cache-events table: + * 1 means successful and events are active + * This is not quite true because we defer + * actual activation until hw_perf_enable() but + * this way we* ensure caller won't try to enable + * individual events */ - switch (boot_cpu_data.x86_model) { - case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ - case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ - case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ - case 29: /* six-core 45 nm xeon "Dunnington" */ - memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - pr_cont("Core2 events, "); - event_constraints = intel_core_event_constraints; - break; - default: - case 26: - memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); + return 1; +undo: + x86_event_sched_out(leader, cpuctx); + n0 = 1; + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + if (sub->state == PERF_EVENT_STATE_ACTIVE) { + x86_event_sched_out(sub, cpuctx); + if (++n0 == n1) + break; + } + } + return ret; +} + +#include "perf_event_amd.c" +#include "perf_event_p6.c" +#include "perf_event_intel.c" - event_constraints = intel_nehalem_event_constraints; - pr_cont("Nehalem/Corei7 events, "); +static int __cpuinit +x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + int ret = NOTIFY_OK; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + if (x86_pmu.cpu_prepare) + ret = x86_pmu.cpu_prepare(cpu); break; - case 28: - memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - pr_cont("Atom events, "); + case CPU_STARTING: + if (x86_pmu.cpu_starting) + x86_pmu.cpu_starting(cpu); break; - } - return 0; -} -static __init int amd_pmu_init(void) -{ - /* Performance-monitoring supported from K7 and later: */ - if (boot_cpu_data.x86 < 6) - return -ENODEV; + case CPU_DYING: + if (x86_pmu.cpu_dying) + x86_pmu.cpu_dying(cpu); + break; - x86_pmu = amd_pmu; + case CPU_UP_CANCELED: + case CPU_DEAD: + if (x86_pmu.cpu_dead) + x86_pmu.cpu_dead(cpu); + break; - /* Events are common for all AMDs */ - memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); + default: + break; + } - return 0; + return ret; } static void __init pmu_check_apic(void) @@ -2169,6 +1378,7 @@ static void __init pmu_check_apic(void) void __init init_hw_perf_events(void) { + struct event_constraint *c; int err; pr_info("Performance Events: "); @@ -2213,6 +1423,20 @@ void __init init_hw_perf_events(void) perf_events_lapic_init(); register_die_notifier(&perf_event_nmi_notifier); + unconstrained = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, + 0, x86_pmu.num_events); + + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + if (c->cmask != INTEL_ARCH_FIXED_MASK) + continue; + + c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1; + c->weight += x86_pmu.num_events; + } + } + pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.event_bits); pr_info("... generic registers: %d\n", x86_pmu.num_events); @@ -2220,60 +1444,91 @@ void __init init_hw_perf_events(void) pr_info("... max period: %016Lx\n", x86_pmu.max_period); pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); pr_info("... event mask: %016Lx\n", perf_event_mask); + + perf_cpu_notifier(x86_pmu_notifier); } static inline void x86_pmu_read(struct perf_event *event) { - x86_perf_event_update(event, &event->hw, event->hw.idx); + x86_perf_event_update(event); } static const struct pmu pmu = { .enable = x86_pmu_enable, .disable = x86_pmu_disable, + .start = x86_pmu_start, + .stop = x86_pmu_stop, .read = x86_pmu_read, .unthrottle = x86_pmu_unthrottle, }; -static int -validate_event(struct cpu_hw_events *cpuc, struct perf_event *event) -{ - struct hw_perf_event fake_event = event->hw; - - if (event->pmu && event->pmu != &pmu) - return 0; - - return x86_schedule_event(cpuc, &fake_event) >= 0; -} - +/* + * validate a single event group + * + * validation include: + * - check events are compatible which each other + * - events do not compete for the same counter + * - number of events <= number of counters + * + * validation ensures the group can be loaded onto the + * PMU if it was the only group available. + */ static int validate_group(struct perf_event *event) { - struct perf_event *sibling, *leader = event->group_leader; - struct cpu_hw_events fake_pmu; + struct perf_event *leader = event->group_leader; + struct cpu_hw_events *fake_cpuc; + int ret, n; - memset(&fake_pmu, 0, sizeof(fake_pmu)); + ret = -ENOMEM; + fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO); + if (!fake_cpuc) + goto out; - if (!validate_event(&fake_pmu, leader)) - return -ENOSPC; + /* + * the event is not yet connected with its + * siblings therefore we must first collect + * existing siblings, then add the new event + * before we can simulate the scheduling + */ + ret = -ENOSPC; + n = collect_events(fake_cpuc, leader, true); + if (n < 0) + goto out_free; - list_for_each_entry(sibling, &leader->sibling_list, group_entry) { - if (!validate_event(&fake_pmu, sibling)) - return -ENOSPC; - } + fake_cpuc->n_events = n; + n = collect_events(fake_cpuc, event, false); + if (n < 0) + goto out_free; - if (!validate_event(&fake_pmu, event)) - return -ENOSPC; + fake_cpuc->n_events = n; - return 0; + ret = x86_schedule_events(fake_cpuc, n, NULL); + +out_free: + kfree(fake_cpuc); +out: + return ret; } const struct pmu *hw_perf_event_init(struct perf_event *event) { + const struct pmu *tmp; int err; err = __hw_perf_event_init(event); if (!err) { + /* + * we temporarily connect event to its pmu + * such that validate_group() can classify + * it as an x86 event using is_x86_event() + */ + tmp = event->pmu; + event->pmu = &pmu; + if (event->group_leader != event) err = validate_group(event); + + event->pmu = tmp; } if (err) { if (event->destroy) @@ -2297,7 +1552,6 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); -static DEFINE_PER_CPU(int, in_ignored_frame); static void @@ -2313,10 +1567,6 @@ static void backtrace_warning(void *data, char *msg) static int backtrace_stack(void *data, char *name) { - per_cpu(in_ignored_frame, smp_processor_id()) = - x86_is_stack_id(NMI_STACK, name) || - x86_is_stack_id(DEBUG_STACK, name); - return 0; } @@ -2324,9 +1574,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) { struct perf_callchain_entry *entry = data; - if (per_cpu(in_ignored_frame, smp_processor_id())) - return; - if (reliable) callchain_store(entry, addr); } @@ -2347,7 +1594,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, PERF_CONTEXT_KERNEL); callchain_store(entry, regs->ip); - dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); + dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); } /* @@ -2385,14 +1632,42 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) return len; } -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +#ifdef CONFIG_COMPAT +static inline int +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) { - unsigned long bytes; + /* 32-bit process in 64-bit kernel. */ + struct stack_frame_ia32 frame; + const void __user *fp; + + if (!test_thread_flag(TIF_IA32)) + return 0; + + fp = compat_ptr(regs->bp); + while (entry->nr < PERF_MAX_STACK_DEPTH) { + unsigned long bytes; + frame.next_frame = 0; + frame.return_address = 0; - bytes = copy_from_user_nmi(frame, fp, sizeof(*frame)); + bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); + if (bytes != sizeof(frame)) + break; + + if (fp < compat_ptr(regs->sp)) + break; - return bytes == sizeof(*frame); + callchain_store(entry, frame.return_address); + fp = compat_ptr(frame.next_frame); + } + return 1; +} +#else +static inline int +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + return 0; } +#endif static void perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) @@ -2408,11 +1683,16 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, PERF_CONTEXT_USER); callchain_store(entry, regs->ip); + if (perf_callchain_user32(regs, entry)) + return; + while (entry->nr < PERF_MAX_STACK_DEPTH) { + unsigned long bytes; frame.next_frame = NULL; frame.return_address = 0; - if (!copy_stack_frame(fp, &frame)) + bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); + if (bytes != sizeof(frame)) break; if ((unsigned long)fp < regs->sp) @@ -2433,9 +1713,6 @@ perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) is_user = user_mode(regs); - if (!current || current->pid == 0) - return; - if (is_user && current->state != TASK_RUNNING) return; @@ -2462,7 +1739,14 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } -void hw_perf_event_setup_online(int cpu) +void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) { - init_debug_store_on_cpu(cpu); + regs->ip = ip; + /* + * perf_arch_fetch_caller_regs adds another call, we need to increment + * the skip level + */ + regs->bp = rewind_frame_pointer(skip + 1); + regs->cs = __KERNEL_CS; + local_save_flags(regs->flags); } diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c new file mode 100644 index 00000000000..db6f7d4056e --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -0,0 +1,422 @@ +#ifdef CONFIG_CPU_SUP_AMD + +static DEFINE_RAW_SPINLOCK(amd_nb_lock); + +static __initconst u64 amd_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ + [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ + [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ + [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ + [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ + [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +/* + * AMD Performance Monitor K7 and later. + */ +static const u64 amd_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, +}; + +static u64 amd_pmu_event_map(int hw_event) +{ + return amd_perfmon_event_map[hw_event]; +} + +static u64 amd_pmu_raw_event(u64 hw_event) +{ +#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL +#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL +#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL +#define K7_EVNTSEL_INV_MASK 0x000800000ULL +#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL + +#define K7_EVNTSEL_MASK \ + (K7_EVNTSEL_EVENT_MASK | \ + K7_EVNTSEL_UNIT_MASK | \ + K7_EVNTSEL_EDGE_MASK | \ + K7_EVNTSEL_INV_MASK | \ + K7_EVNTSEL_REG_MASK) + + return hw_event & K7_EVNTSEL_MASK; +} + +/* + * AMD64 events are detected based on their event codes. + */ +static inline int amd_is_nb_event(struct hw_perf_event *hwc) +{ + return (hwc->config & 0xe0) == 0xe0; +} + +static inline int amd_has_nb(struct cpu_hw_events *cpuc) +{ + struct amd_nb *nb = cpuc->amd_nb; + + return nb && nb->nb_id != -1; +} + +static void amd_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct amd_nb *nb = cpuc->amd_nb; + int i; + + /* + * only care about NB events + */ + if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc))) + return; + + /* + * need to scan whole list because event may not have + * been assigned during scheduling + * + * no race condition possible because event can only + * be removed on one CPU at a time AND PMU is disabled + * when we come here + */ + for (i = 0; i < x86_pmu.num_events; i++) { + if (nb->owners[i] == event) { + cmpxchg(nb->owners+i, event, NULL); + break; + } + } +} + + /* + * AMD64 NorthBridge events need special treatment because + * counter access needs to be synchronized across all cores + * of a package. Refer to BKDG section 3.12 + * + * NB events are events measuring L3 cache, Hypertransport + * traffic. They are identified by an event code >= 0xe00. + * They measure events on the NorthBride which is shared + * by all cores on a package. NB events are counted on a + * shared set of counters. When a NB event is programmed + * in a counter, the data actually comes from a shared + * counter. Thus, access to those counters needs to be + * synchronized. + * + * We implement the synchronization such that no two cores + * can be measuring NB events using the same counters. Thus, + * we maintain a per-NB allocation table. The available slot + * is propagated using the event_constraint structure. + * + * We provide only one choice for each NB event based on + * the fact that only NB events have restrictions. Consequently, + * if a counter is available, there is a guarantee the NB event + * will be assigned to it. If no slot is available, an empty + * constraint is returned and scheduling will eventually fail + * for this event. + * + * Note that all cores attached the same NB compete for the same + * counters to host NB events, this is why we use atomic ops. Some + * multi-chip CPUs may have more than one NB. + * + * Given that resources are allocated (cmpxchg), they must be + * eventually freed for others to use. This is accomplished by + * calling amd_put_event_constraints(). + * + * Non NB events are not impacted by this restriction. + */ +static struct event_constraint * +amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct amd_nb *nb = cpuc->amd_nb; + struct perf_event *old = NULL; + int max = x86_pmu.num_events; + int i, j, k = -1; + + /* + * if not NB event or no NB, then no constraints + */ + if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc))) + return &unconstrained; + + /* + * detect if already present, if so reuse + * + * cannot merge with actual allocation + * because of possible holes + * + * event can already be present yet not assigned (in hwc->idx) + * because of successive calls to x86_schedule_events() from + * hw_perf_group_sched_in() without hw_perf_enable() + */ + for (i = 0; i < max; i++) { + /* + * keep track of first free slot + */ + if (k == -1 && !nb->owners[i]) + k = i; + + /* already present, reuse */ + if (nb->owners[i] == event) + goto done; + } + /* + * not present, so grab a new slot + * starting either at: + */ + if (hwc->idx != -1) { + /* previous assignment */ + i = hwc->idx; + } else if (k != -1) { + /* start from free slot found */ + i = k; + } else { + /* + * event not found, no slot found in + * first pass, try again from the + * beginning + */ + i = 0; + } + j = i; + do { + old = cmpxchg(nb->owners+i, NULL, event); + if (!old) + break; + if (++i == max) + i = 0; + } while (i != j); +done: + if (!old) + return &nb->event_constraints[i]; + + return &emptyconstraint; +} + +static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) +{ + struct amd_nb *nb; + int i; + + nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL); + if (!nb) + return NULL; + + memset(nb, 0, sizeof(*nb)); + nb->nb_id = nb_id; + + /* + * initialize all possible NB constraints + */ + for (i = 0; i < x86_pmu.num_events; i++) { + __set_bit(i, nb->event_constraints[i].idxmsk); + nb->event_constraints[i].weight = 1; + } + return nb; +} + +static int amd_pmu_cpu_prepare(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + WARN_ON_ONCE(cpuc->amd_nb); + + if (boot_cpu_data.x86_max_cores < 2) + return NOTIFY_OK; + + cpuc->amd_nb = amd_alloc_nb(cpu, -1); + if (!cpuc->amd_nb) + return NOTIFY_BAD; + + return NOTIFY_OK; +} + +static void amd_pmu_cpu_starting(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct amd_nb *nb; + int i, nb_id; + + if (boot_cpu_data.x86_max_cores < 2) + return; + + nb_id = amd_get_nb_id(cpu); + WARN_ON_ONCE(nb_id == BAD_APICID); + + raw_spin_lock(&amd_nb_lock); + + for_each_online_cpu(i) { + nb = per_cpu(cpu_hw_events, i).amd_nb; + if (WARN_ON_ONCE(!nb)) + continue; + + if (nb->nb_id == nb_id) { + kfree(cpuc->amd_nb); + cpuc->amd_nb = nb; + break; + } + } + + cpuc->amd_nb->nb_id = nb_id; + cpuc->amd_nb->refcnt++; + + raw_spin_unlock(&amd_nb_lock); +} + +static void amd_pmu_cpu_dead(int cpu) +{ + struct cpu_hw_events *cpuhw; + + if (boot_cpu_data.x86_max_cores < 2) + return; + + cpuhw = &per_cpu(cpu_hw_events, cpu); + + raw_spin_lock(&amd_nb_lock); + + if (cpuhw->amd_nb) { + struct amd_nb *nb = cpuhw->amd_nb; + + if (nb->nb_id == -1 || --nb->refcnt == 0) + kfree(nb); + + cpuhw->amd_nb = NULL; + } + + raw_spin_unlock(&amd_nb_lock); +} + +static __initconst struct x86_pmu amd_pmu = { + .name = "AMD", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = x86_pmu_enable_all, + .enable = x86_pmu_enable_event, + .disable = x86_pmu_disable_event, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, + .event_map = amd_pmu_event_map, + .raw_event = amd_pmu_raw_event, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_events = 4, + .event_bits = 48, + .event_mask = (1ULL << 48) - 1, + .apic = 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, + .get_event_constraints = amd_get_event_constraints, + .put_event_constraints = amd_put_event_constraints, + + .cpu_prepare = amd_pmu_cpu_prepare, + .cpu_starting = amd_pmu_cpu_starting, + .cpu_dead = amd_pmu_cpu_dead, +}; + +static __init int amd_pmu_init(void) +{ + /* Performance-monitoring supported from K7 and later: */ + if (boot_cpu_data.x86 < 6) + return -ENODEV; + + x86_pmu = amd_pmu; + + /* Events are common for all AMDs */ + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + return 0; +} + +#else /* CONFIG_CPU_SUP_AMD */ + +static int amd_pmu_init(void) +{ + return 0; +} + +#endif diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c new file mode 100644 index 00000000000..9c794ac8783 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -0,0 +1,980 @@ +#ifdef CONFIG_CPU_SUP_INTEL + +/* + * Intel PerfMon, used on Core and later. + */ +static const u64 intel_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, +}; + +static struct event_constraint intel_core_event_constraints[] = +{ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_core2_event_constraints[] = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* + * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event + * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed + * ratio between these counters. + */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ + INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ + INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_nehalem_event_constraints[] = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ + INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ + INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ + INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ + INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */ + INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ + INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_westmere_event_constraints[] = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ + INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_gen_event_constraints[] = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + EVENT_CONSTRAINT_END +}; + +static u64 intel_pmu_event_map(int hw_event) +{ + return intel_perfmon_event_map[hw_event]; +} + +static __initconst u64 westmere_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ + [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ + [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ + [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static __initconst u64 nehalem_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ + [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ + [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ + [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static __initconst u64 core2_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static __initconst u64 atom_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static u64 intel_pmu_raw_event(u64 hw_event) +{ +#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL +#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL +#define CORE_EVNTSEL_INV_MASK 0x00800000ULL +#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL + +#define CORE_EVNTSEL_MASK \ + (INTEL_ARCH_EVTSEL_MASK | \ + INTEL_ARCH_UNIT_MASK | \ + INTEL_ARCH_EDGE_MASK | \ + INTEL_ARCH_INV_MASK | \ + INTEL_ARCH_CNT_MASK) + + return hw_event & CORE_EVNTSEL_MASK; +} + +static void intel_pmu_enable_bts(u64 config) +{ + unsigned long debugctlmsr; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr |= X86_DEBUGCTL_TR; + debugctlmsr |= X86_DEBUGCTL_BTS; + debugctlmsr |= X86_DEBUGCTL_BTINT; + + if (!(config & ARCH_PERFMON_EVENTSEL_OS)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; + + if (!(config & ARCH_PERFMON_EVENTSEL_USR)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; + + update_debugctlmsr(debugctlmsr); +} + +static void intel_pmu_disable_bts(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + unsigned long debugctlmsr; + + if (!cpuc->ds) + return; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr &= + ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | + X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); + + update_debugctlmsr(debugctlmsr); +} + +static void intel_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + intel_pmu_disable_bts(); +} + +static void intel_pmu_enable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + struct perf_event *event = + cpuc->events[X86_PMC_IDX_FIXED_BTS]; + + if (WARN_ON_ONCE(!event)) + return; + + intel_pmu_enable_bts(event->hw.config); + } +} + +static inline u64 intel_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void intel_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static inline void +intel_pmu_disable_fixed(struct hw_perf_event *hwc) +{ + int idx = hwc->idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, mask; + + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + (void)checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static void intel_pmu_drain_bts_buffer(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct bts_record { + u64 from; + u64 to; + u64 flags; + }; + struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; + struct bts_record *at, *top; + struct perf_output_handle handle; + struct perf_event_header header; + struct perf_sample_data data; + struct pt_regs regs; + + if (!event) + return; + + if (!ds) + return; + + at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; + top = (struct bts_record *)(unsigned long)ds->bts_index; + + if (top <= at) + return; + + ds->bts_index = ds->bts_buffer_base; + + perf_sample_data_init(&data, 0); + + data.period = event->hw.last_period; + regs.ip = 0; + + /* + * Prepare a generic sample, i.e. fill in the invariant fields. + * We will overwrite the from and to address before we output + * the sample. + */ + perf_prepare_sample(&header, &data, event, ®s); + + if (perf_output_begin(&handle, event, + header.size * (top - at), 1, 1)) + return; + + for (; at < top; at++) { + data.ip = at->from; + data.addr = at->to; + + perf_output_sample(&handle, &header, &data, event); + } + + perf_output_end(&handle); + + /* There's new data available. */ + event->hw.interrupts++; + event->pending_kill = POLL_IN; +} + +static inline void +intel_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { + intel_pmu_disable_bts(); + intel_pmu_drain_bts_buffer(); + return; + } + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_disable_fixed(hwc); + return; + } + + x86_pmu_disable_event(event); +} + +static inline void +intel_pmu_enable_fixed(struct hw_perf_event *hwc) +{ + int idx = hwc->idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, bits, mask; + int err; + + /* + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: + */ + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + bits |= 0x1; + + /* + * ANY bit is supported in v3 and up + */ + if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY) + bits |= 0x4; + + bits <<= (idx * 4); + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + ctrl_val |= bits; + err = checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static void intel_pmu_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { + if (!__get_cpu_var(cpu_hw_events).enabled) + return; + + intel_pmu_enable_bts(hwc->config); + return; + } + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_enable_fixed(hwc); + return; + } + + __x86_pmu_enable_event(hwc); +} + +/* + * Save and restart an expired event. Called by NMI contexts, + * so it has to be careful about preempting normal event ops: + */ +static int intel_pmu_save_and_restart(struct perf_event *event) +{ + x86_perf_event_update(event); + return x86_perf_event_set_period(event); +} + +static void intel_pmu_reset(void) +{ + struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; + unsigned long flags; + int idx; + + if (!x86_pmu.num_events) + return; + + local_irq_save(flags); + + printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + + for (idx = 0; idx < x86_pmu.num_events; idx++) { + checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); + checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); + } + for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { + checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + } + if (ds) + ds->bts_index = ds->bts_buffer_base; + + local_irq_restore(flags); +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static int intel_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int bit, loops; + u64 ack, status; + + perf_sample_data_init(&data, 0); + + cpuc = &__get_cpu_var(cpu_hw_events); + + intel_pmu_disable_all(); + intel_pmu_drain_bts_buffer(); + status = intel_pmu_get_status(); + if (!status) { + intel_pmu_enable_all(); + return 0; + } + + loops = 0; +again: + if (++loops > 100) { + WARN_ONCE(1, "perfevents: irq loop stuck!\n"); + perf_event_print_debug(); + intel_pmu_reset(); + goto done; + } + + inc_irq_stat(apic_perf_irqs); + ack = status; + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + if (!test_bit(bit, cpuc->active_mask)) + continue; + + if (!intel_pmu_save_and_restart(event)) + continue; + + data.period = event->hw.last_period; + + if (perf_event_overflow(event, 1, &data, regs)) + x86_pmu_stop(event); + } + + intel_pmu_ack_status(ack); + + /* + * Repeat if there is more work to be done: + */ + status = intel_pmu_get_status(); + if (status) + goto again; + +done: + intel_pmu_enable_all(); + return 1; +} + +static struct event_constraint bts_constraint = + EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); + +static struct event_constraint * +intel_special_constraints(struct perf_event *event) +{ + unsigned int hw_event; + + hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK; + + if (unlikely((hw_event == + x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && + (event->hw.sample_period == 1))) { + + return &bts_constraint; + } + return NULL; +} + +static struct event_constraint * +intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct event_constraint *c; + + c = intel_special_constraints(event); + if (c) + return c; + + return x86_get_event_constraints(cpuc, event); +} + +static __initconst struct x86_pmu core_pmu = { + .name = "core", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = x86_pmu_enable_all, + .enable = x86_pmu_enable_event, + .disable = x86_pmu_disable_event, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .raw_event = intel_pmu_raw_event, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL << 31) - 1, + .get_event_constraints = intel_get_event_constraints, + .event_constraints = intel_core_event_constraints, +}; + +static __initconst struct x86_pmu intel_pmu = { + .name = "Intel", + .handle_irq = intel_pmu_handle_irq, + .disable_all = intel_pmu_disable_all, + .enable_all = intel_pmu_enable_all, + .enable = intel_pmu_enable_event, + .disable = intel_pmu_disable_event, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .raw_event = intel_pmu_raw_event, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL << 31) - 1, + .enable_bts = intel_pmu_enable_bts, + .disable_bts = intel_pmu_disable_bts, + .get_event_constraints = intel_get_event_constraints, + + .cpu_starting = init_debug_store_on_cpu, + .cpu_dying = fini_debug_store_on_cpu, +}; + +static __init int intel_pmu_init(void) +{ + union cpuid10_edx edx; + union cpuid10_eax eax; + unsigned int unused; + unsigned int ebx; + int version; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + /* check for P6 processor family */ + if (boot_cpu_data.x86 == 6) { + return p6_pmu_init(); + } else { + return -ENODEV; + } + } + + /* + * Check whether the Architectural PerfMon supports + * Branch Misses Retired hw_event or not. + */ + cpuid(10, &eax.full, &ebx, &unused, &edx.full); + if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) + return -ENODEV; + + version = eax.split.version_id; + if (version < 2) + x86_pmu = core_pmu; + else + x86_pmu = intel_pmu; + + x86_pmu.version = version; + x86_pmu.num_events = eax.split.num_events; + x86_pmu.event_bits = eax.split.bit_width; + x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; + + /* + * Quirk: v2 perfmon does not report fixed-purpose events, so + * assume at least 3 events: + */ + if (version > 1) + x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); + + /* + * Install the hw-cache-events table: + */ + switch (boot_cpu_data.x86_model) { + case 14: /* 65 nm core solo/duo, "Yonah" */ + pr_cont("Core events, "); + break; + + case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ + case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ + case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ + case 29: /* six-core 45 nm xeon "Dunnington" */ + memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = intel_core2_event_constraints; + pr_cont("Core2 events, "); + break; + + case 26: /* 45 nm nehalem, "Bloomfield" */ + case 30: /* 45 nm nehalem, "Lynnfield" */ + case 46: /* 45 nm nehalem-ex, "Beckton" */ + memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = intel_nehalem_event_constraints; + pr_cont("Nehalem/Corei7 events, "); + break; + case 28: /* Atom */ + memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = intel_gen_event_constraints; + pr_cont("Atom events, "); + break; + + case 37: /* 32 nm nehalem, "Clarkdale" */ + case 44: /* 32 nm nehalem, "Gulftown" */ + memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = intel_westmere_event_constraints; + pr_cont("Westmere events, "); + break; + + default: + /* + * default constraints for v2 and up + */ + x86_pmu.event_constraints = intel_gen_event_constraints; + pr_cont("generic architected perfmon, "); + } + return 0; +} + +#else /* CONFIG_CPU_SUP_INTEL */ + +static int intel_pmu_init(void) +{ + return 0; +} + +#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c new file mode 100644 index 00000000000..a330485d14d --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -0,0 +1,159 @@ +#ifdef CONFIG_CPU_SUP_INTEL + +/* + * Not sure about some of these + */ +static const u64 p6_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, +}; + +static u64 p6_pmu_event_map(int hw_event) +{ + return p6_perfmon_event_map[hw_event]; +} + +/* + * Event setting that is specified not to count anything. + * We use this to effectively disable a counter. + * + * L2_RQSTS with 0 MESI unit mask. + */ +#define P6_NOP_EVENT 0x0000002EULL + +static u64 p6_pmu_raw_event(u64 hw_event) +{ +#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL +#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL +#define P6_EVNTSEL_INV_MASK 0x00800000ULL +#define P6_EVNTSEL_REG_MASK 0xFF000000ULL + +#define P6_EVNTSEL_MASK \ + (P6_EVNTSEL_EVENT_MASK | \ + P6_EVNTSEL_UNIT_MASK | \ + P6_EVNTSEL_EDGE_MASK | \ + P6_EVNTSEL_INV_MASK | \ + P6_EVNTSEL_REG_MASK) + + return hw_event & P6_EVNTSEL_MASK; +} + +static struct event_constraint p6_event_constraints[] = +{ + INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ + INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT_END +}; + +static void p6_pmu_disable_all(void) +{ + u64 val; + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static void p6_pmu_enable_all(void) +{ + unsigned long val; + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val |= ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static inline void +p6_pmu_disable_event(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + u64 val = P6_NOP_EVENT; + + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL_ENABLE; + + (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); +} + +static void p6_pmu_enable_event(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + u64 val; + + val = hwc->config; + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL_ENABLE; + + (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); +} + +static __initconst struct x86_pmu p6_pmu = { + .name = "p6", + .handle_irq = x86_pmu_handle_irq, + .disable_all = p6_pmu_disable_all, + .enable_all = p6_pmu_enable_all, + .enable = p6_pmu_enable_event, + .disable = p6_pmu_disable_event, + .eventsel = MSR_P6_EVNTSEL0, + .perfctr = MSR_P6_PERFCTR0, + .event_map = p6_pmu_event_map, + .raw_event = p6_pmu_raw_event, + .max_events = ARRAY_SIZE(p6_perfmon_event_map), + .apic = 1, + .max_period = (1ULL << 31) - 1, + .version = 0, + .num_events = 2, + /* + * Events have 40 bits implemented. However they are designed such + * that bits [32-39] are sign extensions of bit 31. As such the + * effective width of a event for P6-like PMU is 32 bits only. + * + * See IA-32 Intel Architecture Software developer manual Vol 3B + */ + .event_bits = 32, + .event_mask = (1ULL << 32) - 1, + .get_event_constraints = x86_get_event_constraints, + .event_constraints = p6_event_constraints, +}; + +static __init int p6_pmu_init(void) +{ + switch (boot_cpu_data.x86_model) { + case 1: + case 3: /* Pentium Pro */ + case 5: + case 6: /* Pentium II */ + case 7: + case 8: + case 11: /* Pentium III */ + case 9: + case 13: + /* Pentium M */ + break; + default: + pr_cont("unsupported p6 CPU model %d ", + boot_cpu_data.x86_model); + return -ENODEV; + } + + x86_pmu = p6_pmu; + + return 0; +} + +#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 898df9719af..fb329e9f849 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -115,17 +115,6 @@ int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) return !test_bit(counter, perfctr_nmi_owner); } - -/* checks the an msr for availability */ -int avail_to_resrv_perfctr_nmi(unsigned int msr) -{ - unsigned int counter; - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - return !test_bit(counter, perfctr_nmi_owner); -} EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); int reserve_perfctr_nmi(unsigned int msr) @@ -691,7 +680,7 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz) cpu_nmi_set_wd_enabled(); apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsr(evntsel_msr, evntsel, 0); intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); return 1; diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 1cbed97b59c..dfdb4dba232 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -22,6 +22,7 @@ */ #include <linux/dmi.h> +#include <linux/module.h> #include <asm/div64.h> #include <asm/vmware.h> #include <asm/x86_init.h> @@ -101,6 +102,7 @@ int vmware_platform(void) return 0; } +EXPORT_SYMBOL(vmware_platform); /* * VMware hypervisor takes care of exporting a reliable TSC to the guest. diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index cb27fd6136c..8b862d5900f 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -40,6 +40,7 @@ #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/uaccess.h> +#include <linux/gfp.h> #include <asm/processor.h> #include <asm/msr.h> @@ -229,7 +230,7 @@ static void __exit cpuid_exit(void) for_each_online_cpu(cpu) cpuid_device_destroy(cpu); class_destroy(cpuid_class); - unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); + __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); } diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index a4849c10a77..ebd4c51d096 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -27,7 +27,6 @@ #include <asm/cpu.h> #include <asm/reboot.h> #include <asm/virtext.h> -#include <asm/x86_init.h> #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) @@ -103,10 +102,5 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #ifdef CONFIG_HPET_TIMER hpet_disable(); #endif - -#ifdef CONFIG_X86_64 - x86_platform.iommu_shutdown(); -#endif - crash_save_cpu(regs, safe_smp_processor_id()); } diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index cd97ce18c29..67414550c3c 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -5,6 +5,7 @@ * Copyright (C) IBM Corporation, 2004. All rights reserved */ +#include <linux/slab.h> #include <linux/errno.h> #include <linux/highmem.h> #include <linux/crash_dump.h> diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c56bc287303..6d817554780 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -123,13 +123,15 @@ print_context_stack_bp(struct thread_info *tinfo, while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) { unsigned long addr = *ret_addr; - if (__kernel_text_address(addr)) { - ops->address(data, addr, 1); - frame = frame->next_frame; - ret_addr = &frame->return_address; - print_ftrace_graph_addr(addr, data, ops, tinfo, graph); - } + if (!__kernel_text_address(addr)) + break; + + ops->address(data, addr, 1); + frame = frame->next_frame; + ret_addr = &frame->return_address; + print_ftrace_graph_addr(addr, data, ops, tinfo, graph); } + return (unsigned long)frame; } EXPORT_SYMBOL_GPL(print_context_stack_bp); diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index 4fd1420faff..e1a93be4fd4 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -14,6 +14,8 @@ #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) #endif +#include <linux/uaccess.h> + extern void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, char *log_lvl); @@ -29,4 +31,26 @@ struct stack_frame { struct stack_frame *next_frame; unsigned long return_address; }; + +struct stack_frame_ia32 { + u32 next_frame; + u32 return_address; +}; + +static inline unsigned long rewind_frame_pointer(int n) +{ + struct stack_frame *frame; + + get_bp(frame); + +#ifdef CONFIG_FRAME_POINTER + while (n--) { + if (probe_kernel_address(&frame->next_frame, frame)) + break; + } #endif + + return (unsigned long)frame; +} + +#endif /* DUMPSTACK_H */ diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index ae775ca47b2..11540a189d9 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -18,11 +18,6 @@ #include "dumpstack.h" -/* Just a stub for now */ -int x86_is_stack_id(int id, char *name) -{ - return 0; -} void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 0ad9597073f..272c9f1f05f 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -33,11 +33,6 @@ static char x86_stack_ids[][8] = { #endif }; -int x86_is_stack_id(int id, char *name) -{ - return x86_stack_ids[id - 1] == name; -} - static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, unsigned *usedp, char **idp) { @@ -125,9 +120,15 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack, { #ifdef CONFIG_FRAME_POINTER struct stack_frame *frame = (struct stack_frame *)bp; + unsigned long next; - if (!in_irq_stack(stack, irq_stack, irq_stack_end)) - return (unsigned long)frame->next_frame; + if (!in_irq_stack(stack, irq_stack, irq_stack_end)) { + if (!probe_kernel_address(&frame->next_frame, next)) + return next; + else + WARN_ONCE(1, "Perf: bad frame pointer = %p in " + "callchain\n", &frame->next_frame); + } #endif return bp; } @@ -207,7 +208,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (in_irq_stack(stack, irq_stack, irq_stack_end)) { if (ops->stack(data, "IRQ") < 0) break; - bp = print_context_stack(tinfo, stack, bp, + bp = ops->walk_stack(tinfo, stack, bp, ops, data, irq_stack_end, &graph); /* * We link to the next stack (which would be @@ -228,7 +229,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, /* * This handles the process stack: */ - bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph); + bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph); put_cpu(); } EXPORT_SYMBOL(dump_trace); @@ -291,6 +292,7 @@ void show_registers(struct pt_regs *regs) sp = regs->sp; printk("CPU %d ", cpu); + print_modules(); __show_regs(regs, 1); printk("Process %s (pid: %d, threadinfo %p, task %p)\n", cur->comm, cur->pid, task_thread_info(cur), cur); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 05ed7ab2ca4..7bca3c6a02f 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -12,21 +12,13 @@ #include <linux/types.h> #include <linux/init.h> #include <linux/bootmem.h> -#include <linux/ioport.h> -#include <linux/string.h> -#include <linux/kexec.h> -#include <linux/module.h> -#include <linux/mm.h> #include <linux/pfn.h> #include <linux/suspend.h> #include <linux/firmware-map.h> -#include <asm/pgtable.h> -#include <asm/page.h> #include <asm/e820.h> #include <asm/proto.h> #include <asm/setup.h> -#include <asm/trampoline.h> /* * The e820 map is the map that gets modified e.g. with command line parameters @@ -517,31 +509,55 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, int checktype) { int i; + u64 end; u64 real_removed_size = 0; if (size > (ULLONG_MAX - start)) size = ULLONG_MAX - start; + end = start + size; + printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ", + (unsigned long long) start, + (unsigned long long) end); + if (checktype) + e820_print_type(old_type); + printk(KERN_CONT "\n"); + for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; u64 final_start, final_end; + u64 ei_end; if (checktype && ei->type != old_type) continue; + + ei_end = ei->addr + ei->size; /* totally covered? */ - if (ei->addr >= start && - (ei->addr + ei->size) <= (start + size)) { + if (ei->addr >= start && ei_end <= end) { real_removed_size += ei->size; memset(ei, 0, sizeof(struct e820entry)); continue; } + + /* new range is totally covered? */ + if (ei->addr < start && ei_end > end) { + e820_add_region(end, ei_end - end, ei->type); + ei->size = start - ei->addr; + real_removed_size += size; + continue; + } + /* partially covered */ final_start = max(start, ei->addr); - final_end = min(start + size, ei->addr + ei->size); + final_end = min(end, ei_end); if (final_start >= final_end) continue; real_removed_size += final_end - final_start; + /* + * left range could be head or tail, so need to update + * size at first. + */ ei->size -= final_end - final_start; if (ei->addr < final_start) continue; @@ -722,319 +738,44 @@ core_initcall(e820_mark_nvs_memory); #endif /* - * Early reserved memory areas. - */ -#define MAX_EARLY_RES 32 - -struct early_res { - u64 start, end; - char name[16]; - char overlap_ok; -}; -static struct early_res early_res[MAX_EARLY_RES] __initdata = { - { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */ -#ifdef CONFIG_X86_32 - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. (see the GDT stuff) - */ - { PAGE_SIZE, PAGE_SIZE, "EX TRAMPOLINE", 1 }, -#endif - - {} -}; - -static int __init find_overlapped_early(u64 start, u64 end) -{ - int i; - struct early_res *r; - - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { - r = &early_res[i]; - if (end > r->start && start < r->end) - break; - } - - return i; -} - -/* - * Drop the i-th range from the early reservation map, - * by copying any higher ranges down one over it, and - * clearing what had been the last slot. - */ -static void __init drop_range(int i) -{ - int j; - - for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++) - ; - - memmove(&early_res[i], &early_res[i + 1], - (j - 1 - i) * sizeof(struct early_res)); - - early_res[j - 1].end = 0; -} - -/* - * Split any existing ranges that: - * 1) are marked 'overlap_ok', and - * 2) overlap with the stated range [start, end) - * into whatever portion (if any) of the existing range is entirely - * below or entirely above the stated range. Drop the portion - * of the existing range that overlaps with the stated range, - * which will allow the caller of this routine to then add that - * stated range without conflicting with any existing range. + * Find a free area with specified alignment in a specific range. */ -static void __init drop_overlaps_that_are_ok(u64 start, u64 end) +u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) { int i; - struct early_res *r; - u64 lower_start, lower_end; - u64 upper_start, upper_end; - char name[16]; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { - r = &early_res[i]; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 addr; + u64 ei_start, ei_last; - /* Continue past non-overlapping ranges */ - if (end <= r->start || start >= r->end) + if (ei->type != E820_RAM) continue; - /* - * Leave non-ok overlaps as is; let caller - * panic "Overlapping early reservations" - * when it hits this overlap. - */ - if (!r->overlap_ok) - return; - - /* - * We have an ok overlap. We will drop it from the early - * reservation map, and add back in any non-overlapping - * portions (lower or upper) as separate, overlap_ok, - * non-overlapping ranges. - */ - - /* 1. Note any non-overlapping (lower or upper) ranges. */ - strncpy(name, r->name, sizeof(name) - 1); - - lower_start = lower_end = 0; - upper_start = upper_end = 0; - if (r->start < start) { - lower_start = r->start; - lower_end = start; - } - if (r->end > end) { - upper_start = end; - upper_end = r->end; - } - - /* 2. Drop the original ok overlapping range */ - drop_range(i); - - i--; /* resume for-loop on copied down entry */ - - /* 3. Add back in any non-overlapping ranges. */ - if (lower_end) - reserve_early_overlap_ok(lower_start, lower_end, name); - if (upper_end) - reserve_early_overlap_ok(upper_start, upper_end, name); - } -} - -static void __init __reserve_early(u64 start, u64 end, char *name, - int overlap_ok) -{ - int i; - struct early_res *r; - - i = find_overlapped_early(start, end); - if (i >= MAX_EARLY_RES) - panic("Too many early reservations"); - r = &early_res[i]; - if (r->end) - panic("Overlapping early reservations " - "%llx-%llx %s to %llx-%llx %s\n", - start, end - 1, name?name:"", r->start, - r->end - 1, r->name); - r->start = start; - r->end = end; - r->overlap_ok = overlap_ok; - if (name) - strncpy(r->name, name, sizeof(r->name) - 1); -} - -/* - * A few early reservtations come here. - * - * The 'overlap_ok' in the name of this routine does -not- mean it - * is ok for these reservations to overlap an earlier reservation. - * Rather it means that it is ok for subsequent reservations to - * overlap this one. - * - * Use this entry point to reserve early ranges when you are doing - * so out of "Paranoia", reserving perhaps more memory than you need, - * just in case, and don't mind a subsequent overlapping reservation - * that is known to be needed. - * - * The drop_overlaps_that_are_ok() call here isn't really needed. - * It would be needed if we had two colliding 'overlap_ok' - * reservations, so that the second such would not panic on the - * overlap with the first. We don't have any such as of this - * writing, but might as well tolerate such if it happens in - * the future. - */ -void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) -{ - drop_overlaps_that_are_ok(start, end); - __reserve_early(start, end, name, 1); -} - -/* - * Most early reservations come here. - * - * We first have drop_overlaps_that_are_ok() drop any pre-existing - * 'overlap_ok' ranges, so that we can then reserve this memory - * range without risk of panic'ing on an overlapping overlap_ok - * early reservation. - */ -void __init reserve_early(u64 start, u64 end, char *name) -{ - if (start >= end) - return; - - drop_overlaps_that_are_ok(start, end); - __reserve_early(start, end, name, 0); -} - -void __init free_early(u64 start, u64 end) -{ - struct early_res *r; - int i; - - i = find_overlapped_early(start, end); - r = &early_res[i]; - if (i >= MAX_EARLY_RES || r->end != end || r->start != start) - panic("free_early on not reserved area: %llx-%llx!", - start, end - 1); - - drop_range(i); -} - -void __init early_res_to_bootmem(u64 start, u64 end) -{ - int i, count; - u64 final_start, final_end; - - count = 0; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) - count++; - - printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n", - count, start, end); - for (i = 0; i < count; i++) { - struct early_res *r = &early_res[i]; - printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, - r->start, r->end, r->name); - final_start = max(start, r->start); - final_end = min(end, r->end); - if (final_start >= final_end) { - printk(KERN_CONT "\n"); - continue; - } - printk(KERN_CONT " ==> [%010llx - %010llx]\n", - final_start, final_end); - reserve_bootmem_generic(final_start, final_end - final_start, - BOOTMEM_DEFAULT); - } -} + ei_last = ei->addr + ei->size; + ei_start = ei->addr; + addr = find_early_area(ei_start, ei_last, start, end, + size, align); -/* Check for already reserved areas */ -static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) -{ - int i; - u64 addr = *addrp; - int changed = 0; - struct early_res *r; -again: - i = find_overlapped_early(addr, addr + size); - r = &early_res[i]; - if (i < MAX_EARLY_RES && r->end) { - *addrp = addr = round_up(r->end, align); - changed = 1; - goto again; + if (addr != -1ULL) + return addr; } - return changed; + return -1ULL; } -/* Check for already reserved areas */ -static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) +u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align) { - int i; - u64 addr = *addrp, last; - u64 size = *sizep; - int changed = 0; -again: - last = addr + size; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { - struct early_res *r = &early_res[i]; - if (last > r->start && addr < r->start) { - size = r->start - addr; - changed = 1; - goto again; - } - if (last > r->end && addr < r->end) { - addr = round_up(r->end, align); - size = last - addr; - changed = 1; - goto again; - } - if (last <= r->end && addr >= r->start) { - (*sizep)++; - return 0; - } - } - if (changed) { - *addrp = addr; - *sizep = size; - } - return changed; + return find_e820_area(start, end, size, align); } -/* - * Find a free area with specified alignment in a specific range. - */ -u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) +u64 __init get_max_mapped(void) { - int i; + u64 end = max_pfn_mapped; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - u64 addr, last; - u64 ei_last; + end <<= PAGE_SHIFT; - if (ei->type != E820_RAM) - continue; - addr = round_up(ei->addr, align); - ei_last = ei->addr + ei->size; - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - continue; - while (bad_addr(&addr, size, align) && addr+size <= ei_last) - ; - last = addr + size; - if (last > ei_last) - continue; - if (last > end) - continue; - return addr; - } - return -1ULL; + return end; } - /* * Find next free range after *start */ @@ -1044,25 +785,19 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; - u64 addr, last; - u64 ei_last; + u64 addr; + u64 ei_start, ei_last; if (ei->type != E820_RAM) continue; - addr = round_up(ei->addr, align); + ei_last = ei->addr + ei->size; - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - continue; - *sizep = ei_last - addr; - while (bad_addr_size(&addr, sizep, align) && - addr + *sizep <= ei_last) - ; - last = addr + *sizep; - if (last > ei_last) - continue; - return addr; + ei_start = ei->addr; + addr = find_early_area_size(ei_start, ei_last, start, + sizep, align); + + if (addr != -1ULL) + return addr; } return -1ULL; @@ -1421,6 +1156,8 @@ void __init e820_reserve_resources_late(void) end = MAX_RESOURCE_SIZE; if (start >= end) continue; + printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ", + start, end); reserve_region_with_split(&iomem_resource, start, end, "RAM buffer"); } diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index cdcfb122f25..c2fa9b8b497 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -362,7 +362,7 @@ void __init efi_init(void) printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); early_iounmap(tmp, 2); - printk(KERN_INFO "EFI v%u.%.02u by %s \n", + printk(KERN_INFO "EFI v%u.%.02u by %s\n", efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor); diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 30968924543..cd37469b54e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -30,14 +30,32 @@ #ifdef CONFIG_DYNAMIC_FTRACE +/* + * modifying_code is set to notify NMIs that they need to use + * memory barriers when entering or exiting. But we don't want + * to burden NMIs with unnecessary memory barriers when code + * modification is not being done (which is most of the time). + * + * A mutex is already held when ftrace_arch_code_modify_prepare + * and post_process are called. No locks need to be taken here. + * + * Stop machine will make sure currently running NMIs are done + * and new NMIs will see the updated variable before we need + * to worry about NMIs doing memory barriers. + */ +static int modifying_code __read_mostly; +static DEFINE_PER_CPU(int, save_modifying_code); + int ftrace_arch_code_modify_prepare(void) { set_kernel_text_rw(); + modifying_code = 1; return 0; } int ftrace_arch_code_modify_post_process(void) { + modifying_code = 0; set_kernel_text_ro(); return 0; } @@ -149,6 +167,11 @@ static void ftrace_mod_code(void) void ftrace_nmi_enter(void) { + __get_cpu_var(save_modifying_code) = modifying_code; + + if (!__get_cpu_var(save_modifying_code)) + return; + if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { smp_rmb(); ftrace_mod_code(); @@ -160,6 +183,9 @@ void ftrace_nmi_enter(void) void ftrace_nmi_exit(void) { + if (!__get_cpu_var(save_modifying_code)) + return; + /* Finish all executions before clearing nmi_running */ smp_mb(); atomic_dec(&nmi_running); @@ -484,13 +510,3 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, } } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - -#ifdef CONFIG_FTRACE_SYSCALLS - -extern unsigned long *sys_call_table; - -unsigned long __init arch_syscall_addr(int nr) -{ - return (unsigned long)(&sys_call_table)[nr]; -} -#endif diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 5051b94c906..b2e24603739 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -7,6 +7,7 @@ #include <linux/init.h> #include <linux/start_kernel.h> +#include <linux/mm.h> #include <asm/setup.h> #include <asm/sections.h> @@ -29,14 +30,25 @@ static void __init i386_default_early_setup(void) void __init i386_start_kernel(void) { +#ifdef CONFIG_X86_TRAMPOLINE + /* + * But first pinch a few for the stack/trampoline stuff + * FIXME: Don't need the extra page at 4K, but need to fix + * trampoline before removing it. (see the GDT stuff) + */ + reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, + "EX TRAMPOLINE"); +#endif + reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { + /* Assume only end is not page aligned */ u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 ramdisk_end = ramdisk_image + ramdisk_size; + u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index b5a9896ca1e..7147143fd61 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -103,9 +103,10 @@ void __init x86_64_start_reservations(char *real_mode_data) #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { + /* Assume only end is not page aligned */ unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; - unsigned long ramdisk_end = ramdisk_image + ramdisk_size; + unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 7fd318bac59..37c3d4b17d8 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -442,8 +442,8 @@ is386: movl $2,%ecx # set MP */ cmpb $0,ready jne 1f - movl $per_cpu__gdt_page,%eax - movl $per_cpu__stack_canary,%ecx + movl $gdt_page,%eax + movl $stack_canary,%ecx movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) shrl $16, %ecx movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) @@ -706,7 +706,7 @@ idt_descr: .word 0 # 32 bit align gdt_desc.address ENTRY(early_gdt_descr) .word GDT_ENTRIES*8-1 - .long per_cpu__gdt_page /* Overwritten for secondary CPUs */ + .long gdt_page /* Overwritten for secondary CPUs */ /* * The boot_gdt must mirror the equivalent in setup.S and is diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 2d8b5035371..3d1e6f16b7a 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -27,7 +27,7 @@ #define GET_CR2_INTO_RCX movq %cr2, %rcx #endif -/* we are not able to switch in one step to the final KERNEL ADRESS SPACE +/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE * because we need identity-mapped pages. * */ diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ba6e6588460..23b4ecdffa9 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -4,6 +4,7 @@ #include <linux/sysdev.h> #include <linux/delay.h> #include <linux/errno.h> +#include <linux/slab.h> #include <linux/hpet.h> #include <linux/init.h> #include <linux/cpu.h> @@ -34,6 +35,8 @@ */ unsigned long hpet_address; u8 hpet_blockid; /* OS timer block num */ +u8 hpet_msi_disable; + #ifdef CONFIG_PCI_MSI static unsigned long hpet_num_timers; #endif @@ -264,7 +267,7 @@ static void hpet_resume_device(void) force_hpet_resume(); } -static void hpet_resume_counter(void) +static void hpet_resume_counter(struct clocksource *cs) { hpet_resume_device(); hpet_restart_counter(); @@ -397,9 +400,15 @@ static int hpet_next_event(unsigned long delta, * then we might have a real hardware problem. We can not do * much about it here, but at least alert the user/admin with * a prominent warning. + * An erratum on some chipsets (ICH9,..), results in comparator read + * immediately following a write returning old value. Workaround + * for this is to read this value second time, when first + * read returns old value. */ - WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt, + if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) { + WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt, KERN_WARNING "hpet: compare register read back failed.\n"); + } return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; } @@ -596,6 +605,9 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) unsigned int num_timers_used = 0; int i; + if (hpet_msi_disable) + return; + if (boot_cpu_has(X86_FEATURE_ARAT)) return; id = hpet_readl(HPET_ID); @@ -928,6 +940,9 @@ static __init int hpet_late_init(void) hpet_reserve_platform_timers(hpet_readl(HPET_ID)); hpet_print_config(); + if (hpet_msi_disable) + return 0; + if (boot_cpu_has(X86_FEATURE_ARAT)) return 0; @@ -1135,6 +1150,7 @@ int hpet_set_periodic_freq(unsigned long freq) do_div(clc, freq); clc >>= hpet_clockevent.shift; hpet_pie_delta = clc; + hpet_pie_limit = 0; } return 1; } diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 05d5fec64a9..d6cc065f519 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -212,25 +212,6 @@ static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); } -/* - * Store a breakpoint's encoded address, length, and type. - */ -static int arch_store_info(struct perf_event *bp) -{ - struct arch_hw_breakpoint *info = counter_arch_bp(bp); - /* - * For kernel-addresses, either the address or symbol name can be - * specified. - */ - if (info->name) - info->address = (unsigned long) - kallsyms_lookup_name(info->name); - if (info->address) - return 0; - - return -EINVAL; -} - int arch_bp_generic_fields(int x86_len, int x86_type, int *gen_len, int *gen_type) { @@ -362,10 +343,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, return ret; } - ret = arch_store_info(bp); - - if (ret < 0) - return ret; /* * Check that the low-order bits of the address are appropriate * for the alignment implied by len. @@ -502,8 +479,6 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) rcu_read_lock(); bp = per_cpu(bp_per_reg[i], cpu); - if (bp) - rc = NOTIFY_DONE; /* * Reset the 'i'th TRAP bit in dr6 to denote completion of * exception handling @@ -522,7 +497,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) rcu_read_unlock(); } - if (dr6 & (~DR_TRAP_BITS)) + /* + * Further processing in do_debug() is needed for a) user-space + * breakpoints (to generate signals) and b) when the system has + * taken exception due to multiple causes + */ + if ((current->thread.debugreg6 & DR_TRAP_BITS) || + (dr6 & (~DR_TRAP_BITS))) rc = NOTIFY_DONE; set_debugreg(dr7, 7); @@ -547,8 +528,3 @@ void hw_breakpoint_pmu_read(struct perf_event *bp) { /* TODO */ } - -void hw_breakpoint_pmu_unthrottle(struct perf_event *bp) -{ - /* TODO */ -} diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index f2f8540a7f3..54c31c28548 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/regset.h> #include <linux/sched.h> +#include <linux/slab.h> #include <asm/sigcontext.h> #include <asm/processor.h> @@ -164,6 +165,11 @@ int init_fpu(struct task_struct *tsk) return 0; } +/* + * The xstateregs_active() routine is the same as the fpregs_active() routine, + * as the "regset->n" for the xstate regset will be updated based on the feature + * capabilites supported by the xsave. + */ int fpregs_active(struct task_struct *target, const struct user_regset *regset) { return tsk_used_math(target) ? regset->n : 0; @@ -204,8 +210,6 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - set_stopped_child_used_math(target); - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.xstate->fxsave, 0, -1); @@ -224,6 +228,68 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, return ret; } +int xstateregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + int ret; + + if (!cpu_has_xsave) + return -ENODEV; + + ret = init_fpu(target); + if (ret) + return ret; + + /* + * Copy the 48bytes defined by the software first into the xstate + * memory layout in the thread struct, so that we can copy the entire + * xstateregs to the user using one user_regset_copyout(). + */ + memcpy(&target->thread.xstate->fxsave.sw_reserved, + xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); + + /* + * Copy the xstate memory layout. + */ + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.xstate->xsave, 0, -1); + return ret; +} + +int xstateregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + struct xsave_hdr_struct *xsave_hdr; + + if (!cpu_has_xsave) + return -ENODEV; + + ret = init_fpu(target); + if (ret) + return ret; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.xstate->xsave, 0, -1); + + /* + * mxcsr reserved bits must be masked to zero for security reasons. + */ + target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; + + xsave_hdr = &target->thread.xstate->xsave.xsave_hdr; + + xsave_hdr->xstate_bv &= pcntxt_mask; + /* + * These bits must be zero. + */ + xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0; + + return ret; +} + #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION /* @@ -404,8 +470,6 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - set_stopped_child_used_math(target); - if (!HAVE_HWFP) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index df89102bef8..7c9f02c130f 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -5,7 +5,6 @@ #include <linux/ioport.h> #include <linux/interrupt.h> #include <linux/timex.h> -#include <linux/slab.h> #include <linux/random.h> #include <linux/init.h> #include <linux/kernel_stat.h> @@ -32,8 +31,14 @@ */ static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); +DEFINE_RAW_SPINLOCK(i8259A_lock); static void mask_and_ack_8259A(unsigned int); +static void mask_8259A(void); +static void unmask_8259A(void); +static void disable_8259A_irq(unsigned int irq); +static void enable_8259A_irq(unsigned int irq); +static void init_8259A(int auto_eoi); +static int i8259A_irq_pending(unsigned int irq); struct irq_chip i8259A_chip = { .name = "XT-PIC", @@ -63,51 +68,51 @@ unsigned int cached_irq_mask = 0xffff; */ unsigned long io_apic_irqs; -void disable_8259A_irq(unsigned int irq) +static void disable_8259A_irq(unsigned int irq) { unsigned int mask = 1 << irq; unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask |= mask; if (irq & 8) outb(cached_slave_mask, PIC_SLAVE_IMR); else outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } -void enable_8259A_irq(unsigned int irq) +static void enable_8259A_irq(unsigned int irq) { unsigned int mask = ~(1 << irq); unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask &= mask; if (irq & 8) outb(cached_slave_mask, PIC_SLAVE_IMR); else outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } -int i8259A_irq_pending(unsigned int irq) +static int i8259A_irq_pending(unsigned int irq) { unsigned int mask = 1<<irq; unsigned long flags; int ret; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); if (irq < 8) ret = inb(PIC_MASTER_CMD) & mask; else ret = inb(PIC_SLAVE_CMD) & (mask >> 8); - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); return ret; } -void make_8259A_irq(unsigned int irq) +static void make_8259A_irq(unsigned int irq) { disable_irq_nosync(irq); io_apic_irqs &= ~(1<<irq); @@ -150,7 +155,7 @@ static void mask_and_ack_8259A(unsigned int irq) unsigned int irqmask = 1 << irq; unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); /* * Lightweight spurious IRQ detection. We do not want * to overdo spurious IRQ handling - it's usually a sign @@ -183,7 +188,7 @@ handle_real_irq: outb(cached_master_mask, PIC_MASTER_IMR); outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */ } - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); return; spurious_8259A_irq: @@ -281,37 +286,37 @@ static int __init i8259A_init_sysfs(void) device_initcall(i8259A_init_sysfs); -void mask_8259A(void) +static void mask_8259A(void) { unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } -void unmask_8259A(void) +static void unmask_8259A(void) { unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } -void init_8259A(int auto_eoi) +static void init_8259A(int auto_eoi) { unsigned long flags; i8259A_auto_eoi = auto_eoi; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ @@ -356,5 +361,49 @@ void init_8259A(int auto_eoi) outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } + +/* + * make i8259 a driver so that we can select pic functions at run time. the goal + * is to make x86 binary compatible among pc compatible and non-pc compatible + * platforms, such as x86 MID. + */ + +static void legacy_pic_noop(void) { }; +static void legacy_pic_uint_noop(unsigned int unused) { }; +static void legacy_pic_int_noop(int unused) { }; + +static struct irq_chip dummy_pic_chip = { + .name = "dummy pic", + .mask = legacy_pic_uint_noop, + .unmask = legacy_pic_uint_noop, + .disable = legacy_pic_uint_noop, + .mask_ack = legacy_pic_uint_noop, +}; +static int legacy_pic_irq_pending_noop(unsigned int irq) +{ + return 0; +} + +struct legacy_pic null_legacy_pic = { + .nr_legacy_irqs = 0, + .chip = &dummy_pic_chip, + .mask_all = legacy_pic_noop, + .restore_mask = legacy_pic_noop, + .init = legacy_pic_int_noop, + .irq_pending = legacy_pic_irq_pending_noop, + .make_irq = legacy_pic_uint_noop, +}; + +struct legacy_pic default_legacy_pic = { + .nr_legacy_irqs = NR_IRQS_LEGACY, + .chip = &i8259A_chip, + .mask_all = mask_8259A, + .restore_mask = unmask_8259A, + .init = init_8259A, + .irq_pending = i8259A_irq_pending, + .make_irq = make_8259A_irq, +}; + +struct legacy_pic *legacy_pic = &default_legacy_pic; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index d5932226614..0ed2d300cd4 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -5,7 +5,6 @@ #include <linux/ioport.h> #include <linux/interrupt.h> #include <linux/timex.h> -#include <linux/slab.h> #include <linux/random.h> #include <linux/kprobes.h> #include <linux/init.h> @@ -84,24 +83,7 @@ static struct irqaction irq2 = { }; DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... IRQ0_VECTOR - 1] = -1, - [IRQ0_VECTOR] = 0, - [IRQ1_VECTOR] = 1, - [IRQ2_VECTOR] = 2, - [IRQ3_VECTOR] = 3, - [IRQ4_VECTOR] = 4, - [IRQ5_VECTOR] = 5, - [IRQ6_VECTOR] = 6, - [IRQ7_VECTOR] = 7, - [IRQ8_VECTOR] = 8, - [IRQ9_VECTOR] = 9, - [IRQ10_VECTOR] = 10, - [IRQ11_VECTOR] = 11, - [IRQ12_VECTOR] = 12, - [IRQ13_VECTOR] = 13, - [IRQ14_VECTOR] = 14, - [IRQ15_VECTOR] = 15, - [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 + [0 ... NR_VECTORS - 1] = -1, }; int vector_used_by_percpu_irq(unsigned int vector) @@ -123,12 +105,12 @@ void __init init_ISA_irqs(void) #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) init_bsp_APIC(); #endif - init_8259A(0); + legacy_pic->init(0); /* * 16 old-style INTA-cycle interrupts: */ - for (i = 0; i < NR_IRQS_LEGACY; i++) { + for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) { struct irq_desc *desc = irq_to_desc(i); desc->status = IRQ_DISABLED; @@ -142,9 +124,44 @@ void __init init_ISA_irqs(void) void __init init_IRQ(void) { + int i; + + /* + * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15. + * If these IRQ's are handled by legacy interrupt-controllers like PIC, + * then this configuration will likely be static after the boot. If + * these IRQ's are handled by more mordern controllers like IO-APIC, + * then this vector space can be freed and re-used dynamically as the + * irq's migrate etc. + */ + for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) + per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i; + x86_init.irqs.intr_init(); } +/* + * Setup the vector to irq mappings. + */ +void setup_vector_irq(int cpu) +{ +#ifndef CONFIG_X86_IO_APIC + int irq; + + /* + * On most of the platforms, legacy PIC delivers the interrupts on the + * boot cpu. But there are certain platforms where PIC interrupts are + * delivered to multiple cpu's. If the legacy IRQ is handled by the + * legacy PIC, for the new cpu that is coming online, setup the static + * legacy vector to irq mapping: + */ + for (irq = 0; irq < legacy_pic->nr_legacy_irqs; irq++) + per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq; +#endif + + __setup_vector_irq(cpu); +} + static void __init smp_intr_init(void) { #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c index cbc4332a77b..0f7bc20cfcd 100644 --- a/arch/x86/kernel/k8.c +++ b/arch/x86/kernel/k8.c @@ -2,8 +2,8 @@ * Shared support code for AMD K8 northbridges and derivates. * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. */ -#include <linux/gfp.h> #include <linux/types.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/module.h> @@ -121,3 +121,17 @@ void k8_flush_garts(void) } EXPORT_SYMBOL_GPL(k8_flush_garts); +static __init int init_k8_nbs(void) +{ + int err = 0; + + err = cache_k8_northbridges(); + + if (err < 0) + printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n"); + + return err; +} + +/* This has to go after the PCI subsystem */ +fs_initcall(init_k8_nbs); diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index e444357375c..8afd9f321f1 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -9,6 +9,7 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/stat.h> #include <linux/io.h> diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index dd74fe7273b..b2258ca9100 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -42,6 +42,7 @@ #include <linux/init.h> #include <linux/smp.h> #include <linux/nmi.h> +#include <linux/hw_breakpoint.h> #include <asm/debugreg.h> #include <asm/apicdef.h> @@ -204,40 +205,81 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) static struct hw_breakpoint { unsigned enabled; - unsigned type; - unsigned len; unsigned long addr; + int len; + int type; + struct perf_event **pev; } breakinfo[4]; static void kgdb_correct_hw_break(void) { - unsigned long dr7; - int correctit = 0; - int breakbit; int breakno; - get_debugreg(dr7, 7); for (breakno = 0; breakno < 4; breakno++) { - breakbit = 2 << (breakno << 1); - if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { - correctit = 1; - dr7 |= breakbit; - dr7 &= ~(0xf0000 << (breakno << 2)); - dr7 |= ((breakinfo[breakno].len << 2) | - breakinfo[breakno].type) << - ((breakno << 2) + 16); - set_debugreg(breakinfo[breakno].addr, breakno); - - } else { - if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { - correctit = 1; - dr7 &= ~breakbit; - dr7 &= ~(0xf0000 << (breakno << 2)); - } - } + struct perf_event *bp; + struct arch_hw_breakpoint *info; + int val; + int cpu = raw_smp_processor_id(); + if (!breakinfo[breakno].enabled) + continue; + bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu); + info = counter_arch_bp(bp); + if (bp->attr.disabled != 1) + continue; + bp->attr.bp_addr = breakinfo[breakno].addr; + bp->attr.bp_len = breakinfo[breakno].len; + bp->attr.bp_type = breakinfo[breakno].type; + info->address = breakinfo[breakno].addr; + info->len = breakinfo[breakno].len; + info->type = breakinfo[breakno].type; + val = arch_install_hw_breakpoint(bp); + if (!val) + bp->attr.disabled = 0; + } + hw_breakpoint_restore(); +} + +static int hw_break_reserve_slot(int breakno) +{ + int cpu; + int cnt = 0; + struct perf_event **pevent; + + for_each_online_cpu(cpu) { + cnt++; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_reserve_bp_slot(*pevent)) + goto fail; + } + + return 0; + +fail: + for_each_online_cpu(cpu) { + cnt--; + if (!cnt) + break; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + dbg_release_bp_slot(*pevent); } - if (correctit) - set_debugreg(dr7, 7); + return -1; +} + +static int hw_break_release_slot(int breakno) +{ + struct perf_event **pevent; + int cpu; + + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_release_bp_slot(*pevent)) + /* + * The debugger is responisble for handing the retry on + * remove failure. + */ + return -1; + } + return 0; } static int @@ -251,6 +293,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) if (i == 4) return -1; + if (hw_break_release_slot(i)) { + printk(KERN_ERR "Cannot remove hw breakpoint at %lx\n", addr); + return -1; + } breakinfo[i].enabled = 0; return 0; @@ -259,15 +305,23 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) static void kgdb_remove_all_hw_break(void) { int i; + int cpu = raw_smp_processor_id(); + struct perf_event *bp; - for (i = 0; i < 4; i++) - memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint)); + for (i = 0; i < 4; i++) { + if (!breakinfo[i].enabled) + continue; + bp = *per_cpu_ptr(breakinfo[i].pev, cpu); + if (bp->attr.disabled == 1) + continue; + arch_uninstall_hw_breakpoint(bp); + bp->attr.disabled = 1; + } } static int kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { - unsigned type; int i; for (i = 0; i < 4; i++) @@ -278,27 +332,42 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) switch (bptype) { case BP_HARDWARE_BREAKPOINT: - type = 0; - len = 1; + len = 1; + breakinfo[i].type = X86_BREAKPOINT_EXECUTE; break; case BP_WRITE_WATCHPOINT: - type = 1; + breakinfo[i].type = X86_BREAKPOINT_WRITE; break; case BP_ACCESS_WATCHPOINT: - type = 3; + breakinfo[i].type = X86_BREAKPOINT_RW; break; default: return -1; } - - if (len == 1 || len == 2 || len == 4) - breakinfo[i].len = len - 1; - else + switch (len) { + case 1: + breakinfo[i].len = X86_BREAKPOINT_LEN_1; + break; + case 2: + breakinfo[i].len = X86_BREAKPOINT_LEN_2; + break; + case 4: + breakinfo[i].len = X86_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case 8: + breakinfo[i].len = X86_BREAKPOINT_LEN_8; + break; +#endif + default: return -1; - - breakinfo[i].enabled = 1; + } breakinfo[i].addr = addr; - breakinfo[i].type = type; + if (hw_break_reserve_slot(i)) { + breakinfo[i].addr = 0; + return -1; + } + breakinfo[i].enabled = 1; return 0; } @@ -313,8 +382,21 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) */ void kgdb_disable_hw_debug(struct pt_regs *regs) { + int i; + int cpu = raw_smp_processor_id(); + struct perf_event *bp; + /* Disable hardware debugging while we are in kgdb: */ set_debugreg(0UL, 7); + for (i = 0; i < 4; i++) { + if (!breakinfo[i].enabled) + continue; + bp = *per_cpu_ptr(breakinfo[i].pev, cpu); + if (bp->attr.disabled == 1) + continue; + arch_uninstall_hw_breakpoint(bp); + bp->attr.disabled = 1; + } } /** @@ -378,7 +460,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, struct pt_regs *linux_regs) { unsigned long addr; - unsigned long dr6; char *ptr; int newPC; @@ -404,20 +485,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, raw_smp_processor_id()); } - get_debugreg(dr6, 6); - if (!(dr6 & 0x4000)) { - int breakno; - - for (breakno = 0; breakno < 4; breakno++) { - if (dr6 & (1 << breakno) && - breakinfo[breakno].type == 0) { - /* Set restore flag: */ - linux_regs->flags |= X86_EFLAGS_RF; - break; - } - } - } - set_debugreg(0UL, 6); kgdb_correct_hw_break(); return 0; @@ -485,8 +552,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) break; case DIE_DEBUG: - if (atomic_read(&kgdb_cpu_doing_single_step) == - raw_smp_processor_id()) { + if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { if (user_mode(regs)) return single_step_cont(regs, args); break; @@ -539,7 +605,42 @@ static struct notifier_block kgdb_notifier = { */ int kgdb_arch_init(void) { - return register_die_notifier(&kgdb_notifier); + int i, cpu; + int ret; + struct perf_event_attr attr; + struct perf_event **pevent; + + ret = register_die_notifier(&kgdb_notifier); + if (ret != 0) + return ret; + /* + * Pre-allocate the hw breakpoint structions in the non-atomic + * portion of kgdb because this operation requires mutexs to + * complete. + */ + hw_breakpoint_init(&attr); + attr.bp_addr = (unsigned long)kgdb_arch_init; + attr.bp_len = HW_BREAKPOINT_LEN_1; + attr.bp_type = HW_BREAKPOINT_W; + attr.disabled = 1; + for (i = 0; i < 4; i++) { + breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); + if (IS_ERR(breakinfo[i].pev)) { + printk(KERN_ERR "kgdb: Could not allocate hw breakpoints\n"); + breakinfo[i].pev = NULL; + kgdb_arch_exit(); + return -1; + } + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(breakinfo[i].pev, cpu); + pevent[0]->hw.sample_period = 1; + if (pevent[0]->destroy != NULL) { + pevent[0]->destroy = NULL; + release_bp_slot(*pevent); + } + } + } + return ret; } /** @@ -550,6 +651,13 @@ int kgdb_arch_init(void) */ void kgdb_arch_exit(void) { + int i; + for (i = 0; i < 4; i++) { + if (breakinfo[i].pev) { + unregister_wide_hw_breakpoint(breakinfo[i].pev); + breakinfo[i].pev = NULL; + } + } unregister_die_notifier(&kgdb_notifier); } diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 5b8c7505b3b..b43bbaebe2c 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -49,6 +49,7 @@ #include <linux/module.h> #include <linux/kdebug.h> #include <linux/kallsyms.h> +#include <linux/ftrace.h> #include <asm/cacheflush.h> #include <asm/desc.h> @@ -106,16 +107,22 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = { }; const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); -/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ -static void __kprobes set_jmp_op(void *from, void *to) +static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) { - struct __arch_jmp_op { - char op; + struct __arch_relative_insn { + u8 op; s32 raddr; - } __attribute__((packed)) * jop; - jop = (struct __arch_jmp_op *)from; - jop->raddr = (s32)((long)(to) - ((long)(from) + 5)); - jop->op = RELATIVEJUMP_INSTRUCTION; + } __attribute__((packed)) *insn; + + insn = (struct __arch_relative_insn *)from; + insn->raddr = (s32)((long)(to) - ((long)(from) + 5)); + insn->op = op; +} + +/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ +static void __kprobes synthesize_reljump(void *from, void *to) +{ + __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); } /* @@ -202,7 +209,7 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) /* * Basically, kp->ainsn.insn has an original instruction. * However, RIP-relative instruction can not do single-stepping - * at different place, fix_riprel() tweaks the displacement of + * at different place, __copy_instruction() tweaks the displacement of * that instruction. In that case, we can't recover the instruction * from the kp->ainsn.insn. * @@ -284,21 +291,37 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) } /* - * Adjust the displacement if the instruction uses the %rip-relative - * addressing mode. + * Copy an instruction and adjust the displacement if the instruction + * uses the %rip-relative addressing mode. * If it does, Return the address of the 32-bit displacement word. * If not, return null. * Only applicable to 64-bit x86. */ -static void __kprobes fix_riprel(struct kprobe *p) +static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) { -#ifdef CONFIG_X86_64 struct insn insn; - kernel_insn_init(&insn, p->ainsn.insn); + int ret; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + kernel_insn_init(&insn, src); + if (recover) { + insn_get_opcode(&insn); + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { + ret = recover_probed_instruction(buf, + (unsigned long)src); + if (ret) + return 0; + kernel_insn_init(&insn, buf); + } + } + insn_get_length(&insn); + memcpy(dest, insn.kaddr, insn.length); + +#ifdef CONFIG_X86_64 if (insn_rip_relative(&insn)) { s64 newdisp; u8 *disp; + kernel_insn_init(&insn, dest); insn_get_displacement(&insn); /* * The copied instruction uses the %rip-relative addressing @@ -312,20 +335,23 @@ static void __kprobes fix_riprel(struct kprobe *p) * extension of the original signed 32-bit displacement would * have given. */ - newdisp = (u8 *) p->addr + (s64) insn.displacement.value - - (u8 *) p->ainsn.insn; + newdisp = (u8 *) src + (s64) insn.displacement.value - + (u8 *) dest; BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ - disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn); + disp = (u8 *) dest + insn_offset_displacement(&insn); *(s32 *) disp = (s32) newdisp; } #endif + return insn.length; } static void __kprobes arch_copy_kprobe(struct kprobe *p) { - memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); - - fix_riprel(p); + /* + * Copy an instruction without recovering int3, because it will be + * put by another subsystem. + */ + __copy_instruction(p->ainsn.insn, p->addr, 0); if (can_boost(p->addr)) p->ainsn.boostable = 0; @@ -337,6 +363,9 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p) int __kprobes arch_prepare_kprobe(struct kprobe *p) { + if (alternatives_text_reserved(p->addr, p->addr)) + return -EINVAL; + if (!can_probe((unsigned long)p->addr)) return -EILSEQ; /* insn: must be on special executable page on x86. */ @@ -403,18 +432,6 @@ static void __kprobes restore_btf(void) update_debugctlmsr(current->thread.debugctlmsr); } -static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) -{ - clear_btf(); - regs->flags |= X86_EFLAGS_TF; - regs->flags &= ~X86_EFLAGS_IF; - /* single step inline if the instruction is an int3 */ - if (p->opcode == BREAKPOINT_INSTRUCTION) - regs->ip = (unsigned long)p->addr; - else - regs->ip = (unsigned long)p->ainsn.insn; -} - void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { @@ -426,20 +443,50 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, *sara = (unsigned long) &kretprobe_trampoline; } +#ifdef CONFIG_OPTPROBES +static int __kprobes setup_detour_execution(struct kprobe *p, + struct pt_regs *regs, + int reenter); +#else +#define setup_detour_execution(p, regs, reenter) (0) +#endif + static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) + struct kprobe_ctlblk *kcb, int reenter) { -#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER) + if (setup_detour_execution(p, regs, reenter)) + return; + +#if !defined(CONFIG_PREEMPT) if (p->ainsn.boostable == 1 && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ - reset_current_kprobe(); + if (!reenter) + reset_current_kprobe(); + /* + * Reentering boosted probe doesn't reset current_kprobe, + * nor set current_kprobe, because it doesn't use single + * stepping. + */ regs->ip = (unsigned long)p->ainsn.insn; preempt_enable_no_resched(); return; } #endif - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_HIT_SS; + if (reenter) { + save_previous_kprobe(kcb); + set_current_kprobe(p, regs, kcb); + kcb->kprobe_status = KPROBE_REENTER; + } else + kcb->kprobe_status = KPROBE_HIT_SS; + /* Prepare real single stepping */ + clear_btf(); + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; + /* single step inline if the instruction is an int3 */ + if (p->opcode == BREAKPOINT_INSTRUCTION) + regs->ip = (unsigned long)p->addr; + else + regs->ip = (unsigned long)p->ainsn.insn; } /* @@ -453,11 +500,8 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, switch (kcb->kprobe_status) { case KPROBE_HIT_SSDONE: case KPROBE_HIT_ACTIVE: - save_previous_kprobe(kcb); - set_current_kprobe(p, regs, kcb); kprobes_inc_nmissed_count(p); - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_REENTER; + setup_singlestep(p, regs, kcb, 1); break; case KPROBE_HIT_SS: /* A probe has been hit in the codepath leading up to, or just @@ -532,13 +576,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) * more here. */ if (!p->pre_handler || !p->pre_handler(p, regs)) - setup_singlestep(p, regs, kcb); + setup_singlestep(p, regs, kcb, 0); return 1; } } else if (kprobe_running()) { p = __get_cpu_var(current_kprobe); if (p->break_handler && p->break_handler(p, regs)) { - setup_singlestep(p, regs, kcb); + setup_singlestep(p, regs, kcb, 0); return 1; } } /* else: not a kprobe fault; let the kernel handle it */ @@ -547,6 +591,69 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) return 0; } +#ifdef CONFIG_X86_64 +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax. */ \ + " subq $24, %rsp\n" \ + " pushq %rdi\n" \ + " pushq %rsi\n" \ + " pushq %rdx\n" \ + " pushq %rcx\n" \ + " pushq %rax\n" \ + " pushq %r8\n" \ + " pushq %r9\n" \ + " pushq %r10\n" \ + " pushq %r11\n" \ + " pushq %rbx\n" \ + " pushq %rbp\n" \ + " pushq %r12\n" \ + " pushq %r13\n" \ + " pushq %r14\n" \ + " pushq %r15\n" +#define RESTORE_REGS_STRING \ + " popq %r15\n" \ + " popq %r14\n" \ + " popq %r13\n" \ + " popq %r12\n" \ + " popq %rbp\n" \ + " popq %rbx\n" \ + " popq %r11\n" \ + " popq %r10\n" \ + " popq %r9\n" \ + " popq %r8\n" \ + " popq %rax\n" \ + " popq %rcx\n" \ + " popq %rdx\n" \ + " popq %rsi\n" \ + " popq %rdi\n" \ + /* Skip orig_ax, ip, cs */ \ + " addq $24, %rsp\n" +#else +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax and gs. */ \ + " subl $16, %esp\n" \ + " pushl %fs\n" \ + " pushl %ds\n" \ + " pushl %es\n" \ + " pushl %eax\n" \ + " pushl %ebp\n" \ + " pushl %edi\n" \ + " pushl %esi\n" \ + " pushl %edx\n" \ + " pushl %ecx\n" \ + " pushl %ebx\n" +#define RESTORE_REGS_STRING \ + " popl %ebx\n" \ + " popl %ecx\n" \ + " popl %edx\n" \ + " popl %esi\n" \ + " popl %edi\n" \ + " popl %ebp\n" \ + " popl %eax\n" \ + /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\ + " addl $24, %esp\n" +#endif + /* * When a retprobed function returns, this code saves registers and * calls trampoline_handler() runs, which calls the kretprobe's handler. @@ -560,65 +667,16 @@ static void __used __kprobes kretprobe_trampoline_holder(void) /* We don't bother saving the ss register */ " pushq %rsp\n" " pushfq\n" - /* - * Skip cs, ip, orig_ax. - * trampoline_handler() will plug in these values - */ - " subq $24, %rsp\n" - " pushq %rdi\n" - " pushq %rsi\n" - " pushq %rdx\n" - " pushq %rcx\n" - " pushq %rax\n" - " pushq %r8\n" - " pushq %r9\n" - " pushq %r10\n" - " pushq %r11\n" - " pushq %rbx\n" - " pushq %rbp\n" - " pushq %r12\n" - " pushq %r13\n" - " pushq %r14\n" - " pushq %r15\n" + SAVE_REGS_STRING " movq %rsp, %rdi\n" " call trampoline_handler\n" /* Replace saved sp with true return address. */ " movq %rax, 152(%rsp)\n" - " popq %r15\n" - " popq %r14\n" - " popq %r13\n" - " popq %r12\n" - " popq %rbp\n" - " popq %rbx\n" - " popq %r11\n" - " popq %r10\n" - " popq %r9\n" - " popq %r8\n" - " popq %rax\n" - " popq %rcx\n" - " popq %rdx\n" - " popq %rsi\n" - " popq %rdi\n" - /* Skip orig_ax, ip, cs */ - " addq $24, %rsp\n" + RESTORE_REGS_STRING " popfq\n" #else " pushf\n" - /* - * Skip cs, ip, orig_ax and gs. - * trampoline_handler() will plug in these values - */ - " subl $16, %esp\n" - " pushl %fs\n" - " pushl %es\n" - " pushl %ds\n" - " pushl %eax\n" - " pushl %ebp\n" - " pushl %edi\n" - " pushl %esi\n" - " pushl %edx\n" - " pushl %ecx\n" - " pushl %ebx\n" + SAVE_REGS_STRING " movl %esp, %eax\n" " call trampoline_handler\n" /* Move flags to cs */ @@ -626,15 +684,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void) " movl %edx, 52(%esp)\n" /* Replace saved flags with true return address. */ " movl %eax, 56(%esp)\n" - " popl %ebx\n" - " popl %ecx\n" - " popl %edx\n" - " popl %esi\n" - " popl %edi\n" - " popl %ebp\n" - " popl %eax\n" - /* Skip ds, es, fs, gs, orig_ax and ip */ - " addl $24, %esp\n" + RESTORE_REGS_STRING " popf\n" #endif " ret\n"); @@ -802,8 +852,8 @@ static void __kprobes resume_execution(struct kprobe *p, * These instructions can be executed directly if it * jumps back to correct address. */ - set_jmp_op((void *)regs->ip, - (void *)orig_ip + (regs->ip - copy_ip)); + synthesize_reljump((void *)regs->ip, + (void *)orig_ip + (regs->ip - copy_ip)); p->ainsn.boostable = 1; } else { p->ainsn.boostable = -1; @@ -1030,6 +1080,358 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } + +#ifdef CONFIG_OPTPROBES + +/* Insert a call instruction at address 'from', which calls address 'to'.*/ +static void __kprobes synthesize_relcall(void *from, void *to) +{ + __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); +} + +/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */ +static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, + unsigned long val) +{ +#ifdef CONFIG_X86_64 + *addr++ = 0x48; + *addr++ = 0xbf; +#else + *addr++ = 0xb8; +#endif + *(unsigned long *)addr = val; +} + +void __kprobes kprobes_optinsn_template_holder(void) +{ + asm volatile ( + ".global optprobe_template_entry\n" + "optprobe_template_entry: \n" +#ifdef CONFIG_X86_64 + /* We don't bother saving the ss register */ + " pushq %rsp\n" + " pushfq\n" + SAVE_REGS_STRING + " movq %rsp, %rsi\n" + ".global optprobe_template_val\n" + "optprobe_template_val: \n" + ASM_NOP5 + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call: \n" + ASM_NOP5 + /* Move flags to rsp */ + " movq 144(%rsp), %rdx\n" + " movq %rdx, 152(%rsp)\n" + RESTORE_REGS_STRING + /* Skip flags entry */ + " addq $8, %rsp\n" + " popfq\n" +#else /* CONFIG_X86_32 */ + " pushf\n" + SAVE_REGS_STRING + " movl %esp, %edx\n" + ".global optprobe_template_val\n" + "optprobe_template_val: \n" + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call: \n" + ASM_NOP5 + RESTORE_REGS_STRING + " addl $4, %esp\n" /* skip cs */ + " popf\n" +#endif + ".global optprobe_template_end\n" + "optprobe_template_end: \n"); +} + +#define TMPL_MOVE_IDX \ + ((long)&optprobe_template_val - (long)&optprobe_template_entry) +#define TMPL_CALL_IDX \ + ((long)&optprobe_template_call - (long)&optprobe_template_entry) +#define TMPL_END_IDX \ + ((long)&optprobe_template_end - (long)&optprobe_template_entry) + +#define INT3_SIZE sizeof(kprobe_opcode_t) + +/* Optimized kprobe call back function: called from optinsn */ +static void __kprobes optimized_callback(struct optimized_kprobe *op, + struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + preempt_disable(); + if (kprobe_running()) { + kprobes_inc_nmissed_count(&op->kp); + } else { + /* Save skipped registers */ +#ifdef CONFIG_X86_64 + regs->cs = __KERNEL_CS; +#else + regs->cs = __KERNEL_CS | get_kernel_rpl(); + regs->gs = 0; +#endif + regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; + regs->orig_ax = ~0UL; + + __get_cpu_var(current_kprobe) = &op->kp; + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + opt_pre_handler(&op->kp, regs); + __get_cpu_var(current_kprobe) = NULL; + } + preempt_enable_no_resched(); +} + +static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) +{ + int len = 0, ret; + + while (len < RELATIVEJUMP_SIZE) { + ret = __copy_instruction(dest + len, src + len, 1); + if (!ret || !can_boost(dest + len)) + return -EINVAL; + len += ret; + } + /* Check whether the address range is reserved */ + if (ftrace_text_reserved(src, src + len - 1) || + alternatives_text_reserved(src, src + len - 1)) + return -EBUSY; + + return len; +} + +/* Check whether insn is indirect jump */ +static int __kprobes insn_is_indirect_jump(struct insn *insn) +{ + return ((insn->opcode.bytes[0] == 0xff && + (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ + insn->opcode.bytes[0] == 0xea); /* Segment based jump */ +} + +/* Check whether insn jumps into specified address range */ +static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) +{ + unsigned long target = 0; + + switch (insn->opcode.bytes[0]) { + case 0xe0: /* loopne */ + case 0xe1: /* loope */ + case 0xe2: /* loop */ + case 0xe3: /* jcxz */ + case 0xe9: /* near relative jump */ + case 0xeb: /* short relative jump */ + break; + case 0x0f: + if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */ + break; + return 0; + default: + if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */ + break; + return 0; + } + target = (unsigned long)insn->next_byte + insn->immediate.value; + + return (start <= target && target <= start + len); +} + +/* Decode whole function to ensure any instructions don't jump into target */ +static int __kprobes can_optimize(unsigned long paddr) +{ + int ret; + unsigned long addr, size = 0, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + /* Dummy buffers for lookup_symbol_attrs */ + static char __dummy_buf[KSYM_NAME_LEN]; + + /* Lookup symbol including addr */ + if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf)) + return 0; + + /* Check there is enough space for a relative jump. */ + if (size - offset < RELATIVEJUMP_SIZE) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr - offset + size) { /* Decode until function end */ + if (search_exception_tables(addr)) + /* + * Since some fixup code will jumps into this function, + * we can't optimize kprobe in this function. + */ + return 0; + kernel_insn_init(&insn, (void *)addr); + insn_get_opcode(&insn); + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { + ret = recover_probed_instruction(buf, addr); + if (ret) + return 0; + kernel_insn_init(&insn, buf); + } + insn_get_length(&insn); + /* Recover address */ + insn.kaddr = (void *)addr; + insn.next_byte = (void *)(addr + insn.length); + /* Check any instructions don't jump into target */ + if (insn_is_indirect_jump(&insn) || + insn_jump_into_range(&insn, paddr + INT3_SIZE, + RELATIVE_ADDR_SIZE)) + return 0; + addr += insn.length; + } + + return 1; +} + +/* Check optimized_kprobe can actually be optimized. */ +int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op) +{ + int i; + struct kprobe *p; + + for (i = 1; i < op->optinsn.size; i++) { + p = get_kprobe(op->kp.addr + i); + if (p && !kprobe_disabled(p)) + return -EEXIST; + } + + return 0; +} + +/* Check the addr is within the optimized instructions. */ +int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op, + unsigned long addr) +{ + return ((unsigned long)op->kp.addr <= addr && + (unsigned long)op->kp.addr + op->optinsn.size > addr); +} + +/* Free optimized instruction slot */ +static __kprobes +void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) +{ + if (op->optinsn.insn) { + free_optinsn_slot(op->optinsn.insn, dirty); + op->optinsn.insn = NULL; + op->optinsn.size = 0; + } +} + +void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) +{ + __arch_remove_optimized_kprobe(op, 1); +} + +/* + * Copy replacing target instructions + * Target instructions MUST be relocatable (checked inside) + */ +int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) +{ + u8 *buf; + int ret; + long rel; + + if (!can_optimize((unsigned long)op->kp.addr)) + return -EILSEQ; + + op->optinsn.insn = get_optinsn_slot(); + if (!op->optinsn.insn) + return -ENOMEM; + + /* + * Verify if the address gap is in 2GB range, because this uses + * a relative jump. + */ + rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; + if (abs(rel) > 0x7fffffff) + return -ERANGE; + + buf = (u8 *)op->optinsn.insn; + + /* Copy instructions into the out-of-line buffer */ + ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); + if (ret < 0) { + __arch_remove_optimized_kprobe(op, 0); + return ret; + } + op->optinsn.size = ret; + + /* Copy arch-dep-instance from template */ + memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); + + /* Set probe information */ + synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); + + /* Set probe function call */ + synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback); + + /* Set returning jmp instruction at the tail of out-of-line buffer */ + synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, + (u8 *)op->kp.addr + op->optinsn.size); + + flush_icache_range((unsigned long) buf, + (unsigned long) buf + TMPL_END_IDX + + op->optinsn.size + RELATIVEJUMP_SIZE); + return 0; +} + +/* Replace a breakpoint (int3) with a relative jump. */ +int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) +{ + unsigned char jmp_code[RELATIVEJUMP_SIZE]; + s32 rel = (s32)((long)op->optinsn.insn - + ((long)op->kp.addr + RELATIVEJUMP_SIZE)); + + /* Backup instructions which will be replaced by jump address */ + memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, + RELATIVE_ADDR_SIZE); + + jmp_code[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&jmp_code[1]) = rel; + + /* + * text_poke_smp doesn't support NMI/MCE code modifying. + * However, since kprobes itself also doesn't support NMI/MCE + * code probing, it's not a problem. + */ + text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE); + return 0; +} + +/* Replace a relative jump with a breakpoint (int3). */ +void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) +{ + u8 buf[RELATIVEJUMP_SIZE]; + + /* Set int3 to first byte for kprobes */ + buf[0] = BREAKPOINT_INSTRUCTION; + memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE); +} + +static int __kprobes setup_detour_execution(struct kprobe *p, + struct pt_regs *regs, + int reenter) +{ + struct optimized_kprobe *op; + + if (p->flags & KPROBE_FLAG_OPTIMIZED) { + /* This kprobe is really able to run optimized path. */ + op = container_of(p, struct optimized_kprobe, kp); + /* Detour through copied instructions */ + regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; + if (!reenter) + reset_current_kprobe(); + preempt_enable_no_resched(); + return 1; + } + return 0; +} +#endif + int __init arch_init_kprobes(void) { return 0; diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index ec6ef60cbd1..ea697263b37 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -7,6 +7,7 @@ */ #include <linux/errno.h> +#include <linux/gfp.h> #include <linux/sched.h> #include <linux/string.h> #include <linux/mm.h> diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 4a8bb82248a..035c8c52918 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -9,6 +9,7 @@ #include <linux/mm.h> #include <linux/kexec.h> #include <linux/string.h> +#include <linux/gfp.h> #include <linux/reboot.h> #include <linux/numa.h> #include <linux/ftrace.h> diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c index 845d80ce1ef..63eaf659623 100644 --- a/arch/x86/kernel/mca_32.c +++ b/arch/x86/kernel/mca_32.c @@ -42,6 +42,7 @@ #include <linux/kernel.h> #include <linux/mca.h> #include <linux/kprobes.h> +#include <linux/slab.h> #include <asm/system.h> #include <asm/io.h> #include <linux/proc_fs.h> diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 37542b67c57..e1af7c055c7 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -36,9 +36,6 @@ MODULE_LICENSE("GPL v2"); #define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 #define UCODE_UCODE_TYPE 0x00000001 -const struct firmware *firmware; -static int supported_cpu; - struct equiv_cpu_entry { u32 installed_cpu; u32 fixed_errata_mask; @@ -77,12 +74,15 @@ static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { + struct cpuinfo_x86 *c = &cpu_data(cpu); u32 dummy; - if (!supported_cpu) - return -1; - memset(csig, 0, sizeof(*csig)); + if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { + pr_warning("microcode: CPU%d: AMD CPU family 0x%x not " + "supported\n", cpu, c->x86); + return -1; + } rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev); return 0; @@ -294,10 +294,14 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) static enum ucode_state request_microcode_fw(int cpu, struct device *device) { + const char *fw_name = "amd-ucode/microcode_amd.bin"; + const struct firmware *firmware; enum ucode_state ret; - if (firmware == NULL) + if (request_firmware(&firmware, fw_name, device)) { + printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); return UCODE_NFOUND; + } if (*(u32 *)firmware->data != UCODE_MAGIC) { pr_err("invalid UCODE_MAGIC (0x%08x)\n", @@ -307,6 +311,8 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) ret = generic_load_microcode(cpu, firmware->data, firmware->size); + release_firmware(firmware); + return ret; } @@ -325,31 +331,7 @@ static void microcode_fini_cpu_amd(int cpu) uci->mc = NULL; } -void init_microcode_amd(struct device *device) -{ - const char *fw_name = "amd-ucode/microcode_amd.bin"; - struct cpuinfo_x86 *c = &boot_cpu_data; - - WARN_ON(c->x86_vendor != X86_VENDOR_AMD); - - if (c->x86 < 0x10) { - pr_warning("AMD CPU family 0x%x not supported\n", c->x86); - return; - } - supported_cpu = 1; - - if (request_firmware(&firmware, fw_name, device)) - pr_err("failed to load file %s\n", fw_name); -} - -void fini_microcode_amd(void) -{ - release_firmware(firmware); -} - static struct microcode_ops microcode_amd_ops = { - .init = init_microcode_amd, - .fini = fini_microcode_amd, .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info_amd, diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 0c863243309..cceb5bc3c3c 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -521,9 +521,6 @@ static int __init microcode_init(void) return PTR_ERR(microcode_pdev); } - if (microcode_ops->init) - microcode_ops->init(µcode_pdev->dev); - get_online_cpus(); mutex_lock(µcode_mutex); @@ -566,9 +563,6 @@ static void __exit microcode_exit(void) platform_device_unregister(microcode_pdev); - if (microcode_ops->fini) - microcode_ops->fini(); - microcode_ops = NULL; pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index ebd193e476c..85a343e2893 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -328,7 +328,7 @@ static int apply_microcode(int cpu) cpu_num, mc_intel->hdr.rev); return -1; } - pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x \n", + pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n", cpu_num, val[1], mc_intel->hdr.date & 0xffff, mc_intel->hdr.date >> 24, diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index 712d15fdc41..71825806cd4 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -7,6 +7,8 @@ #include <linux/string.h> #include <linux/pci.h> #include <linux/dmi.h> +#include <linux/range.h> + #include <asm/pci-direct.h> #include <linux/sort.h> #include <asm/io.h> @@ -30,11 +32,6 @@ static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = { { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, }; -struct range { - u64 start; - u64 end; -}; - static int __cpuinit cmp_range(const void *x1, const void *x2) { const struct range *r1 = x1; diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 89f386f044e..e0bc186d750 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -23,6 +23,7 @@ #include <linux/kernel.h> #include <linux/bug.h> #include <linux/mm.h> +#include <linux/gfp.h> #include <asm/system.h> #include <asm/page.h> diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 40b54ceb68b..e81030f71a8 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -359,13 +359,6 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) x86_init.mpparse.mpc_record(1); } -#ifdef CONFIG_X86_BIGSMP - generic_bigsmp_probe(); -#endif - - if (apic->setup_apic_routing) - apic->setup_apic_routing(); - if (!num_processors) printk(KERN_ERR "MPTABLE: no processors registered!\n"); return num_processors; @@ -671,7 +664,7 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf) { unsigned long size = get_mpc_size(mpf->physptr); - reserve_early(mpf->physptr, mpf->physptr+size, "MP-table mpc"); + reserve_early_overlap_ok(mpf->physptr, mpf->physptr+size, "MP-table mpc"); } static int __init smp_scan_config(unsigned long base, unsigned long length) @@ -700,7 +693,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) mpf, (u64)virt_to_phys(mpf)); mem = virt_to_phys(mpf); - reserve_early(mem, mem + sizeof(*mpf), "MP-table mpf"); + reserve_early_overlap_ok(mem, mem + sizeof(*mpf), "MP-table mpf"); if (mpf->physptr) smp_reserve_memory(mpf); diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index 3b7078abc87..0aad8670858 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -10,8 +10,211 @@ * of the License. */ #include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sfi.h> +#include <linux/irq.h> +#include <linux/module.h> #include <asm/setup.h> +#include <asm/mpspec_def.h> +#include <asm/hw_irq.h> +#include <asm/apic.h> +#include <asm/io_apic.h> +#include <asm/mrst.h> +#include <asm/io.h> +#include <asm/i8259.h> +#include <asm/apb_timer.h> + +static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; +static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; +int sfi_mtimer_num; + +struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; +EXPORT_SYMBOL_GPL(sfi_mrtc_array); +int sfi_mrtc_num; + +static inline void assign_to_mp_irq(struct mpc_intsrc *m, + struct mpc_intsrc *mp_irq) +{ + memcpy(mp_irq, m, sizeof(struct mpc_intsrc)); +} + +static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq, + struct mpc_intsrc *m) +{ + return memcmp(mp_irq, m, sizeof(struct mpc_intsrc)); +} + +static void save_mp_irq(struct mpc_intsrc *m) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + if (!mp_irq_cmp(&mp_irqs[i], m)) + return; + } + + assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]); + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} + +/* parse all the mtimer info to a static mtimer array */ +static int __init sfi_parse_mtmr(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_timer_table_entry *pentry; + struct mpc_intsrc mp_irq; + int totallen; + + sb = (struct sfi_table_simple *)table; + if (!sfi_mtimer_num) { + sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb, + struct sfi_timer_table_entry); + pentry = (struct sfi_timer_table_entry *) sb->pentry; + totallen = sfi_mtimer_num * sizeof(*pentry); + memcpy(sfi_mtimer_array, pentry, totallen); + } + + printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num); + pentry = sfi_mtimer_array; + for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) { + printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz," + " irq = %d\n", totallen, (u32)pentry->phys_addr, + pentry->freq_hz, pentry->irq); + if (!pentry->irq) + continue; + mp_irq.type = MP_IOAPIC; + mp_irq.irqtype = mp_INT; +/* triggering mode edge bit 2-3, active high polarity bit 0-1 */ + mp_irq.irqflag = 5; + mp_irq.srcbus = 0; + mp_irq.srcbusirq = pentry->irq; /* IRQ */ + mp_irq.dstapic = MP_APIC_ALL; + mp_irq.dstirq = pentry->irq; + save_mp_irq(&mp_irq); + } + + return 0; +} + +struct sfi_timer_table_entry *sfi_get_mtmr(int hint) +{ + int i; + if (hint < sfi_mtimer_num) { + if (!sfi_mtimer_usage[hint]) { + pr_debug("hint taken for timer %d irq %d\n",\ + hint, sfi_mtimer_array[hint].irq); + sfi_mtimer_usage[hint] = 1; + return &sfi_mtimer_array[hint]; + } + } + /* take the first timer available */ + for (i = 0; i < sfi_mtimer_num;) { + if (!sfi_mtimer_usage[i]) { + sfi_mtimer_usage[i] = 1; + return &sfi_mtimer_array[i]; + } + i++; + } + return NULL; +} + +void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr) +{ + int i; + for (i = 0; i < sfi_mtimer_num;) { + if (mtmr->irq == sfi_mtimer_array[i].irq) { + sfi_mtimer_usage[i] = 0; + return; + } + i++; + } +} + +/* parse all the mrtc info to a global mrtc array */ +int __init sfi_parse_mrtc(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_rtc_table_entry *pentry; + struct mpc_intsrc mp_irq; + + int totallen; + + sb = (struct sfi_table_simple *)table; + if (!sfi_mrtc_num) { + sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb, + struct sfi_rtc_table_entry); + pentry = (struct sfi_rtc_table_entry *)sb->pentry; + totallen = sfi_mrtc_num * sizeof(*pentry); + memcpy(sfi_mrtc_array, pentry, totallen); + } + + printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num); + pentry = sfi_mrtc_array; + for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { + printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n", + totallen, (u32)pentry->phys_addr, pentry->irq); + mp_irq.type = MP_IOAPIC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = 0; + mp_irq.srcbus = 0; + mp_irq.srcbusirq = pentry->irq; /* IRQ */ + mp_irq.dstapic = MP_APIC_ALL; + mp_irq.dstirq = pentry->irq; + save_mp_irq(&mp_irq); + } + return 0; +} + +/* + * the secondary clock in Moorestown can be APBT or LAPIC clock, default to + * APBT but cmdline option can also override it. + */ +static void __cpuinit mrst_setup_secondary_clock(void) +{ + /* restore default lapic clock if disabled by cmdline */ + if (disable_apbt_percpu) + return setup_secondary_APIC_clock(); + apbt_setup_secondary_clock(); +} + +static unsigned long __init mrst_calibrate_tsc(void) +{ + unsigned long flags, fast_calibrate; + + local_irq_save(flags); + fast_calibrate = apbt_quick_calibrate(); + local_irq_restore(flags); + + if (fast_calibrate) + return fast_calibrate; + + return 0; +} + +void __init mrst_time_init(void) +{ + sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); + pre_init_apic_IRQ0(); + apbt_time_init(); +} + +void __init mrst_rtc_init(void) +{ + sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); +} + +/* + * if we use per cpu apb timer, the bootclock already setup. if we use lapic + * timer and one apbt timer for broadcast, we need to set up lapic boot clock. + */ +static void __init mrst_setup_boot_clock(void) +{ + pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu); + if (disable_apbt_percpu) + setup_boot_APIC_clock(); +}; /* * Moorestown specific x86_init function overrides and early setup @@ -21,4 +224,17 @@ void __init x86_mrst_early_setup(void) { x86_init.resources.probe_roms = x86_init_noop; x86_init.resources.reserve_resources = x86_init_noop; + + x86_init.timers.timer_init = mrst_time_init; + x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock; + + x86_init.irqs.pre_vector_init = x86_init_noop; + + x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; + + x86_platform.calibrate_tsc = mrst_calibrate_tsc; + x86_init.pci.init = pci_mrst_init; + x86_init.pci.fixup_irqs = x86_init_noop; + + legacy_pic = &null_legacy_pic; } diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 4bd93c9b2b2..4d4468e9f47 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -37,6 +37,7 @@ #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/uaccess.h> +#include <linux/gfp.h> #include <asm/processor.h> #include <asm/msr.h> @@ -285,7 +286,7 @@ static void __exit msr_exit(void) for_each_online_cpu(cpu) msr_device_destroy(cpu); class_destroy(msr_class); - unregister_chrdev(MSR_MAJOR, "cpu/msr"); + __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); unregister_hotcpu_notifier(&msr_class_cpu_notifier); } diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 9d1d263f786..8297160c41b 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -17,7 +17,9 @@ #include <linux/spinlock.h> #include <linux/io.h> #include <linux/string.h> + #include <asm/geode.h> +#include <asm/setup.h> #include <asm/olpc.h> #ifdef CONFIG_OPEN_FIRMWARE @@ -243,9 +245,11 @@ static int __init olpc_init(void) olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, (unsigned char *) &olpc_platform_info.ecver, 1); - /* check to see if the VSA exists */ - if (cs5535_has_vsa2()) - olpc_platform_info.flags |= OLPC_F_VSA; +#ifdef CONFIG_PCI_OLPC + /* If the VSA exists let it emulate PCI, if not emulate in kernel */ + if (!cs5535_has_vsa2()) + x86_init.pci.arch_init = pci_olpc_init; +#endif printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n", ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "", diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1b1739d1631..1db183ed7c0 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -428,10 +428,6 @@ struct pv_mmu_ops pv_mmu_ops = { .ptep_modify_prot_start = __ptep_modify_prot_start, .ptep_modify_prot_commit = __ptep_modify_prot_commit, -#ifdef CONFIG_HIGHPTE - .kmap_atomic_pte = kmap_atomic, -#endif - #if PAGETABLE_LEVELS >= 3 #ifdef CONFIG_X86_PAE .set_pte_atomic = native_set_pte_atomic, diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 2bbde607814..fb99f7edb34 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1309,7 +1309,7 @@ static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) /* * get_tce_space_from_tar(): * Function for kdump case. Get the tce tables from first kernel - * by reading the contents of the base adress register of calgary iommu + * by reading the contents of the base address register of calgary iommu */ static void __init get_tce_space_from_tar(void) { diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 75e14e21f61..4b7e3d8b01d 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -2,6 +2,7 @@ #include <linux/dma-debug.h> #include <linux/dmar.h> #include <linux/bootmem.h> +#include <linux/gfp.h> #include <linux/pci.h> #include <linux/kmemleak.h> @@ -38,7 +39,7 @@ int iommu_detected __read_mostly = 0; * This variable becomes 1 if iommu=pt is passed on the kernel command line. * If this variable is 1, IOMMU implementations do no DMA translation for * devices and allow every device to access to whole physical memory. This is - * useful if a user want to use an IOMMU only for KVM device assignment to + * useful if a user wants to use an IOMMU only for KVM device assignment to * guests and not for driver dma translation. */ int iommu_pass_through __read_mostly; @@ -65,7 +66,7 @@ int dma_set_mask(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_set_mask); -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA) static __initdata void *dma32_bootmem_ptr; static unsigned long dma32_bootmem_size __initdata = (128ULL<<20); @@ -116,14 +117,21 @@ static void __init dma32_free_bootmem(void) dma32_bootmem_ptr = NULL; dma32_bootmem_size = 0; } +#else +void __init dma32_reserve_bootmem(void) +{ +} +static void __init dma32_free_bootmem(void) +{ +} + #endif void __init pci_iommu_alloc(void) { -#ifdef CONFIG_X86_64 /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); -#endif + if (pci_swiotlb_detect()) goto out; diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 34de53b46f8..0f7f130caa6 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -29,6 +29,7 @@ #include <linux/iommu-helper.h> #include <linux/sysdev.h> #include <linux/io.h> +#include <linux/gfp.h> #include <asm/atomic.h> #include <asm/mtrr.h> #include <asm/pgtable.h> @@ -564,6 +565,9 @@ static void enable_gart_translations(void) enable_gart_translation(dev, __pa(agp_gatt_table)); } + + /* Flush the GART-TLB to remove stale entries */ + k8_flush_garts(); } /* @@ -735,7 +739,7 @@ int __init gart_iommu_init(void) unsigned long scratch; long i; - if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) + if (num_k8_northbridges == 0) return 0; #ifndef CONFIG_AGP_AMD64 diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 22be12b60a8..3af4af810c0 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -4,6 +4,7 @@ #include <linux/scatterlist.h> #include <linux/string.h> #include <linux/init.h> +#include <linux/gfp.h> #include <linux/pci.h> #include <linux/mm.h> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index cf1e04b2ad6..28ad9f4d8b9 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -110,8 +110,8 @@ void show_regs_common(void) if (!product) product = ""; - printk("\n"); - printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n", + printk(KERN_CONT "\n"); + printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), @@ -122,18 +122,6 @@ void flush_thread(void) { struct task_struct *tsk = current; -#ifdef CONFIG_X86_64 - if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { - clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); - if (test_tsk_thread_flag(tsk, TIF_IA32)) { - clear_tsk_thread_flag(tsk, TIF_IA32); - } else { - set_tsk_thread_flag(tsk, TIF_IA32); - current_thread_info()->status |= TS_COMPAT; - } - } -#endif - flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* @@ -295,6 +283,8 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) regs.es = __USER_DS; regs.fs = __KERNEL_PERCPU; regs.gs = __KERNEL_STACK_CANARY; +#else + regs.ss = __KERNEL_DS; #endif regs.orig_ax = -1; @@ -536,21 +526,37 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) } /* - * Check for AMD CPUs, which have potentially C1E support + * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. + * For more information see + * - Erratum #400 for NPT family 0xf and family 0x10 CPUs + * - Erratum #365 for family 0x11 (not affected because C1e not in use) */ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) { + u64 val; if (c->x86_vendor != X86_VENDOR_AMD) - return 0; - - if (c->x86 < 0x0F) - return 0; + goto no_c1e_idle; /* Family 0x0f models < rev F do not have C1E */ - if (c->x86 == 0x0f && c->x86_model < 0x40) - return 0; + if (c->x86 == 0x0F && c->x86_model >= 0x40) + return 1; - return 1; + if (c->x86 == 0x10) { + /* + * check OSVW bit for CPUs that are not affected + * by erratum #400 + */ + rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val); + if (val >= 2) { + rdmsrl(MSR_AMD64_OSVW_STATUS, val); + if (!(val & BIT(1))) + goto no_c1e_idle; + } + return 1; + } + +no_c1e_idle: + return 0; } static cpumask_var_t c1e_mask; @@ -617,7 +623,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP if (pm_idle == poll_idle && smp_num_siblings > 1) { - printk(KERN_WARNING "WARNING: polling idle and HT enabled," + printk_once(KERN_WARNING "WARNING: polling idle and HT enabled," " performance may degrade.\n"); } #endif diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index fe6a34e42bd..f6c62667e30 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -139,16 +139,16 @@ void __show_regs(struct pt_regs *regs, int all) show_regs_common(); - printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", + printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", (u16)regs->cs, regs->ip, regs->flags, smp_processor_id()); print_symbol("EIP is at %s\n", regs->ip); - printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", + printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); - printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", + printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", regs->si, regs->di, regs->bp, sp); - printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", + printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); if (!all) @@ -158,19 +158,19 @@ void __show_regs(struct pt_regs *regs, int all) cr2 = read_cr2(); cr3 = read_cr3(); cr4 = read_cr4_safe(); - printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", + printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); get_debugreg(d2, 2); get_debugreg(d3, 3); - printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", + printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", d0, d1, d2, d3); get_debugreg(d6, 6); get_debugreg(d7, 7); - printk("DR6: %08lx DR7: %08lx\n", + printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n", d6, d7); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 418f860880a..dc9690b4c4c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -161,19 +161,19 @@ void __show_regs(struct pt_regs *regs, int all) unsigned int ds, cs, es; show_regs_common(); - printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); + printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); printk_address(regs->ip, 1); - printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, + printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, regs->flags); - printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", + printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->ax, regs->bx, regs->cx); - printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", + printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n", regs->dx, regs->si, regs->di); - printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", + printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n", regs->bp, regs->r8, regs->r9); - printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", + printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n", regs->r10, regs->r11, regs->r12); - printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", + printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", regs->r13, regs->r14, regs->r15); asm("movl %%ds,%0" : "=r" (ds)); @@ -194,21 +194,21 @@ void __show_regs(struct pt_regs *regs, int all) cr3 = read_cr3(); cr4 = read_cr4(); - printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", fs, fsindex, gs, gsindex, shadowgs); - printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, + printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); - printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, + printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); get_debugreg(d2, 2); - printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); + printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); get_debugreg(d3, 3); get_debugreg(d6, 6); get_debugreg(d7, 7); - printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); + printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); } void release_thread(struct task_struct *dead_task) @@ -515,6 +515,18 @@ void set_personality_64bit(void) current->personality &= ~READ_IMPLIES_EXEC; } +void set_personality_ia32(void) +{ + /* inherit personality from parent */ + + /* Make sure to be in 32bit mode */ + set_thread_flag(TIF_IA32); + current->personality |= force_personality32; + + /* Prepare the first "return" to user space */ + current_thread_info()->status |= TS_COMPAT; +} + unsigned long get_wchan(struct task_struct *p) { unsigned long stack; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 017d937639f..2e9b55027b7 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -12,6 +12,7 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/errno.h> +#include <linux/slab.h> #include <linux/ptrace.h> #include <linux/regset.h> #include <linux/tracehook.h> @@ -48,6 +49,7 @@ enum x86_regset { REGSET_FP, REGSET_XFP, REGSET_IOPERM64 = REGSET_XFP, + REGSET_XSTATE, REGSET_TLS, REGSET_IOPERM32, }; @@ -140,30 +142,6 @@ static const int arg_offs_table[] = { #endif }; -/** - * regs_get_argument_nth() - get Nth argument at function call - * @regs: pt_regs which contains registers at function entry. - * @n: argument number. - * - * regs_get_argument_nth() returns @n th argument of a function call. - * Since usually the kernel stack will be changed right after function entry, - * you must use this at function entry. If the @n th entry is NOT in the - * kernel stack or pt_regs, this returns 0. - */ -unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n) -{ - if (n < ARRAY_SIZE(arg_offs_table)) - return *(unsigned long *)((char *)regs + arg_offs_table[n]); - else { - /* - * The typical case: arg n is on the stack. - * (Note: stack[0] = return address, so skip it) - */ - n -= ARRAY_SIZE(arg_offs_table); - return regs_get_kernel_stack_nth(regs, 1 + n); - } -} - /* * does not yet catch signals sent when the child dies. * in exit.c or in signal.c. @@ -604,7 +582,7 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, struct perf_event_attr attr; /* - * We shoud have at least an inactive breakpoint at this + * We should have at least an inactive breakpoint at this * slot. It means the user is writing dr7 without having * written the address register first */ @@ -702,7 +680,7 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) } else if (n == 6) { val = thread->debugreg6; } else if (n == 7) { - val = ptrace_get_dr7(thread->ptrace_bps); + val = thread->ptrace_dr7; } return val; } @@ -778,8 +756,11 @@ int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) return rc; } /* All that's left is DR7 */ - if (n == 7) + if (n == 7) { rc = ptrace_write_dr7(tsk, val); + if (!rc) + thread->ptrace_dr7 = val; + } ret_path: return rc; @@ -1584,7 +1565,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, #ifdef CONFIG_X86_64 -static const struct user_regset x86_64_regsets[] = { +static struct user_regset x86_64_regsets[] __read_mostly = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct) / sizeof(long), @@ -1597,6 +1578,12 @@ static const struct user_regset x86_64_regsets[] = { .size = sizeof(long), .align = sizeof(long), .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set }, + [REGSET_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), .align = sizeof(u64), + .active = xstateregs_active, .get = xstateregs_get, + .set = xstateregs_set + }, [REGSET_IOPERM64] = { .core_note_type = NT_386_IOPERM, .n = IO_BITMAP_LONGS, @@ -1622,7 +1609,7 @@ static const struct user_regset_view user_x86_64_view = { #endif /* CONFIG_X86_64 */ #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION -static const struct user_regset x86_32_regsets[] = { +static struct user_regset x86_32_regsets[] __read_mostly = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct32) / sizeof(u32), @@ -1641,6 +1628,12 @@ static const struct user_regset x86_32_regsets[] = { .size = sizeof(u32), .align = sizeof(u32), .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set }, + [REGSET_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), .align = sizeof(u64), + .active = xstateregs_active, .get = xstateregs_get, + .set = xstateregs_set + }, [REGSET_TLS] = { .core_note_type = NT_386_TLS, .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, @@ -1663,6 +1656,23 @@ static const struct user_regset_view user_x86_32_view = { }; #endif +/* + * This represents bytes 464..511 in the memory layout exported through + * the REGSET_XSTATE interface. + */ +u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; + +void update_regset_xstate_info(unsigned int size, u64 xstate_mask) +{ +#ifdef CONFIG_X86_64 + x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64); +#endif +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64); +#endif + xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; +} + const struct user_regset_view *task_user_regset_view(struct task_struct *task) { #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 18093d7498f..12e9feaa2f7 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -491,6 +491,19 @@ void force_hpet_resume(void) break; } } + +/* + * HPET MSI on some boards (ATI SB700/SB800) has side effect on + * floppy DMA. Disable HPET MSI on such platforms. + */ +static void force_disable_hpet_msi(struct pci_dev *unused) +{ + hpet_msi_disable = 1; +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, + force_disable_hpet_msi); + #endif #if defined(CONFIG_PCI) && defined(CONFIG_NUMA) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 1545bc0c984..8e1aac86b50 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -203,6 +203,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0T656F"), }, }, + { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 760", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"), + DMI_MATCH(DMI_BOARD_NAME, "0G919G"), + }, + }, { /* Handle problems with rebooting on Dell 2400's */ .callback = set_bios_reboot, .ident = "Dell PowerEdge 2400", @@ -452,6 +461,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"), }, }, + { /* Handle problems with rebooting on the iMac9,1. */ + .callback = set_pci_reboot, + .ident = "Apple iMac9,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), + }, + }, { } }; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f7b8b9894b2..c4851eff57b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -55,7 +55,6 @@ #include <linux/stddef.h> #include <linux/unistd.h> #include <linux/ptrace.h> -#include <linux/slab.h> #include <linux/user.h> #include <linux/delay.h> @@ -121,7 +120,9 @@ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; +#ifdef CONFIG_DMI RESERVE_BRK(dmi_alloc, 65536); +#endif unsigned int boot_cpu_id __read_mostly; @@ -312,16 +313,17 @@ static void __init reserve_brk(void) #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) static void __init relocate_initrd(void) { - + /* Assume only end is not page aligned */ u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 area_size = PAGE_ALIGN(ramdisk_size); u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; u64 ramdisk_here; unsigned long slop, clen, mapaddr; char *p, *q; /* We need to move the initrd down into lowmem */ - ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size, + ramdisk_here = find_e820_area(0, end_of_lowmem, area_size, PAGE_SIZE); if (ramdisk_here == -1ULL) @@ -330,7 +332,7 @@ static void __init relocate_initrd(void) /* Note: this includes all the lowmem currently occupied by the initrd, we rely on that fact to keep the data intact. */ - reserve_early(ramdisk_here, ramdisk_here + ramdisk_size, + reserve_early(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK"); initrd_start = ramdisk_here + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; @@ -374,9 +376,10 @@ static void __init relocate_initrd(void) static void __init reserve_initrd(void) { + /* Assume only end is not page aligned */ u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 ramdisk_end = ramdisk_image + ramdisk_size; + u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; if (!boot_params.hdr.type_of_loader || @@ -604,6 +607,16 @@ static int __init setup_elfcorehdr(char *arg) early_param("elfcorehdr", setup_elfcorehdr); #endif +static __init void reserve_ibft_region(void) +{ + unsigned long addr, size = 0; + + addr = find_ibft_region(&size); + + if (size) + reserve_early_overlap_ok(addr, addr + size, "ibft"); +} + #ifdef CONFIG_X86_RESERVE_LOW_64K static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { @@ -642,23 +655,48 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), }, }, - { /* - * AMI BIOS with low memory corruption was found on Intel DG45ID board. - * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will + * AMI BIOS with low memory corruption was found on Intel DG45ID and + * DG45FC boards. + * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will * match only DMI_BOARD_NAME and see if there is more bad products * with this vendor. */ + { .callback = dmi_low_memory_corruption, .ident = "AMI BIOS", .matches = { DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), }, }, + { + .callback = dmi_low_memory_corruption, + .ident = "AMI BIOS", + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "DG45FC"), + }, + }, #endif {} }; +static void __init trim_bios_range(void) +{ + /* + * A special case is the first 4Kb of memory; + * This is a BIOS owned area, not kernel ram, but generally + * not listed as such in the E820 table. + */ + e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); + /* + * special case: Some BIOSen report the PC BIOS + * area (640->1Mb) as ram even though it is not. + * take them out. + */ + e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -822,7 +860,7 @@ void __init setup_arch(char **cmdline_p) insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); - + trim_bios_range(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, @@ -881,6 +919,8 @@ void __init setup_arch(char **cmdline_p) */ find_smp_config(); + reserve_ibft_region(); + reserve_trampoline_memory(); #ifdef CONFIG_ACPI_SLEEP @@ -942,17 +982,11 @@ void __init setup_arch(char **cmdline_p) #endif initmem_init(0, max_pfn, acpi, k8); - -#ifdef CONFIG_X86_64 - /* - * dma32_reserve_bootmem() allocates bootmem which may conflict - * with the crashkernel command line, so do that after - * reserve_crashkernel() - */ - dma32_reserve_bootmem(); +#ifndef CONFIG_NO_BOOTMEM + early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT); #endif - reserve_ibft_region(); + dma32_reserve_bootmem(); #ifdef CONFIG_KVM_CLOCK kvmclock_init(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 35abcb8b00e..ef6370b00e7 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -137,7 +137,13 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) static void __init pcpu_fc_free(void *ptr, size_t size) { +#ifdef CONFIG_NO_BOOTMEM + u64 start = __pa(ptr); + u64 end = start + size; + free_early_partial(start, end); +#else free_bootmem(__pa(ptr), size); +#endif } static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index ec1de97600e..d801210945d 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -21,6 +21,7 @@ #include <linux/cache.h> #include <linux/interrupt.h> #include <linux/cpu.h> +#include <linux/gfp.h> #include <asm/mtrr.h> #include <asm/tlbflush.h> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 678d0b8c26f..763d815e27a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -48,6 +48,8 @@ #include <linux/err.h> #include <linux/nmi.h> #include <linux/tboot.h> +#include <linux/stackprotector.h> +#include <linux/gfp.h> #include <asm/acpi.h> #include <asm/desc.h> @@ -67,6 +69,7 @@ #include <linux/mc146818rtc.h> #include <asm/smpboot_hooks.h> +#include <asm/i8259.h> #ifdef CONFIG_X86_32 u8 apicid_2_node[MAX_APICID]; @@ -240,7 +243,10 @@ static void __cpuinit smp_callin(void) end_local_APIC_setup(); map_cpu_to_logical_apicid(); - notify_cpu_starting(cpuid); + /* + * Need to setup vector mappings before we enable interrupts. + */ + setup_vector_irq(smp_processor_id()); /* * Get our bogomips. * @@ -257,6 +263,8 @@ static void __cpuinit smp_callin(void) */ smp_store_cpu_info(cpuid); + notify_cpu_starting(cpuid); + /* * Allow the master to continue. */ @@ -286,9 +294,9 @@ notrace static void __cpuinit start_secondary(void *unused) check_tsc_sync_target(); if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); + legacy_pic->chip->mask(0); enable_NMI_through_LVT0(); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); } #ifdef CONFIG_X86_32 @@ -315,15 +323,18 @@ notrace static void __cpuinit start_secondary(void *unused) */ ipi_call_lock(); lock_vector_lock(); - __setup_vector_irq(smp_processor_id()); set_cpu_online(smp_processor_id(), true); unlock_vector_lock(); ipi_call_unlock(); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + x86_platform.nmi_init(); /* enable local interrupts */ local_irq_enable(); + /* to prevent fake stack check failure in clock setup */ + boot_init_stack_canary(); + x86_cpuinit.setup_percpu_clockev(); wmb(); @@ -1083,9 +1094,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_cpu_sibling_map(0); enable_IR_x2apic(); -#ifdef CONFIG_X86_64 default_setup_apic_routing(); -#endif if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); @@ -1213,11 +1222,12 @@ __init void prefill_possible_map(void) total_cpus = max_t(int, possible, num_processors + disabled_cpus); - if (possible > CONFIG_NR_CPUS) { + /* nr_cpu_ids could be reduced via nr_cpus= */ + if (possible > nr_cpu_ids) { printk(KERN_WARNING "%d Processors exceeds NR_CPUS limit of %d\n", - possible, CONFIG_NR_CPUS); - possible = CONFIG_NR_CPUS; + possible, nr_cpu_ids); + possible = nr_cpu_ids; } printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index dee1ff7cba5..196552bb412 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -25,191 +25,6 @@ #include <asm/syscalls.h> /* - * Perform the select(nd, in, out, ex, tv) and mmap() system - * calls. Linux/i386 didn't use to be able to handle more than - * 4 system call parameters, so these system calls used a memory - * block for parameter passing.. - */ - -struct mmap_arg_struct { - unsigned long addr; - unsigned long len; - unsigned long prot; - unsigned long flags; - unsigned long fd; - unsigned long offset; -}; - -asmlinkage int old_mmap(struct mmap_arg_struct __user *arg) -{ - struct mmap_arg_struct a; - int err = -EFAULT; - - if (copy_from_user(&a, arg, sizeof(a))) - goto out; - - err = -EINVAL; - if (a.offset & ~PAGE_MASK) - goto out; - - err = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, - a.fd, a.offset >> PAGE_SHIFT); -out: - return err; -} - - -struct sel_arg_struct { - unsigned long n; - fd_set __user *inp, *outp, *exp; - struct timeval __user *tvp; -}; - -asmlinkage int old_select(struct sel_arg_struct __user *arg) -{ - struct sel_arg_struct a; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - /* sys_select() does the appropriate kernel locking */ - return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp); -} - -/* - * sys_ipc() is the de-multiplexer for the SysV IPC calls.. - * - * This is really horribly ugly. - */ -asmlinkage int sys_ipc(uint call, int first, int second, - int third, void __user *ptr, long fifth) -{ - int version, ret; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - switch (call) { - case SEMOP: - return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL); - case SEMTIMEDOP: - return sys_semtimedop(first, (struct sembuf __user *)ptr, second, - (const struct timespec __user *)fifth); - - case SEMGET: - return sys_semget(first, second, third); - case SEMCTL: { - union semun fourth; - if (!ptr) - return -EINVAL; - if (get_user(fourth.__pad, (void __user * __user *) ptr)) - return -EFAULT; - return sys_semctl(first, second, third, fourth); - } - - case MSGSND: - return sys_msgsnd(first, (struct msgbuf __user *) ptr, - second, third); - case MSGRCV: - switch (version) { - case 0: { - struct ipc_kludge tmp; - if (!ptr) - return -EINVAL; - - if (copy_from_user(&tmp, - (struct ipc_kludge __user *) ptr, - sizeof(tmp))) - return -EFAULT; - return sys_msgrcv(first, tmp.msgp, second, - tmp.msgtyp, third); - } - default: - return sys_msgrcv(first, - (struct msgbuf __user *) ptr, - second, fifth, third); - } - case MSGGET: - return sys_msgget((key_t) first, second); - case MSGCTL: - return sys_msgctl(first, second, (struct msqid_ds __user *) ptr); - - case SHMAT: - switch (version) { - default: { - ulong raddr; - ret = do_shmat(first, (char __user *) ptr, second, &raddr); - if (ret) - return ret; - return put_user(raddr, (ulong __user *) third); - } - case 1: /* iBCS2 emulator entry point */ - if (!segment_eq(get_fs(), get_ds())) - return -EINVAL; - /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */ - return do_shmat(first, (char __user *) ptr, second, (ulong *) third); - } - case SHMDT: - return sys_shmdt((char __user *)ptr); - case SHMGET: - return sys_shmget(first, second, third); - case SHMCTL: - return sys_shmctl(first, second, - (struct shmid_ds __user *) ptr); - default: - return -ENOSYS; - } -} - -/* - * Old cruft - */ -asmlinkage int sys_uname(struct old_utsname __user *name) -{ - int err; - if (!name) - return -EFAULT; - down_read(&uts_sem); - err = copy_to_user(name, utsname(), sizeof(*name)); - up_read(&uts_sem); - return err? -EFAULT:0; -} - -asmlinkage int sys_olduname(struct oldold_utsname __user *name) -{ - int error; - - if (!name) - return -EFAULT; - if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) - return -EFAULT; - - down_read(&uts_sem); - - error = __copy_to_user(&name->sysname, &utsname()->sysname, - __OLD_UTS_LEN); - error |= __put_user(0, name->sysname + __OLD_UTS_LEN); - error |= __copy_to_user(&name->nodename, &utsname()->nodename, - __OLD_UTS_LEN); - error |= __put_user(0, name->nodename + __OLD_UTS_LEN); - error |= __copy_to_user(&name->release, &utsname()->release, - __OLD_UTS_LEN); - error |= __put_user(0, name->release + __OLD_UTS_LEN); - error |= __copy_to_user(&name->version, &utsname()->version, - __OLD_UTS_LEN); - error |= __put_user(0, name->version + __OLD_UTS_LEN); - error |= __copy_to_user(&name->machine, &utsname()->machine, - __OLD_UTS_LEN); - error |= __put_user(0, name->machine + __OLD_UTS_LEN); - - up_read(&uts_sem); - - error = error ? -EFAULT : 0; - - return error; -} - - -/* * Do a system call from kernel instead of calling sys_execve so we * end up with proper pt_regs. */ diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 8aa2057efd1..ff14a5044ce 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -209,15 +209,3 @@ bottomup: return addr; } - - -SYSCALL_DEFINE1(uname, struct new_utsname __user *, name) -{ - int err; - down_read(&uts_sem); - err = copy_to_user(name, utsname(), sizeof(*name)); - up_read(&uts_sem); - if (personality(current->personality) == PER_LINUX32) - err |= copy_to_user(&name->machine, "i686", 5); - return err ? -EFAULT : 0; -} diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 15228b5d3eb..8b372934121 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -81,7 +81,7 @@ ENTRY(sys_call_table) .long sys_settimeofday .long sys_getgroups16 /* 80 */ .long sys_setgroups16 - .long old_select + .long sys_old_select .long sys_symlink .long sys_lstat .long sys_readlink /* 85 */ @@ -89,7 +89,7 @@ ENTRY(sys_call_table) .long sys_swapon .long sys_reboot .long sys_old_readdir - .long old_mmap /* 90 */ + .long sys_old_mmap /* 90 */ .long sys_munmap .long sys_truncate .long sys_ftruncate diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index be2573448ed..fb5cc5e14cf 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -70,11 +70,11 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) * manually to deassert NMI lines for the watchdog if run * on an 82489DX-based system. */ - spin_lock(&i8259A_lock); + raw_spin_lock(&i8259A_lock); outb(0x0c, PIC_MASTER_OCW3); /* Ack the IRQ; AEOI will end it automatically. */ inb(PIC_MASTER_POLL); - spin_unlock(&i8259A_lock); + raw_spin_unlock(&i8259A_lock); } global_clock_event->event_handler(global_clock_event); diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 364d015efeb..17b03dd3a6b 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -9,6 +9,7 @@ #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/kernel.h> +#include <linux/slab.h> #include <asm/mmu_context.h> #include <asm/uv/uv.h> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 33399176512..1168e445418 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -534,6 +534,9 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) get_debugreg(dr6, 6); + /* Filter out all the reserved bits which are preset to 1 */ + dr6 &= ~DR6_RESERVED; + /* Catch kmemcheck conditions first of all! */ if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) return; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 597683aa5ba..9faf91ae184 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -50,7 +50,7 @@ u64 native_sched_clock(void) * unstable. We do this because unlike Time Of Day, * the scheduler clock tolerates small errors and it's * very important for it to be as fast as the platform - * can achive it. ) + * can achieve it. ) */ if (unlikely(tsc_disabled)) { /* No locking but a rare wrong value is not a big deal: */ @@ -740,7 +740,7 @@ static cycle_t __vsyscall_fn vread_tsc(void) } #endif -static void resume_tsc(void) +static void resume_tsc(struct clocksource *cs) { clocksource_tsc.cycle_last = 0; } @@ -806,7 +806,7 @@ static void __init check_system_tsc_reliable(void) unsigned long res_low, res_high; rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); - /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */ + /* Geode_LX - the OLPC CPU has a very reliable TSC */ if (res_low & RTSC_SUSP) tsc_clocksource_reliable = 1; #endif diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index ece73d8e324..1d40336b030 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/rbtree.h> +#include <linux/slab.h> #include <linux/irq.h> #include <asm/apic.h> diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c index 36afb98675a..309c70fb775 100644 --- a/arch/x86/kernel/uv_sysfs.c +++ b/arch/x86/kernel/uv_sysfs.c @@ -54,19 +54,19 @@ static int __init sgi_uv_sysfs_init(void) if (!sgi_uv_kobj) sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj); if (!sgi_uv_kobj) { - printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n"); + printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n"); return -EINVAL; } ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr); if (ret) { - printk(KERN_WARNING "sysfs_create_file partition_id failed \n"); + printk(KERN_WARNING "sysfs_create_file partition_id failed\n"); return ret; } ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr); if (ret) { - printk(KERN_WARNING "sysfs_create_file coherence_id failed \n"); + printk(KERN_WARNING "sysfs_create_file coherence_id failed\n"); return ret; } diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c index 3c84aa001c1..56e421bc379 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/kernel/uv_time.c @@ -19,6 +19,7 @@ * Copyright (c) Dimitri Sivanich */ #include <linux/clockchips.h> +#include <linux/slab.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/uv_hub.h> @@ -282,10 +283,21 @@ static int uv_rtc_unset_timer(int cpu, int force) /* * Read the RTC. + * + * Starting with HUB rev 2.0, the UV RTC register is replicated across all + * cachelines of it's own page. This allows faster simultaneous reads + * from a given socket. */ static cycle_t uv_read_rtc(struct clocksource *cs) { - return (cycle_t)uv_read_local_mmr(UVH_RTC); + unsigned long offset; + + if (uv_get_min_hub_revision_id() == 1) + offset = 0; + else + offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE; + + return (cycle_t)uv_read_local_mmr(UVH_RTC | offset); } /* diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 34a279a7471..e680ea52db9 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -49,11 +49,6 @@ extern int no_broadcast; char visws_board_type = -1; char visws_board_rev = -1; -int is_visws_box(void) -{ - return visws_board_type >= 0; -} - static void __init visws_time_init(void) { printk(KERN_INFO "Starting Cobalt Timer system clock\n"); @@ -242,6 +237,8 @@ void __init visws_early_detect(void) x86_init.irqs.pre_vector_init = visws_pre_intr_init; x86_init.irqs.trap_init = visws_trap_init; x86_init.timers.timer_init = visws_time_init; + x86_init.pci.init = pci_visws_init; + x86_init.pci.init_irq = x86_init_noop; /* * Install reboot quirks: @@ -508,7 +505,7 @@ static struct irq_chip cobalt_irq_type = { */ static unsigned int startup_piix4_master_irq(unsigned int irq) { - init_8259A(0); + legacy_pic->init(0); return startup_cobalt_irq(irq); } @@ -532,9 +529,6 @@ static struct irq_chip piix4_master_irq_type = { static struct irq_chip piix4_virtual_irq_type = { .name = "PIIX4-virtual", - .shutdown = disable_8259A_irq, - .enable = enable_8259A_irq, - .disable = disable_8259A_irq, }; @@ -559,7 +553,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) struct irq_desc *desc; unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); /* Find out what's interrupting in the PIIX4 master 8259 */ outb(0x0c, 0x20); /* OCW3 Poll command */ @@ -596,7 +590,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) outb(0x60 + realirq, 0x20); } - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); desc = irq_to_desc(realirq); @@ -609,12 +603,12 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) handle_IRQ_event(realirq, desc->action); if (!(desc->status & IRQ_DISABLED)) - enable_8259A_irq(realirq); + legacy_pic->chip->unmask(realirq); return IRQ_HANDLED; out_unlock: - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); return IRQ_NONE; } @@ -628,6 +622,12 @@ static struct irqaction cascade_action = { .name = "cascade", }; +static inline void set_piix4_virtual_irq_type(void) +{ + piix4_virtual_irq_type.shutdown = i8259A_chip.mask; + piix4_virtual_irq_type.enable = i8259A_chip.unmask; + piix4_virtual_irq_type.disable = i8259A_chip.mask; +} void init_VISWS_APIC_irqs(void) { @@ -653,6 +653,7 @@ void init_VISWS_APIC_irqs(void) desc->chip = &piix4_master_irq_type; } else if (i < CO_IRQ_APIC0) { + set_piix4_virtual_irq_type(); desc->chip = &piix4_virtual_irq_type; } else if (IS_CO_APIC(i)) { diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index d430e4c3019..ce9fbacb752 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -28,11 +28,13 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/sched.h> +#include <linux/gfp.h> #include <asm/vmi.h> #include <asm/io.h> #include <asm/fixmap.h> #include <asm/apicdef.h> #include <asm/apic.h> +#include <asm/pgalloc.h> #include <asm/processor.h> #include <asm/timer.h> #include <asm/vmi_time.h> @@ -266,30 +268,6 @@ static void vmi_nop(void) { } -#ifdef CONFIG_HIGHPTE -static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) -{ - void *va = kmap_atomic(page, type); - - /* - * Internally, the VMI ROM must map virtual addresses to physical - * addresses for processing MMU updates. By the time MMU updates - * are issued, this information is typically already lost. - * Fortunately, the VMI provides a cache of mapping slots for active - * page tables. - * - * We use slot zero for the linear mapping of physical memory, and - * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1. - * - * args: SLOT VA COUNT PFN - */ - BUG_ON(type != KM_PTE0 && type != KM_PTE1); - vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page)); - - return va; -} -#endif - static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) { vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); @@ -640,6 +618,12 @@ static inline int __init activate_vmi(void) u64 reloc; const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; + /* + * Prevent page tables from being allocated in highmem, even if + * CONFIG_HIGHPTE is enabled. + */ + __userpte_alloc_gfp &= ~__GFP_HIGHMEM; + if (call_vrom_func(vmi_rom, vmi_init) != 0) { printk(KERN_ERR "VMI ROM failed to initialize!"); return 0; @@ -778,10 +762,6 @@ static inline int __init activate_vmi(void) /* Set linear is needed in all cases */ vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); -#ifdef CONFIG_HIGHPTE - if (vmi_ops.set_linear_mapping) - pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte; -#endif /* * These MUST always be patched. Don't support indirect jumps diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 74c92bb194d..5e1ff66ecd7 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -79,11 +79,7 @@ unsigned long vmi_tsc_khz(void) static inline unsigned int vmi_get_timer_vector(void) { -#ifdef CONFIG_X86_IO_APIC - return FIRST_DEVICE_VECTOR; -#else - return FIRST_EXTERNAL_VECTOR; -#endif + return IRQ0_VECTOR; } /** vmi clockchip */ @@ -171,7 +167,7 @@ static int vmi_timer_next_event(unsigned long delta, { /* Unfortunately, set_next_event interface only passes relative * expiry, but we want absolute expiry. It'd be better if were - * were passed an aboslute expiry, since a bunch of time may + * were passed an absolute expiry, since a bunch of time may * have been stolen between the time the delta is computed and * when we set the alarm below. */ cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT)); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index f92a0da608c..2cc249718c4 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -291,8 +291,8 @@ SECTIONS .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { __smp_locks = .; *(.smp_locks) - __smp_locks_end = .; . = ALIGN(PAGE_SIZE); + __smp_locks_end = .; } #ifdef CONFIG_X86_64 @@ -341,7 +341,7 @@ SECTIONS * Per-cpu symbols which need to be offset from __per_cpu_load * for the boot processor. */ -#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load +#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load INIT_PER_CPU(gdt_page); INIT_PER_CPU(irq_stack_union); @@ -352,7 +352,7 @@ INIT_PER_CPU(irq_stack_union); "kernel image bigger than KERNEL_IMAGE_SIZE"); #ifdef CONFIG_SMP -. = ASSERT((per_cpu__irq_stack_union == 0), +. = ASSERT((irq_stack_union == 0), "irq_stack_union is not at start of per-cpu area"); #endif diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 9055e5872ff..1c0c6ab9c60 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -301,7 +301,8 @@ static int __init vsyscall_init(void) register_sysctl_table(kernel_root_table2); #endif on_each_cpu(cpu_vsyscall_init, NULL, 1); - hotcpu_notifier(cpu_vsyscall_notifier, 0); + /* notifier priority > KVM */ + hotcpu_notifier(cpu_vsyscall_notifier, 30); return 0; } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index ccd179dec36..61a1e8c7e19 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -4,9 +4,11 @@ * For licencing details see kernel-base/COPYING */ #include <linux/init.h> +#include <linux/ioport.h> #include <asm/bios_ebda.h> #include <asm/paravirt.h> +#include <asm/pci_x86.h> #include <asm/mpspec.h> #include <asm/setup.h> #include <asm/apic.h> @@ -70,16 +72,25 @@ struct x86_init_ops x86_init __initdata = { .iommu = { .iommu_init = iommu_init_noop, }, + + .pci = { + .init = x86_default_pci_init, + .init_irq = x86_default_pci_init_irq, + .fixup_irqs = x86_default_pci_fixup_irqs, + }, }; struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { .setup_percpu_clockev = setup_secondary_APIC_clock, }; +static void default_nmi_init(void) { }; + struct x86_platform_ops x86_platform = { .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, .iommu_shutdown = iommu_shutdown_noop, .is_untracked_pat_range = is_ISA_range, + .nmi_init = default_nmi_init }; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index c5ee17e8c6d..782c3a362ec 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -337,6 +337,7 @@ void __ref xsave_cntxt_init(void) cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); xstate_size = ebx; + update_regset_xstate_info(xstate_size, pcntxt_mask); prepare_fx_sw_frame(); setup_xstate_init(); diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 4cd49833246..970bbd47951 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -29,6 +29,7 @@ config KVM select HAVE_KVM_EVENTFD select KVM_APIC_ARCHITECTURE select USER_RETURN_NOTIFIER + select KVM_MMIO ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent @@ -65,6 +66,7 @@ config KVM_AMD # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. +source drivers/vhost/Kconfig source drivers/lguest/Kconfig source drivers/virtio/Kconfig diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7e8faea4651..4dade6ac082 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -32,7 +32,7 @@ #include <linux/module.h> #include <asm/kvm_emulate.h> -#include "mmu.h" /* for is_long_mode() */ +#include "x86.h" /* * Opcode effective-address decode tables. @@ -76,6 +76,8 @@ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupMask 0xff /* Group number stored in bits 0:7 */ /* Misc flags */ +#define Lock (1<<26) /* lock prefix is allowed for the instruction */ +#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) /* Source 2 operand type */ #define Src2None (0<<29) @@ -88,39 +90,40 @@ enum { Group1_80, Group1_81, Group1_82, Group1_83, Group1A, Group3_Byte, Group3, Group4, Group5, Group7, + Group8, Group9, }; static u32 opcode_table[256] = { /* 0x00 - 0x07 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x08 - 0x0F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, 0, /* 0x10 - 0x17 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x18 - 0x1F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x20 - 0x27 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x28 - 0x2F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 0, 0, 0, 0, /* 0x30 - 0x37 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 0, 0, 0, 0, /* 0x38 - 0x3F */ @@ -156,7 +159,7 @@ static u32 opcode_table[256] = { Group | Group1_80, Group | Group1_81, Group | Group1_82, Group | Group1_83, ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, /* 0x88 - 0x8F */ ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, @@ -210,7 +213,7 @@ static u32 opcode_table[256] = { SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xF0 - 0xF7 */ 0, 0, 0, 0, - ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, + ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3, /* 0xF8 - 0xFF */ ImplicitOps, 0, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, @@ -218,16 +221,20 @@ static u32 opcode_table[256] = { static u32 twobyte_table[256] = { /* 0x00 - 0x0F */ - 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, + 0, Group | GroupDual | Group7, 0, 0, + 0, ImplicitOps, ImplicitOps | Priv, 0, + ImplicitOps | Priv, ImplicitOps | Priv, 0, 0, + 0, ImplicitOps | ModRM, 0, 0, /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, /* 0x20 - 0x2F */ - ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, + ModRM | ImplicitOps | Priv, ModRM | Priv, + ModRM | ImplicitOps | Priv, ModRM | Priv, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 - 0x3F */ - ImplicitOps, 0, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, + ImplicitOps | Priv, 0, ImplicitOps | Priv, 0, + ImplicitOps, ImplicitOps | Priv, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 - 0x47 */ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, @@ -257,21 +264,23 @@ static u32 twobyte_table[256] = { DstMem | SrcReg | Src2CL | ModRM, 0, 0, /* 0xA8 - 0xAF */ ImplicitOps | Stack, ImplicitOps | Stack, - 0, DstMem | SrcReg | ModRM | BitOp, + 0, DstMem | SrcReg | ModRM | BitOp | Lock, DstMem | SrcReg | Src2ImmByte | ModRM, DstMem | SrcReg | Src2CL | ModRM, ModRM, 0, /* 0xB0 - 0xB7 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, - DstMem | SrcReg | ModRM | BitOp, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, + 0, DstMem | SrcReg | ModRM | BitOp | Lock, 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xB8 - 0xBF */ - 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, + 0, 0, + Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock, 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xC0 - 0xCF */ - 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, + 0, 0, 0, DstMem | SrcReg | ModRM | Mov, + 0, 0, 0, Group | GroupDual | Group9, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xD0 - 0xDF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -283,25 +292,41 @@ static u32 twobyte_table[256] = { static u32 group_table[] = { [Group1_80*8] = - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM, [Group1_81*8] = - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM, [Group1_82*8] = - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64, [Group1_83*8] = - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM, [Group1A*8] = DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, [Group3_Byte*8] = @@ -320,24 +345,39 @@ static u32 group_table[] = { SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, [Group7*8] = - 0, 0, ModRM | SrcMem, ModRM | SrcMem, + 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, SrcNone | ModRM | DstMem | Mov, 0, - SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, + SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv, + [Group8*8] = + 0, 0, 0, 0, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, + [Group9*8] = + 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0, }; static u32 group2_table[] = { [Group7*8] = - SrcNone | ModRM, 0, 0, SrcNone | ModRM, + SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM, SrcNone | ModRM | DstMem | Mov, 0, SrcMem16 | ModRM | Mov, 0, + [Group9*8] = + 0, 0, 0, 0, 0, 0, 0, 0, }; /* EFLAGS bit definitions. */ +#define EFLG_ID (1<<21) +#define EFLG_VIP (1<<20) +#define EFLG_VIF (1<<19) +#define EFLG_AC (1<<18) #define EFLG_VM (1<<17) #define EFLG_RF (1<<16) +#define EFLG_IOPL (3<<12) +#define EFLG_NT (1<<14) #define EFLG_OF (1<<11) #define EFLG_DF (1<<10) #define EFLG_IF (1<<9) +#define EFLG_TF (1<<8) #define EFLG_SF (1<<7) #define EFLG_ZF (1<<6) #define EFLG_AF (1<<4) @@ -606,7 +646,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, if (linear < fc->start || linear >= fc->end) { size = min(15UL, PAGE_SIZE - offset_in_page(linear)); - rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); + rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL); if (rc) return rc; fc->start = linear; @@ -661,11 +701,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, op_bytes = 3; *address = 0; rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, - ctxt->vcpu); + ctxt->vcpu, NULL); if (rc) return rc; rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, - ctxt->vcpu); + ctxt->vcpu, NULL); return rc; } @@ -889,6 +929,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) switch (mode) { case X86EMUL_MODE_REAL: + case X86EMUL_MODE_VM86: case X86EMUL_MODE_PROT16: def_op_bytes = def_ad_bytes = 2; break; @@ -975,7 +1016,7 @@ done_prefixes: } if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { - kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");; + kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction"); return -1; } @@ -1196,13 +1237,56 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, rc = ops->read_emulated(register_address(c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]), dest, len, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); return rc; } +static int emulate_popf(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *dest, int len) +{ + int rc; + unsigned long val, change_mask; + int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu); + + rc = emulate_pop(ctxt, ops, &val, len); + if (rc != X86EMUL_CONTINUE) + return rc; + + change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF + | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID; + + switch(ctxt->mode) { + case X86EMUL_MODE_PROT64: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT16: + if (cpl == 0) + change_mask |= EFLG_IOPL; + if (cpl <= iopl) + change_mask |= EFLG_IF; + break; + case X86EMUL_MODE_VM86: + if (iopl < 3) { + kvm_inject_gp(ctxt->vcpu, 0); + return X86EMUL_PROPAGATE_FAULT; + } + change_mask |= EFLG_IF; + break; + default: /* real mode */ + change_mask |= (EFLG_IOPL | EFLG_IF); + break; + } + + *(unsigned long *)dest = + (ctxt->eflags & ~change_mask) | (val & change_mask); + + return rc; +} + static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) { struct decode_cache *c = &ctxt->decode; @@ -1225,7 +1309,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, if (rc != 0) return rc; - rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, 1, seg); + rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg); return rc; } @@ -1370,7 +1454,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, int rc; rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || @@ -1385,7 +1469,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, (u32) c->regs[VCPU_REGS_RBX]; rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; ctxt->eflags |= EFLG_ZF; } @@ -1407,7 +1491,7 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); if (rc) return rc; - rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS); + rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS); return rc; } @@ -1451,7 +1535,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, &c->dst.val, c->dst.bytes, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; break; case OP_NONE: @@ -1514,9 +1598,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) u64 msr_data; /* syscall is not available in real mode */ - if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL - || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) - return -1; + if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) + return X86EMUL_UNHANDLEABLE; setup_syscalls_segments(ctxt, &cs, &ss); @@ -1553,7 +1636,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); } - return 0; + return X86EMUL_CONTINUE; } static int @@ -1563,22 +1646,17 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) struct kvm_segment cs, ss; u64 msr_data; - /* inject #UD if LOCK prefix is used */ - if (c->lock_prefix) - return -1; - - /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL || - !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { + /* inject #GP if in real mode */ + if (ctxt->mode == X86EMUL_MODE_REAL) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_UNHANDLEABLE; } /* XXX sysenter/sysexit have not been tested in 64bit mode. * Therefore, we inject an #UD. */ if (ctxt->mode == X86EMUL_MODE_PROT64) - return -1; + return X86EMUL_UNHANDLEABLE; setup_syscalls_segments(ctxt, &cs, &ss); @@ -1587,13 +1665,13 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) case X86EMUL_MODE_PROT32: if ((msr_data & 0xfffc) == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } break; case X86EMUL_MODE_PROT64: if (msr_data == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } break; } @@ -1618,7 +1696,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); c->regs[VCPU_REGS_RSP] = msr_data; - return 0; + return X86EMUL_CONTINUE; } static int @@ -1629,21 +1707,11 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) u64 msr_data; int usermode; - /* inject #UD if LOCK prefix is used */ - if (c->lock_prefix) - return -1; - - /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL - || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { - kvm_inject_gp(ctxt->vcpu, 0); - return -1; - } - - /* sysexit must be called from CPL 0 */ - if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) { + /* inject #GP if in real mode or Virtual 8086 mode */ + if (ctxt->mode == X86EMUL_MODE_REAL || + ctxt->mode == X86EMUL_MODE_VM86) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_UNHANDLEABLE; } setup_syscalls_segments(ctxt, &cs, &ss); @@ -1661,7 +1729,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) cs.selector = (u16)(msr_data + 16); if ((msr_data & 0xfffc) == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } ss.selector = (u16)(msr_data + 24); break; @@ -1669,7 +1737,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) cs.selector = (u16)(msr_data + 32); if (msr_data == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } ss.selector = cs.selector + 8; cs.db = 0; @@ -1685,7 +1753,58 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; - return 0; + return X86EMUL_CONTINUE; +} + +static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) +{ + int iopl; + if (ctxt->mode == X86EMUL_MODE_REAL) + return false; + if (ctxt->mode == X86EMUL_MODE_VM86) + return true; + iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl; +} + +static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 port, u16 len) +{ + struct kvm_segment tr_seg; + int r; + u16 io_bitmap_ptr; + u8 perm, bit_idx = port & 0x7; + unsigned mask = (1 << len) - 1; + + kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR); + if (tr_seg.unusable) + return false; + if (tr_seg.limit < 103) + return false; + r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, + NULL); + if (r != X86EMUL_CONTINUE) + return false; + if (io_bitmap_ptr + port/8 > tr_seg.limit) + return false; + r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1, + ctxt->vcpu, NULL); + if (r != X86EMUL_CONTINUE) + return false; + if ((perm >> bit_idx) & mask) + return false; + return true; +} + +static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 port, u16 len) +{ + if (emulator_bad_iopl(ctxt)) + if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) + return false; + return true; } int @@ -1709,6 +1828,18 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); saved_eip = c->eip; + /* LOCK prefix is allowed only with some instructions */ + if (c->lock_prefix && !(c->d & Lock)) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; + } + + /* Privileged instruction can be executed only in CPL=0 */ + if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) memop = c->modrm_ea; @@ -1749,7 +1880,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) &c->src.val, c->src.bytes, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; c->src.orig_val = c->src.val; } @@ -1768,12 +1899,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) c->dst.ptr = (void *)c->dst.ptr + (c->src.val & mask) / 8; } - if (!(c->d & Mov) && - /* optimisation - avoid slow emulated read */ - ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0)) - goto done; + if (!(c->d & Mov)) { + /* optimisation - avoid slow emulated read */ + rc = ops->read_emulated((unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + goto done; + } } c->dst.orig_val = c->dst.val; @@ -1876,7 +2010,12 @@ special_insn: break; case 0x6c: /* insb */ case 0x6d: /* insw/insd */ - if (kvm_emulate_pio_string(ctxt->vcpu, + if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (kvm_emulate_pio_string(ctxt->vcpu, 1, (c->d & ByteOp) ? 1 : c->op_bytes, c->rep_prefix ? @@ -1892,6 +2031,11 @@ special_insn: return 0; case 0x6e: /* outsb */ case 0x6f: /* outsw/outsd */ + if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } if (kvm_emulate_pio_string(ctxt->vcpu, 0, (c->d & ByteOp) ? 1 : c->op_bytes, @@ -1978,25 +2122,19 @@ special_insn: break; case 0x8e: { /* mov seg, r/m16 */ uint16_t sel; - int type_bits; - int err; sel = c->src.val; - if (c->modrm_reg == VCPU_SREG_SS) - toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); - if (c->modrm_reg <= 5) { - type_bits = (c->modrm_reg == 1) ? 9 : 1; - err = kvm_load_segment_descriptor(ctxt->vcpu, sel, - type_bits, c->modrm_reg); - } else { - printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n", - c->modrm); - goto cannot_emulate; + if (c->modrm_reg == VCPU_SREG_CS || + c->modrm_reg > VCPU_SREG_GS) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; } - if (err < 0) - goto cannot_emulate; + if (c->modrm_reg == VCPU_SREG_SS) + toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); + + rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg); c->dst.type = OP_NONE; /* Disable writeback. */ break; @@ -2025,7 +2163,10 @@ special_insn: c->dst.type = OP_REG; c->dst.ptr = (unsigned long *) &ctxt->eflags; c->dst.bytes = c->op_bytes; - goto pop_instruction; + rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); + if (rc != X86EMUL_CONTINUE) + goto done; + break; case 0xa0 ... 0xa1: /* mov */ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; c->dst.val = c->src.val; @@ -2039,11 +2180,12 @@ special_insn: c->dst.ptr = (unsigned long *)register_address(c, es_base(ctxt), c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated(register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), + rc = ops->read_emulated(register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]), &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0) + c->dst.bytes, ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; register_address_increment(c, &c->regs[VCPU_REGS_RSI], (ctxt->eflags & EFLG_DF) ? -c->dst.bytes @@ -2058,10 +2200,11 @@ special_insn: c->src.ptr = (unsigned long *)register_address(c, seg_override_base(ctxt, c), c->regs[VCPU_REGS_RSI]); - if ((rc = ops->read_emulated((unsigned long)c->src.ptr, - &c->src.val, - c->src.bytes, - ctxt->vcpu)) != 0) + rc = ops->read_emulated((unsigned long)c->src.ptr, + &c->src.val, + c->src.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; c->dst.type = OP_NONE; /* Disable writeback. */ @@ -2069,10 +2212,11 @@ special_insn: c->dst.ptr = (unsigned long *)register_address(c, es_base(ctxt), c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) + rc = ops->read_emulated((unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); @@ -2102,12 +2246,13 @@ special_insn: c->dst.type = OP_REG; c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - if ((rc = ops->read_emulated(register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) + rc = ops->read_emulated(register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]), + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; register_address_increment(c, &c->regs[VCPU_REGS_RSI], (ctxt->eflags & EFLG_DF) ? -c->dst.bytes @@ -2163,11 +2308,9 @@ special_insn: case 0xe9: /* jmp rel */ goto jmp; case 0xea: /* jmp far */ - if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9, - VCPU_SREG_CS) < 0) { - DPRINTF("jmp far: Failed to load CS descriptor\n"); - goto cannot_emulate; - } + if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, + VCPU_SREG_CS)) + goto done; c->eip = c->src.val; break; @@ -2185,7 +2328,13 @@ special_insn: case 0xef: /* out (e/r)ax,dx */ port = c->regs[VCPU_REGS_RDX]; io_dir_in = 0; - do_io: if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, + do_io: + if (!emulator_io_permited(ctxt, ops, port, + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, (c->d & ByteOp) ? 1 : c->op_bytes, port) != 0) { c->eip = saved_eip; @@ -2210,13 +2359,21 @@ special_insn: c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfa: /* cli */ - ctxt->eflags &= ~X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ + if (emulator_bad_iopl(ctxt)) + kvm_inject_gp(ctxt->vcpu, 0); + else { + ctxt->eflags &= ~X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } break; case 0xfb: /* sti */ - toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); - ctxt->eflags |= X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ + if (emulator_bad_iopl(ctxt)) + kvm_inject_gp(ctxt->vcpu, 0); + else { + toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); + ctxt->eflags |= X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } break; case 0xfc: /* cld */ ctxt->eflags &= ~EFLG_DF; @@ -2319,8 +2476,9 @@ twobyte_insn: } break; case 0x05: /* syscall */ - if (emulate_syscall(ctxt) == -1) - goto cannot_emulate; + rc = emulate_syscall(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; else goto writeback; break; @@ -2391,14 +2549,16 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0x34: /* sysenter */ - if (emulate_sysenter(ctxt) == -1) - goto cannot_emulate; + rc = emulate_sysenter(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; else goto writeback; break; case 0x35: /* sysexit */ - if (emulate_sysexit(ctxt) == -1) - goto cannot_emulate; + rc = emulate_sysexit(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; else goto writeback; break; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 296aba49472..0150affad25 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -32,6 +32,7 @@ #define pr_fmt(fmt) "pit: " fmt #include <linux/kvm_host.h> +#include <linux/slab.h> #include "irq.h" #include "i8254.h" @@ -242,11 +243,11 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, irq_ack_notifier); - spin_lock(&ps->inject_lock); + raw_spin_lock(&ps->inject_lock); if (atomic_dec_return(&ps->pit_timer.pending) < 0) atomic_inc(&ps->pit_timer.pending); ps->irq_ack = 1; - spin_unlock(&ps->inject_lock); + raw_spin_unlock(&ps->inject_lock); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -467,6 +468,9 @@ static int pit_ioport_read(struct kvm_io_device *this, return -EOPNOTSUPP; addr &= KVM_PIT_CHANNEL_MASK; + if (addr == 3) + return 0; + s = &pit_state->channels[addr]; mutex_lock(&pit_state->lock); @@ -602,7 +606,7 @@ static const struct kvm_io_device_ops speaker_dev_ops = { .write = speaker_ioport_write, }; -/* Caller must have writers lock on slots_lock */ +/* Caller must hold slots_lock */ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) { struct kvm_pit *pit; @@ -621,7 +625,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) mutex_init(&pit->pit_state.lock); mutex_lock(&pit->pit_state.lock); - spin_lock_init(&pit->pit_state.inject_lock); + raw_spin_lock_init(&pit->pit_state.inject_lock); kvm->arch.vpit = pit; pit->kvm = kvm; @@ -642,13 +646,13 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); kvm_iodevice_init(&pit->dev, &pit_dev_ops); - ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev); if (ret < 0) goto fail; if (flags & KVM_PIT_SPEAKER_DUMMY) { kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); - ret = __kvm_io_bus_register_dev(&kvm->pio_bus, + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); if (ret < 0) goto fail_unregister; @@ -657,11 +661,12 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) return pit; fail_unregister: - __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); fail: - if (pit->irq_source_id >= 0) - kvm_free_irq_source_id(kvm, pit->irq_source_id); + kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); + kvm_free_irq_source_id(kvm, pit->irq_source_id); kfree(pit); return NULL; @@ -720,12 +725,12 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) /* Try to inject pending interrupts when * last one has been acked. */ - spin_lock(&ps->inject_lock); + raw_spin_lock(&ps->inject_lock); if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { ps->irq_ack = 0; inject = 1; } - spin_unlock(&ps->inject_lock); + raw_spin_unlock(&ps->inject_lock); if (inject) __inject_pit_timer_intr(kvm); } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index d4c1c7ffdc0..900d6b0ba7c 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -27,7 +27,7 @@ struct kvm_kpit_state { u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; - spinlock_t inject_lock; + raw_spinlock_t inject_lock; unsigned long irq_ack; struct kvm_irq_ack_notifier irq_ack_notifier; }; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index d057c0cbd24..a790fa128a9 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -26,6 +26,7 @@ * Port from Qemu. */ #include <linux/mm.h> +#include <linux/slab.h> #include <linux/bitops.h> #include "irq.h" @@ -44,18 +45,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) * Other interrupt may be delivered to PIC while lock is dropped but * it should be safe since PIC state is already updated at this stage. */ - spin_unlock(&s->pics_state->lock); + raw_spin_unlock(&s->pics_state->lock); kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); - spin_lock(&s->pics_state->lock); + raw_spin_lock(&s->pics_state->lock); } void kvm_pic_clear_isr_ack(struct kvm *kvm) { struct kvm_pic *s = pic_irqchip(kvm); - spin_lock(&s->lock); + + raw_spin_lock(&s->lock); s->pics[0].isr_ack = 0xff; s->pics[1].isr_ack = 0xff; - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); } /* @@ -156,9 +158,9 @@ static void pic_update_irq(struct kvm_pic *s) void kvm_pic_update_irq(struct kvm_pic *s) { - spin_lock(&s->lock); + raw_spin_lock(&s->lock); pic_update_irq(s); - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); } int kvm_pic_set_irq(void *opaque, int irq, int level) @@ -166,14 +168,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) struct kvm_pic *s = opaque; int ret = -1; - spin_lock(&s->lock); + raw_spin_lock(&s->lock); if (irq >= 0 && irq < PIC_NUM_PINS) { ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); pic_update_irq(s); trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, s->pics[irq >> 3].imr, ret == 0); } - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return ret; } @@ -203,7 +205,7 @@ int kvm_pic_read_irq(struct kvm *kvm) int irq, irq2, intno; struct kvm_pic *s = pic_irqchip(kvm); - spin_lock(&s->lock); + raw_spin_lock(&s->lock); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { pic_intack(&s->pics[0], irq); @@ -228,7 +230,7 @@ int kvm_pic_read_irq(struct kvm *kvm) intno = s->pics[0].irq_base + irq; } pic_update_irq(s); - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return intno; } @@ -442,7 +444,7 @@ static int picdev_write(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte write\n"); return 0; } - spin_lock(&s->lock); + raw_spin_lock(&s->lock); switch (addr) { case 0x20: case 0x21: @@ -455,7 +457,7 @@ static int picdev_write(struct kvm_io_device *this, elcr_ioport_write(&s->pics[addr & 1], addr, data); break; } - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return 0; } @@ -472,7 +474,7 @@ static int picdev_read(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte read\n"); return 0; } - spin_lock(&s->lock); + raw_spin_lock(&s->lock); switch (addr) { case 0x20: case 0x21: @@ -486,7 +488,7 @@ static int picdev_read(struct kvm_io_device *this, break; } *(unsigned char *)val = data; - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return 0; } @@ -520,7 +522,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) return NULL; - spin_lock_init(&s->lock); + raw_spin_lock_init(&s->lock); s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; s->pics[1].elcr_mask = 0xde; @@ -533,7 +535,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) * Initialize PIO device */ kvm_iodevice_init(&s->dev, &picdev_ops); - ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev); + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev); + mutex_unlock(&kvm->slots_lock); if (ret < 0) { kfree(s); return NULL; @@ -541,3 +545,14 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) return s; } + +void kvm_destroy_pic(struct kvm *kvm) +{ + struct kvm_pic *vpic = kvm->arch.vpic; + + if (vpic) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev); + kvm->arch.vpic = NULL; + kfree(vpic); + } +} diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index be399e207d5..34b15915754 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -62,7 +62,7 @@ struct kvm_kpic_state { }; struct kvm_pic { - spinlock_t lock; + raw_spinlock_t lock; unsigned pending_acks; struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ @@ -75,6 +75,7 @@ struct kvm_pic { }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); +void kvm_destroy_pic(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); void kvm_pic_clear_isr_ack(struct kvm *kvm); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 7bcc5b6a440..cff851cf532 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -1,6 +1,11 @@ #ifndef ASM_KVM_CACHE_REGS_H #define ASM_KVM_CACHE_REGS_H +#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS +#define KVM_POSSIBLE_CR4_GUEST_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT | X86_CR4_PGE) + static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, enum kvm_reg reg) { @@ -38,4 +43,30 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) return vcpu->arch.pdptrs[index]; } +static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; + if (tmask & vcpu->arch.cr0_guest_owned_bits) + kvm_x86_ops->decache_cr0_guest_bits(vcpu); + return vcpu->arch.cr0 & mask; +} + +static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, ~0UL); +} + +static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; + if (tmask & vcpu->arch.cr4_guest_owned_bits) + kvm_x86_ops->decache_cr4_guest_bits(vcpu); + return vcpu->arch.cr4 & mask; +} + +static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, ~0UL); +} + #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index cd60c0bd1b3..1eb7a4ae0c9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -26,6 +26,7 @@ #include <linux/io.h> #include <linux/module.h> #include <linux/math64.h> +#include <linux/slab.h> #include <asm/processor.h> #include <asm/msr.h> #include <asm/page.h> @@ -373,6 +374,12 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (unlikely(!apic_enabled(apic))) break; + if (trig_mode) { + apic_debug("level trig mode for vector %d", vector); + apic_set_vector(vector, apic->regs + APIC_TMR); + } else + apic_clear_vector(vector, apic->regs + APIC_TMR); + result = !apic_test_and_set_irr(vector, apic); trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector, !result); @@ -383,11 +390,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; } - if (trig_mode) { - apic_debug("level trig mode for vector %d", vector); - apic_set_vector(vector, apic->regs + APIC_TMR); - } else - apic_clear_vector(vector, apic->regs + APIC_TMR); kvm_vcpu_kick(vcpu); break; @@ -1150,6 +1152,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) hrtimer_cancel(&apic->lapic_timer.timer); update_divide_count(apic); start_apic_timer(apic); + apic->irr_pending = true; } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) @@ -1244,3 +1247,34 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) return 0; } + +int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (!irqchip_in_kernel(vcpu->kvm)) + return 1; + + /* if this is ICR write vector before command */ + if (reg == APIC_ICR) + apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); + return apic_reg_write(apic, reg, (u32)data); +} + +int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 low, high = 0; + + if (!irqchip_in_kernel(vcpu->kvm)) + return 1; + + if (apic_reg_read(apic, reg, 4, &low)) + return 1; + if (reg == APIC_ICR) + apic_reg_read(apic, APIC_ICR2, 4, &high); + + *data = (((u64)high) << 32) | low; + + return 0; +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 40010b09c4a..f5fe32c5eda 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -48,4 +48,12 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); + +int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); + +static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; +} #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4c3e5b2314c..19a8906bcaa 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -18,6 +18,7 @@ */ #include "mmu.h" +#include "x86.h" #include "kvm_cache_regs.h" #include <linux/kvm_host.h> @@ -29,6 +30,8 @@ #include <linux/swap.h> #include <linux/hugetlb.h> #include <linux/compiler.h> +#include <linux/srcu.h> +#include <linux/slab.h> #include <asm/page.h> #include <asm/cmpxchg.h> @@ -136,16 +139,6 @@ module_param(oos_shadow, bool, 0644); #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | PT64_NX_MASK) -#define PFERR_PRESENT_MASK (1U << 0) -#define PFERR_WRITE_MASK (1U << 1) -#define PFERR_USER_MASK (1U << 2) -#define PFERR_RSVD_MASK (1U << 3) -#define PFERR_FETCH_MASK (1U << 4) - -#define PT_PDPE_LEVEL 3 -#define PT_DIRECTORY_LEVEL 2 -#define PT_PAGE_TABLE_LEVEL 1 - #define RMAP_EXT 4 #define ACC_EXEC_MASK 1 @@ -153,6 +146,9 @@ module_param(oos_shadow, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) +#include <trace/events/kvm.h> + +#undef TRACE_INCLUDE_FILE #define CREATE_TRACE_POINTS #include "mmutrace.h" @@ -229,7 +225,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); static int is_write_protection(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr0 & X86_CR0_WP; + return kvm_read_cr0_bits(vcpu, X86_CR0_WP); } static int is_cpuid_PSE36(void) @@ -239,7 +235,7 @@ static int is_cpuid_PSE36(void) static int is_nx(struct kvm_vcpu *vcpu) { - return vcpu->arch.shadow_efer & EFER_NX; + return vcpu->arch.efer & EFER_NX; } static int is_shadow_present_pte(u64 pte) @@ -253,7 +249,7 @@ static int is_large_pte(u64 pte) return pte & PT_PAGE_SIZE_MASK; } -static int is_writeble_pte(unsigned long pte) +static int is_writable_pte(unsigned long pte) { return pte & PT_WRITABLE_MASK; } @@ -470,24 +466,10 @@ static int has_wrprotected_page(struct kvm *kvm, static int host_mapping_level(struct kvm *kvm, gfn_t gfn) { - unsigned long page_size = PAGE_SIZE; - struct vm_area_struct *vma; - unsigned long addr; + unsigned long page_size; int i, ret = 0; - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return page_size; - - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, addr); - if (!vma) - goto out; - - page_size = vma_kernel_pagesize(vma); - -out: - up_read(¤t->mm->mmap_sem); + page_size = kvm_host_page_size(kvm, gfn); for (i = PT_PAGE_TABLE_LEVEL; i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { @@ -503,8 +485,7 @@ out: static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) { struct kvm_memory_slot *slot; - int host_level; - int level = PT_PAGE_TABLE_LEVEL; + int host_level, level, max_level; slot = gfn_to_memslot(vcpu->kvm, large_gfn); if (slot && slot->dirty_bitmap) @@ -515,11 +496,12 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) if (host_level == PT_PAGE_TABLE_LEVEL) return host_level; - for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) { + max_level = kvm_x86_ops->get_lpage_level() < host_level ? + kvm_x86_ops->get_lpage_level() : host_level; + for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) break; - } return level - 1; } @@ -635,7 +617,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) pfn = spte_to_pfn(*spte); if (*spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); - if (is_writeble_pte(*spte)) + if (is_writable_pte(*spte)) kvm_set_pfn_dirty(pfn); rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); if (!*rmapp) { @@ -664,6 +646,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) prev_desc = desc; desc = desc->more; } + pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); BUG(); } } @@ -710,7 +693,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!spte); BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); - if (is_writeble_pte(*spte)) { + if (is_writable_pte(*spte)) { __set_spte(spte, *spte & ~PT_WRITABLE_MASK); write_protected = 1; } @@ -734,7 +717,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!(*spte & PT_PRESENT_MASK)); BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); - if (is_writeble_pte(*spte)) { + if (is_writable_pte(*spte)) { rmap_remove(kvm, spte); --kvm->stat.lpages; __set_spte(spte, shadow_trap_nonpresent_pte); @@ -789,7 +772,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; - if (is_writeble_pte(*spte)) + if (is_writable_pte(*spte)) kvm_set_pfn_dirty(spte_to_pfn(*spte)); __set_spte(spte, new_spte); spte = rmap_next(kvm, rmapp, spte); @@ -807,35 +790,32 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, unsigned long data)) { int i, j; + int ret; int retval = 0; + struct kvm_memslots *slots; - /* - * If mmap_sem isn't taken, we can look the memslots with only - * the mmu_lock by skipping over the slots with userspace_addr == 0. - */ - for (i = 0; i < kvm->nmemslots; i++) { - struct kvm_memory_slot *memslot = &kvm->memslots[i]; + slots = rcu_dereference(kvm->memslots); + + for (i = 0; i < slots->nmemslots; i++) { + struct kvm_memory_slot *memslot = &slots->memslots[i]; unsigned long start = memslot->userspace_addr; unsigned long end; - /* mmu_lock protects userspace_addr */ - if (!start) - continue; - end = start + (memslot->npages << PAGE_SHIFT); if (hva >= start && hva < end) { gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; - retval |= handler(kvm, &memslot->rmap[gfn_offset], - data); + ret = handler(kvm, &memslot->rmap[gfn_offset], data); for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { int idx = gfn_offset; idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); - retval |= handler(kvm, + ret |= handler(kvm, &memslot->lpage_info[j][idx].rmap_pde, data); } + trace_kvm_age_page(hva, memslot, ret); + retval |= ret; } } @@ -858,9 +838,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 *spte; int young = 0; - /* always return old for EPT */ + /* + * Emulate the accessed bit for EPT, by checking if this page has + * an EPT mapping, and clearing it if it does. On the next access, + * a new EPT mapping will be established. + * This has some overhead, but not as much as the cost of swapping + * out actively used pages or breaking up actively used hugepages. + */ if (!shadow_accessed_mask) - return 0; + return kvm_unmap_rmapp(kvm, rmapp, data); spte = rmap_next(kvm, rmapp, NULL); while (spte) { @@ -1504,8 +1490,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm, for_each_sp(pages, sp, parents, i) { kvm_mmu_zap_page(kvm, sp); mmu_pages_clear_parents(&parents); + zapped++; } - zapped += pages.nr; kvm_mmu_pages_init(parent, &parents, &pages); } @@ -1556,14 +1542,16 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) */ if (used_pages > kvm_nr_mmu_pages) { - while (used_pages > kvm_nr_mmu_pages) { + while (used_pages > kvm_nr_mmu_pages && + !list_empty(&kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - kvm_mmu_zap_page(kvm, page); + used_pages -= kvm_mmu_zap_page(kvm, page); used_pages--; } + kvm_nr_mmu_pages = used_pages; kvm->arch.n_free_mmu_pages = 0; } else @@ -1610,14 +1598,15 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) && !sp->role.invalid) { pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); - kvm_mmu_zap_page(kvm, sp); + if (kvm_mmu_zap_page(kvm, sp)) + nn = bucket->first; } } } static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) { - int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); + int slot = memslot_id(kvm, gfn); struct kvm_mmu_page *sp = page_header(__pa(pte)); __set_bit(slot, sp->slot_bitmap); @@ -1641,7 +1630,7 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) { struct page *page; - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); if (gpa == UNMAPPED_GVA) return NULL; @@ -1854,7 +1843,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * is responsibility of mmu_get_page / kvm_sync_page. * Same reasoning can be applied to dirty page accounting. */ - if (!can_unsync && is_writeble_pte(*sptep)) + if (!can_unsync && is_writable_pte(*sptep)) goto set_pte; if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { @@ -1862,7 +1851,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, __func__, gfn); ret = 1; pte_access &= ~ACC_WRITE_MASK; - if (is_writeble_pte(spte)) + if (is_writable_pte(spte)) spte &= ~PT_WRITABLE_MASK; } } @@ -1883,7 +1872,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, bool reset_host_protection) { int was_rmapped = 0; - int was_writeble = is_writeble_pte(*sptep); + int was_writable = is_writable_pte(*sptep); int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" @@ -1934,7 +1923,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (rmap_count > RMAP_RECYCLE_THRESHOLD) rmap_recycle(vcpu, sptep, gfn); } else { - if (was_writeble) + if (was_writable) kvm_release_pfn_dirty(pfn); else kvm_release_pfn_clean(pfn); @@ -2164,8 +2153,11 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->kvm->mmu_lock); } -static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, + u32 access, u32 *error) { + if (error) + *error = 0; return vaddr; } @@ -2749,7 +2741,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) if (tdp_enabled) return 0; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); spin_lock(&vcpu->kvm->mmu_lock); r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); @@ -2849,16 +2841,13 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) */ page = alloc_page(GFP_KERNEL | __GFP_DMA32); if (!page) - goto error_1; + return -ENOMEM; + vcpu->arch.mmu.pae_root = page_address(page); for (i = 0; i < 4; ++i) vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; return 0; - -error_1: - free_mmu_pages(vcpu); - return -ENOMEM; } int kvm_mmu_create(struct kvm_vcpu *vcpu) @@ -2938,10 +2927,9 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - int npages; + int npages, idx; - if (!down_read_trylock(&kvm->slots_lock)) - continue; + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); npages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; @@ -2954,7 +2942,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) nr_to_scan--; spin_unlock(&kvm->mmu_lock); - up_read(&kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, idx); } if (kvm_freed) list_move_tail(&kvm_freed->vm_list, &vm_list); @@ -3021,9 +3009,11 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) int i; unsigned int nr_mmu_pages; unsigned int nr_pages = 0; + struct kvm_memslots *slots; - for (i = 0; i < kvm->nmemslots; i++) - nr_pages += kvm->memslots[i].npages; + slots = rcu_dereference(kvm->memslots); + for (i = 0; i < slots->nmemslots; i++) + nr_pages += slots->memslots[i].npages; nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; nr_mmu_pages = max(nr_mmu_pages, @@ -3248,7 +3238,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) audit_mappings_page(vcpu, ent, va, level - 1); else { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); + gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL); gfn_t gfn = gpa >> PAGE_SHIFT; pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; @@ -3293,10 +3283,12 @@ static void audit_mappings(struct kvm_vcpu *vcpu) static int count_rmaps(struct kvm_vcpu *vcpu) { int nmaps = 0; - int i, j, k; + int i, j, k, idx; + idx = srcu_read_lock(&kvm->srcu); + slots = rcu_dereference(kvm->memslots); for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; + struct kvm_memory_slot *m = &slots->memslots[i]; struct kvm_rmap_desc *d; for (j = 0; j < m->npages; ++j) { @@ -3319,6 +3311,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu) } } } + srcu_read_unlock(&kvm->srcu, idx); return nmaps; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 61a1b3884b4..be66759321a 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -2,6 +2,7 @@ #define __KVM_X86_MMU_H #include <linux/kvm_host.h> +#include "kvm_cache_regs.h" #define PT64_PT_BITS 9 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) @@ -37,6 +38,16 @@ #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 +#define PT_PDPE_LEVEL 3 +#define PT_DIRECTORY_LEVEL 2 +#define PT_PAGE_TABLE_LEVEL 1 + +#define PFERR_PRESENT_MASK (1U << 0) +#define PFERR_WRITE_MASK (1U << 1) +#define PFERR_USER_MASK (1U << 2) +#define PFERR_RSVD_MASK (1U << 3) +#define PFERR_FETCH_MASK (1U << 4) + int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) @@ -53,30 +64,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) return kvm_mmu_load(vcpu); } -static inline int is_long_mode(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_X86_64 - return vcpu->arch.shadow_efer & EFER_LMA; -#else - return 0; -#endif -} - -static inline int is_pae(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr4 & X86_CR4_PAE; -} - -static inline int is_pse(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr4 & X86_CR4_PSE; -} - -static inline int is_paging(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr0 & X86_CR0_PG; -} - static inline int is_present_gpte(unsigned long pte) { return pte & PT_PRESENT_MASK; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a6017132fba..81eab9a50e6 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -150,7 +150,9 @@ walk: walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); + if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) + goto not_present; + trace_kvm_mmu_paging_element(pte, walker->level); if (!is_present_gpte(pte)) @@ -160,7 +162,7 @@ walk: if (rsvd_fault) goto access_error; - if (write_fault && !is_writeble_pte(pte)) + if (write_fault && !is_writable_pte(pte)) if (user_fault || is_write_protection(vcpu)) goto access_error; @@ -455,8 +457,6 @@ out_unlock: static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { struct kvm_shadow_walk_iterator iterator; - pt_element_t gpte; - gpa_t pte_gpa = -1; int level; u64 *sptep; int need_flush = 0; @@ -470,10 +470,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) if (level == PT_PAGE_TABLE_LEVEL || ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); - - pte_gpa = (sp->gfn << PAGE_SHIFT); - pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); if (is_shadow_present_pte(*sptep)) { rmap_remove(vcpu->kvm, sptep); @@ -492,32 +488,25 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) if (need_flush) kvm_flush_remote_tlbs(vcpu->kvm); spin_unlock(&vcpu->kvm->mmu_lock); - - if (pte_gpa == -1) - return; - if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, - sizeof(pt_element_t))) - return; - if (is_present_gpte(gpte) && (gpte & PT_ACCESSED_MASK)) { - if (mmu_topup_memory_caches(vcpu)) - return; - kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte, - sizeof(pt_element_t), 0); - } } -static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, + u32 *error) { struct guest_walker walker; gpa_t gpa = UNMAPPED_GVA; int r; - r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); + r = FNAME(walk_addr)(&walker, vcpu, vaddr, + !!(access & PFERR_WRITE_MASK), + !!(access & PFERR_USER_MASK), + !!(access & PFERR_FETCH_MASK)); if (r) { gpa = gfn_to_gpa(walker.gfn); gpa |= vaddr & ~PAGE_MASK; - } + } else if (error) + *error = walker.error_code; return gpa; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1d9b33843c8..2ba58206812 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -26,6 +26,7 @@ #include <linux/highmem.h> #include <linux/sched.h> #include <linux/ftrace_event.h> +#include <linux/slab.h> #include <asm/desc.h> @@ -231,7 +232,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) efer &= ~EFER_LME; to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; - vcpu->arch.shadow_efer = efer; + vcpu->arch.efer = efer; } static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, @@ -540,6 +541,8 @@ static void init_vmcb(struct vcpu_svm *svm) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; + svm->vcpu.fpu_active = 1; + control->intercept_cr_read = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | INTERCEPT_CR4_MASK; @@ -552,13 +555,19 @@ static void init_vmcb(struct vcpu_svm *svm) control->intercept_dr_read = INTERCEPT_DR0_MASK | INTERCEPT_DR1_MASK | INTERCEPT_DR2_MASK | - INTERCEPT_DR3_MASK; + INTERCEPT_DR3_MASK | + INTERCEPT_DR4_MASK | + INTERCEPT_DR5_MASK | + INTERCEPT_DR6_MASK | + INTERCEPT_DR7_MASK; control->intercept_dr_write = INTERCEPT_DR0_MASK | INTERCEPT_DR1_MASK | INTERCEPT_DR2_MASK | INTERCEPT_DR3_MASK | + INTERCEPT_DR4_MASK | INTERCEPT_DR5_MASK | + INTERCEPT_DR6_MASK | INTERCEPT_DR7_MASK; control->intercept_exceptions = (1 << PF_VECTOR) | @@ -569,6 +578,7 @@ static void init_vmcb(struct vcpu_svm *svm) control->intercept = (1ULL << INTERCEPT_INTR) | (1ULL << INTERCEPT_NMI) | (1ULL << INTERCEPT_SMI) | + (1ULL << INTERCEPT_SELECTIVE_CR0) | (1ULL << INTERCEPT_CPUID) | (1ULL << INTERCEPT_INVD) | (1ULL << INTERCEPT_HLT) | @@ -641,10 +651,8 @@ static void init_vmcb(struct vcpu_svm *svm) control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | (1ULL << INTERCEPT_INVLPG)); control->intercept_exceptions &= ~(1 << PF_VECTOR); - control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| - INTERCEPT_CR3_MASK); - control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK| - INTERCEPT_CR3_MASK); + control->intercept_cr_read &= ~INTERCEPT_CR3_MASK; + control->intercept_cr_write &= ~INTERCEPT_CR3_MASK; save->g_pat = 0x0007040600070406ULL; save->cr3 = 0; save->cr4 = 0; @@ -698,29 +706,28 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (err) goto free_svm; + err = -ENOMEM; page = alloc_page(GFP_KERNEL); - if (!page) { - err = -ENOMEM; + if (!page) goto uninit; - } - err = -ENOMEM; msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); if (!msrpm_pages) - goto uninit; + goto free_page1; nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); if (!nested_msrpm_pages) - goto uninit; - - svm->msrpm = page_address(msrpm_pages); - svm_vcpu_init_msrpm(svm->msrpm); + goto free_page2; hsave_page = alloc_page(GFP_KERNEL); if (!hsave_page) - goto uninit; + goto free_page3; + svm->nested.hsave = page_address(hsave_page); + svm->msrpm = page_address(msrpm_pages); + svm_vcpu_init_msrpm(svm->msrpm); + svm->nested.msrpm = page_address(nested_msrpm_pages); svm->vmcb = page_address(page); @@ -730,13 +737,18 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) init_vmcb(svm); fx_init(&svm->vcpu); - svm->vcpu.fpu_active = 1; svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; return &svm->vcpu; +free_page3: + __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); +free_page2: + __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); +free_page1: + __free_page(page); uninit: kvm_vcpu_uninit(&svm->vcpu); free_svm: @@ -765,14 +777,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (unlikely(cpu != vcpu->cpu)) { u64 delta; - /* - * Make sure that the guest sees a monotonically - * increasing TSC. - */ - delta = vcpu->arch.host_tsc - native_read_tsc(); - svm->vmcb->control.tsc_offset += delta; - if (is_nested(svm)) - svm->nested.hsave->control.tsc_offset += delta; + if (check_tsc_unstable()) { + /* + * Make sure that the guest sees a monotonically + * increasing TSC. + */ + delta = vcpu->arch.host_tsc - native_read_tsc(); + svm->vmcb->control.tsc_offset += delta; + if (is_nested(svm)) + svm->nested.hsave->control.tsc_offset += delta; + } vcpu->cpu = cpu; kvm_migrate_timers(vcpu); svm->asid_generation = 0; @@ -954,42 +968,59 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) svm->vmcb->save.gdtr.base = dt->base ; } +static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +{ +} + static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { } +static void update_cr0_intercept(struct vcpu_svm *svm) +{ + ulong gcr0 = svm->vcpu.arch.cr0; + u64 *hcr0 = &svm->vmcb->save.cr0; + + if (!svm->vcpu.fpu_active) + *hcr0 |= SVM_CR0_SELECTIVE_MASK; + else + *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) + | (gcr0 & SVM_CR0_SELECTIVE_MASK); + + + if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { + svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; + svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; + } else { + svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; + svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; + } +} + static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->arch.shadow_efer & EFER_LME) { + if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { - vcpu->arch.shadow_efer |= EFER_LMA; + vcpu->arch.efer |= EFER_LMA; svm->vmcb->save.efer |= EFER_LMA | EFER_LME; } if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { - vcpu->arch.shadow_efer &= ~EFER_LMA; + vcpu->arch.efer &= ~EFER_LMA; svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); } } #endif - if (npt_enabled) - goto set; + vcpu->arch.cr0 = cr0; - if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { - svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - vcpu->fpu_active = 1; - } + if (!npt_enabled) + cr0 |= X86_CR0_PG | X86_CR0_WP; - vcpu->arch.cr0 = cr0; - cr0 |= X86_CR0_PG | X86_CR0_WP; - if (!vcpu->fpu_active) { - svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); + if (!vcpu->fpu_active) cr0 |= X86_CR0_TS; - } -set: /* * re-enable caching here because the QEMU bios * does not do it - this results in some delay at @@ -997,6 +1028,7 @@ set: */ cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; + update_cr0_intercept(svm); } static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1102,76 +1134,70 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->vmcb->control.asid = sd->next_asid++; } -static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) +static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest) { struct vcpu_svm *svm = to_svm(vcpu); - unsigned long val; switch (dr) { case 0 ... 3: - val = vcpu->arch.db[dr]; + *dest = vcpu->arch.db[dr]; break; + case 4: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 6: if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - val = vcpu->arch.dr6; + *dest = vcpu->arch.dr6; else - val = svm->vmcb->save.dr6; + *dest = svm->vmcb->save.dr6; break; + case 5: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 7: if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - val = vcpu->arch.dr7; + *dest = vcpu->arch.dr7; else - val = svm->vmcb->save.dr7; + *dest = svm->vmcb->save.dr7; break; - default: - val = 0; } - return val; + return EMULATE_DONE; } -static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, - int *exception) +static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value) { struct vcpu_svm *svm = to_svm(vcpu); - *exception = 0; - switch (dr) { case 0 ... 3: vcpu->arch.db[dr] = value; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) vcpu->arch.eff_db[dr] = value; - return; - case 4 ... 5: - if (vcpu->arch.cr4 & X86_CR4_DE) - *exception = UD_VECTOR; - return; + break; + case 4: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 6: - if (value & 0xffffffff00000000ULL) { - *exception = GP_VECTOR; - return; - } vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1; - return; + break; + case 5: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 7: - if (value & 0xffffffff00000000ULL) { - *exception = GP_VECTOR; - return; - } vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { svm->vmcb->save.dr7 = vcpu->arch.dr7; vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK); } - return; - default: - /* FIXME: Possible case? */ - printk(KERN_DEBUG "%s: unexpected dr %u\n", - __func__, dr); - *exception = UD_VECTOR; - return; + break; } + + return EMULATE_DONE; } static int pf_interception(struct vcpu_svm *svm) @@ -1239,13 +1265,17 @@ static int ud_interception(struct vcpu_svm *svm) return 1; } -static int nm_interception(struct vcpu_svm *svm) +static void svm_fpu_activate(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) - svm->vmcb->save.cr0 &= ~X86_CR0_TS; svm->vcpu.fpu_active = 1; + update_cr0_intercept(svm); +} +static int nm_interception(struct vcpu_svm *svm) +{ + svm_fpu_activate(&svm->vcpu); return 1; } @@ -1337,7 +1367,7 @@ static int vmmcall_interception(struct vcpu_svm *svm) static int nested_svm_check_permissions(struct vcpu_svm *svm) { - if (!(svm->vcpu.arch.shadow_efer & EFER_SVME) + if (!(svm->vcpu.arch.efer & EFER_SVME) || !is_paging(&svm->vcpu)) { kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; @@ -1740,8 +1770,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) hsave->save.ds = vmcb->save.ds; hsave->save.gdtr = vmcb->save.gdtr; hsave->save.idtr = vmcb->save.idtr; - hsave->save.efer = svm->vcpu.arch.shadow_efer; - hsave->save.cr0 = svm->vcpu.arch.cr0; + hsave->save.efer = svm->vcpu.arch.efer; + hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); hsave->save.cr4 = svm->vcpu.arch.cr4; hsave->save.rflags = vmcb->save.rflags; hsave->save.rip = svm->next_rip; @@ -2153,9 +2183,10 @@ static int rdmsr_interception(struct vcpu_svm *svm) u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; u64 data; - if (svm_get_msr(&svm->vcpu, ecx, &data)) + if (svm_get_msr(&svm->vcpu, ecx, &data)) { + trace_kvm_msr_read_ex(ecx); kvm_inject_gp(&svm->vcpu, 0); - else { + } else { trace_kvm_msr_read(ecx, data); svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; @@ -2247,13 +2278,15 @@ static int wrmsr_interception(struct vcpu_svm *svm) u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); - trace_kvm_msr_write(ecx, data); svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - if (svm_set_msr(&svm->vcpu, ecx, data)) + if (svm_set_msr(&svm->vcpu, ecx, data)) { + trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); - else + } else { + trace_kvm_msr_write(ecx, data); skip_emulated_instruction(&svm->vcpu); + } return 1; } @@ -2297,7 +2330,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR3] = emulate_on_interception, [SVM_EXIT_READ_CR4] = emulate_on_interception, [SVM_EXIT_READ_CR8] = emulate_on_interception, - /* for now: */ + [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, [SVM_EXIT_WRITE_CR0] = emulate_on_interception, [SVM_EXIT_WRITE_CR3] = emulate_on_interception, [SVM_EXIT_WRITE_CR4] = emulate_on_interception, @@ -2306,11 +2339,17 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_DR1] = emulate_on_interception, [SVM_EXIT_READ_DR2] = emulate_on_interception, [SVM_EXIT_READ_DR3] = emulate_on_interception, + [SVM_EXIT_READ_DR4] = emulate_on_interception, + [SVM_EXIT_READ_DR5] = emulate_on_interception, + [SVM_EXIT_READ_DR6] = emulate_on_interception, + [SVM_EXIT_READ_DR7] = emulate_on_interception, [SVM_EXIT_WRITE_DR0] = emulate_on_interception, [SVM_EXIT_WRITE_DR1] = emulate_on_interception, [SVM_EXIT_WRITE_DR2] = emulate_on_interception, [SVM_EXIT_WRITE_DR3] = emulate_on_interception, + [SVM_EXIT_WRITE_DR4] = emulate_on_interception, [SVM_EXIT_WRITE_DR5] = emulate_on_interception, + [SVM_EXIT_WRITE_DR6] = emulate_on_interception, [SVM_EXIT_WRITE_DR7] = emulate_on_interception, [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, @@ -2383,20 +2422,10 @@ static int handle_exit(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); - if (npt_enabled) { - int mmu_reload = 0; - if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { - svm_set_cr0(vcpu, svm->vmcb->save.cr0); - mmu_reload = 1; - } + if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK)) vcpu->arch.cr0 = svm->vmcb->save.cr0; + if (npt_enabled) vcpu->arch.cr3 = svm->vmcb->save.cr3; - if (mmu_reload) { - kvm_mmu_reset_context(vcpu); - kvm_mmu_load(vcpu); - } - } - if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; @@ -2798,12 +2827,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) svm->vmcb->save.cr3 = root; force_new_asid(vcpu); - - if (vcpu->fpu_active) { - svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); - svm->vmcb->save.cr0 |= X86_CR0_TS; - vcpu->fpu_active = 0; - } } static int is_disabled(void) @@ -2852,6 +2875,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return 0; } +static void svm_cpuid_update(struct kvm_vcpu *vcpu) +{ +} + static const struct trace_print_flags svm_exit_reasons_str[] = { { SVM_EXIT_READ_CR0, "read_cr0" }, { SVM_EXIT_READ_CR3, "read_cr3" }, @@ -2905,9 +2932,22 @@ static const struct trace_print_flags svm_exit_reasons_str[] = { { -1, NULL } }; -static bool svm_gb_page_enable(void) +static int svm_get_lpage_level(void) { - return true; + return PT_PDPE_LEVEL; +} + +static bool svm_rdtscp_supported(void) +{ + return false; +} + +static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + update_cr0_intercept(svm); + svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; } static struct kvm_x86_ops svm_x86_ops = { @@ -2936,6 +2976,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_segment = svm_set_segment, .get_cpl = svm_get_cpl, .get_cs_db_l_bits = kvm_get_cs_db_l_bits, + .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, .set_cr0 = svm_set_cr0, .set_cr3 = svm_set_cr3, @@ -2950,6 +2991,8 @@ static struct kvm_x86_ops svm_x86_ops = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + .fpu_activate = svm_fpu_activate, + .fpu_deactivate = svm_fpu_deactivate, .tlb_flush = svm_flush_tlb, @@ -2975,7 +3018,11 @@ static struct kvm_x86_ops svm_x86_ops = { .get_mt_mask = svm_get_mt_mask, .exit_reasons_str = svm_exit_reasons_str, - .gb_page_enable = svm_gb_page_enable, + .get_lpage_level = svm_get_lpage_level, + + .cpuid_update = svm_cpuid_update, + + .rdtscp_supported = svm_rdtscp_supported, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 816e0449db0..6ad30a29f04 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -56,6 +56,38 @@ TRACE_EVENT(kvm_hypercall, ); /* + * Tracepoint for hypercall. + */ +TRACE_EVENT(kvm_hv_hypercall, + TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx, + __u64 ingpa, __u64 outgpa), + TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), + + TP_STRUCT__entry( + __field( __u16, code ) + __field( bool, fast ) + __field( __u16, rep_cnt ) + __field( __u16, rep_idx ) + __field( __u64, ingpa ) + __field( __u64, outgpa ) + ), + + TP_fast_assign( + __entry->code = code; + __entry->fast = fast; + __entry->rep_cnt = rep_cnt; + __entry->rep_idx = rep_idx; + __entry->ingpa = ingpa; + __entry->outgpa = outgpa; + ), + + TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", + __entry->code, __entry->fast ? "fast" : "slow", + __entry->rep_cnt, __entry->rep_idx, __entry->ingpa, + __entry->outgpa) +); + +/* * Tracepoint for PIO. */ TRACE_EVENT(kvm_pio, @@ -214,28 +246,33 @@ TRACE_EVENT(kvm_page_fault, * Tracepoint for guest MSR access. */ TRACE_EVENT(kvm_msr, - TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data), - TP_ARGS(rw, ecx, data), + TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception), + TP_ARGS(write, ecx, data, exception), TP_STRUCT__entry( - __field( unsigned int, rw ) - __field( unsigned int, ecx ) - __field( unsigned long, data ) + __field( unsigned, write ) + __field( u32, ecx ) + __field( u64, data ) + __field( u8, exception ) ), TP_fast_assign( - __entry->rw = rw; + __entry->write = write; __entry->ecx = ecx; __entry->data = data; + __entry->exception = exception; ), - TP_printk("msr_%s %x = 0x%lx", - __entry->rw ? "write" : "read", - __entry->ecx, __entry->data) + TP_printk("msr_%s %x = 0x%llx%s", + __entry->write ? "write" : "read", + __entry->ecx, __entry->data, + __entry->exception ? " (#GP)" : "") ); -#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data) -#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data) +#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false) +#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false) +#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true) +#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true) /* * Tracepoint for guest CR access. diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d4918d6fc92..bc933cfb4e6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -26,6 +26,7 @@ #include <linux/sched.h> #include <linux/moduleparam.h> #include <linux/ftrace_event.h> +#include <linux/slab.h> #include "kvm_cache_regs.h" #include "x86.h" @@ -61,6 +62,23 @@ module_param_named(unrestricted_guest, static int __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); +#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) +#define KVM_GUEST_CR0_MASK \ + (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE) +#define KVM_VM_CR0_ALWAYS_ON \ + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_CR4_GUEST_OWNED_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT) + +#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) +#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) + +#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -115,7 +133,7 @@ struct vcpu_vmx { } host_state; struct { int vm86_active; - u8 save_iopl; + ulong save_rflags; struct kvm_save_segment { u16 selector; unsigned long base; @@ -136,6 +154,8 @@ struct vcpu_vmx { ktime_t entry_time; s64 vnmi_blocked_time; u32 exit_reason; + + bool rdtscp_enabled; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -210,7 +230,7 @@ static const u32 vmx_msr_index[] = { #ifdef CONFIG_X86_64 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif - MSR_EFER, MSR_K6_STAR, + MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR, }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) @@ -301,6 +321,11 @@ static inline bool cpu_has_vmx_ept_2m_page(void) return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); } +static inline bool cpu_has_vmx_ept_1g_page(void) +{ + return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT); +} + static inline int cpu_has_vmx_invept_individual_addr(void) { return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); @@ -336,9 +361,7 @@ static inline int cpu_has_vmx_ple(void) static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) { - return flexpriority_enabled && - (cpu_has_vmx_virtualize_apic_accesses()) && - (irqchip_in_kernel(kvm)); + return flexpriority_enabled && irqchip_in_kernel(kvm); } static inline int cpu_has_vmx_vpid(void) @@ -347,6 +370,12 @@ static inline int cpu_has_vmx_vpid(void) SECONDARY_EXEC_ENABLE_VPID; } +static inline int cpu_has_vmx_rdtscp(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_RDTSCP; +} + static inline int cpu_has_virtual_nmis(void) { return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; @@ -551,22 +580,18 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; - eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); - if (!vcpu->fpu_active) - eb |= 1u << NM_VECTOR; - /* - * Unconditionally intercept #DB so we can maintain dr6 without - * reading it every exit. - */ - eb |= 1u << DB_VECTOR; - if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { - if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) - eb |= 1u << BP_VECTOR; - } + eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | + (1u << NM_VECTOR) | (1u << DB_VECTOR); + if ((vcpu->guest_debug & + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) + eb |= 1u << BP_VECTOR; if (to_vmx(vcpu)->rmode.vm86_active) eb = ~0; if (enable_ept) eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ + if (vcpu->fpu_active) + eb &= ~(1u << NM_VECTOR); vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -589,7 +614,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) u64 guest_efer; u64 ignore_bits; - guest_efer = vmx->vcpu.arch.shadow_efer; + guest_efer = vmx->vcpu.arch.efer; /* * NX is emulated; LMA and LME handled by hardware; SCE meaninless @@ -767,38 +792,51 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) static void vmx_fpu_activate(struct kvm_vcpu *vcpu) { + ulong cr0; + if (vcpu->fpu_active) return; vcpu->fpu_active = 1; - vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); - if (vcpu->arch.cr0 & X86_CR0_TS) - vmcs_set_bits(GUEST_CR0, X86_CR0_TS); + cr0 = vmcs_readl(GUEST_CR0); + cr0 &= ~(X86_CR0_TS | X86_CR0_MP); + cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP); + vmcs_writel(GUEST_CR0, cr0); update_exception_bitmap(vcpu); + vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); } +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); + static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) { - if (!vcpu->fpu_active) - return; - vcpu->fpu_active = 0; - vmcs_set_bits(GUEST_CR0, X86_CR0_TS); + vmx_decache_cr0_guest_bits(vcpu); + vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); update_exception_bitmap(vcpu); + vcpu->arch.cr0_guest_owned_bits = 0; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); + vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); } static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { - unsigned long rflags; + unsigned long rflags, save_rflags; rflags = vmcs_readl(GUEST_RFLAGS); - if (to_vmx(vcpu)->rmode.vm86_active) - rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM); + if (to_vmx(vcpu)->rmode.vm86_active) { + rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + save_rflags = to_vmx(vcpu)->rmode.save_rflags; + rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; + } return rflags; } static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - if (to_vmx(vcpu)->rmode.vm86_active) + if (to_vmx(vcpu)->rmode.vm86_active) { + to_vmx(vcpu)->rmode.save_rflags = rflags; rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; + } vmcs_writel(GUEST_RFLAGS, rflags); } @@ -878,6 +916,11 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); } +static bool vmx_rdtscp_supported(void) +{ + return cpu_has_vmx_rdtscp(); +} + /* * Swap MSR entry in host/guest MSR entry array. */ @@ -913,12 +956,15 @@ static void setup_msrs(struct vcpu_vmx *vmx) index = __find_msr_index(vmx, MSR_CSTAR); if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_TSC_AUX); + if (index >= 0 && vmx->rdtscp_enabled) + move_msr_up(vmx, index, save_nmsrs++); /* * MSR_K6_STAR is only needed on long mode guests, and only * if efer.sce is enabled. */ index = __find_msr_index(vmx, MSR_K6_STAR); - if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE)) + if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) move_msr_up(vmx, index, save_nmsrs++); } #endif @@ -1002,6 +1048,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) case MSR_IA32_SYSENTER_ESP: data = vmcs_readl(GUEST_SYSENTER_ESP); break; + case MSR_TSC_AUX: + if (!to_vmx(vcpu)->rdtscp_enabled) + return 1; + /* Otherwise falls through */ default: vmx_load_host_state(to_vmx(vcpu)); msr = find_msr_entry(to_vmx(vcpu), msr_index); @@ -1065,7 +1115,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vcpu->arch.pat = data; break; } - /* Otherwise falls through to kvm_set_msr_common */ + ret = kvm_set_msr_common(vcpu, msr_index, data); + break; + case MSR_TSC_AUX: + if (!vmx->rdtscp_enabled) + return 1; + /* Check reserved bit, higher 32 bits should be zero */ + if ((data >> 32) != 0) + return 1; + /* Otherwise falls through */ default: msr = find_msr_entry(vmx, msr_index); if (msr) { @@ -1224,6 +1282,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_MWAIT_EXITING | + CPU_BASED_MONITOR_EXITING | CPU_BASED_INVLPG_EXITING; opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | @@ -1243,7 +1303,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST | - SECONDARY_EXEC_PAUSE_LOOP_EXITING; + SECONDARY_EXEC_PAUSE_LOOP_EXITING | + SECONDARY_EXEC_RDTSCP; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -1429,8 +1490,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); flags = vmcs_readl(GUEST_RFLAGS); - flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); - flags |= (vmx->rmode.save_iopl << IOPL_SHIFT); + flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; vmcs_writel(GUEST_RFLAGS, flags); vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | @@ -1457,8 +1518,12 @@ static void enter_pmode(struct kvm_vcpu *vcpu) static gva_t rmode_tss_base(struct kvm *kvm) { if (!kvm->arch.tss_addr) { - gfn_t base_gfn = kvm->memslots[0].base_gfn + - kvm->memslots[0].npages - 3; + struct kvm_memslots *slots; + gfn_t base_gfn; + + slots = rcu_dereference(kvm->memslots); + base_gfn = kvm->memslots->memslots[0].base_gfn + + kvm->memslots->memslots[0].npages - 3; return base_gfn << PAGE_SHIFT; } return kvm->arch.tss_addr; @@ -1499,8 +1564,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); flags = vmcs_readl(GUEST_RFLAGS); - vmx->rmode.save_iopl - = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + vmx->rmode.save_rflags = flags; flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; @@ -1544,9 +1608,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) * of this msr depends on is_long_mode(). */ vmx_load_host_state(to_vmx(vcpu)); - vcpu->arch.shadow_efer = efer; - if (!msr) - return; + vcpu->arch.efer = efer; if (efer & EFER_LMA) { vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) | @@ -1576,13 +1638,13 @@ static void enter_lmode(struct kvm_vcpu *vcpu) (guest_tr_ar & ~AR_TYPE_MASK) | AR_TYPE_BUSY_64_TSS); } - vcpu->arch.shadow_efer |= EFER_LMA; - vmx_set_efer(vcpu, vcpu->arch.shadow_efer); + vcpu->arch.efer |= EFER_LMA; + vmx_set_efer(vcpu, vcpu->arch.efer); } static void exit_lmode(struct kvm_vcpu *vcpu) { - vcpu->arch.shadow_efer &= ~EFER_LMA; + vcpu->arch.efer &= ~EFER_LMA; vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) @@ -1598,10 +1660,20 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); } +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +{ + ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; + + vcpu->arch.cr0 &= ~cr0_guest_owned_bits; + vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; +} + static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { - vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; - vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; + ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; + + vcpu->arch.cr4 &= ~cr4_guest_owned_bits; + vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; } static void ept_load_pdptrs(struct kvm_vcpu *vcpu) @@ -1646,7 +1718,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, vcpu->arch.cr4); + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } else if (!is_paging(vcpu)) { /* From nonpaging to paging */ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, @@ -1654,23 +1726,13 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, vcpu->arch.cr4); + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } if (!(cr0 & X86_CR0_WP)) *hw_cr0 &= ~X86_CR0_WP; } -static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, - struct kvm_vcpu *vcpu) -{ - if (!is_paging(vcpu)) { - *hw_cr4 &= ~X86_CR4_PAE; - *hw_cr4 |= X86_CR4_PSE; - } else if (!(vcpu->arch.cr4 & X86_CR4_PAE)) - *hw_cr4 &= ~X86_CR4_PAE; -} - static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -1682,8 +1744,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) else hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; - vmx_fpu_deactivate(vcpu); - if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); @@ -1691,7 +1751,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) enter_rmode(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->arch.shadow_efer & EFER_LME) { + if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) enter_lmode(vcpu); if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) @@ -1702,12 +1762,12 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (enable_ept) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); + if (!vcpu->fpu_active) + hw_cr0 |= X86_CR0_TS | X86_CR0_MP; + vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; - - if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) - vmx_fpu_activate(vcpu); } static u64 construct_eptp(unsigned long root_hpa) @@ -1738,8 +1798,6 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vmx_flush_tlb(vcpu); vmcs_writel(GUEST_CR3, guest_cr3); - if (vcpu->arch.cr0 & X86_CR0_PE) - vmx_fpu_deactivate(vcpu); } static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1748,8 +1806,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); vcpu->arch.cr4 = cr4; - if (enable_ept) - ept_update_paging_mode_cr4(&hw_cr4, vcpu); + if (enable_ept) { + if (!is_paging(vcpu)) { + hw_cr4 &= ~X86_CR4_PAE; + hw_cr4 |= X86_CR4_PSE; + } else if (!(cr4 & X86_CR4_PAE)) { + hw_cr4 &= ~X86_CR4_PAE; + } + } vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); @@ -1787,7 +1851,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, static int vmx_get_cpl(struct kvm_vcpu *vcpu) { - if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ + if (!is_protmode(vcpu)) return 0; if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ @@ -2042,7 +2106,7 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) static bool guest_state_valid(struct kvm_vcpu *vcpu) { /* real mode guest state checks */ - if (!(vcpu->arch.cr0 & X86_CR0_PE)) { + if (!is_protmode(vcpu)) { if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) return false; if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) @@ -2175,7 +2239,7 @@ static int alloc_apic_access_page(struct kvm *kvm) struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); if (kvm->arch.apic_access_page) goto out; kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; @@ -2188,7 +2252,7 @@ static int alloc_apic_access_page(struct kvm *kvm) kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -2197,7 +2261,7 @@ static int alloc_identity_pagetable(struct kvm *kvm) struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); if (kvm->arch.ept_identity_pagetable) goto out; kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; @@ -2212,7 +2276,7 @@ static int alloc_identity_pagetable(struct kvm *kvm) kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -2384,14 +2448,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) for (i = 0; i < NR_VMX_MSR; ++i) { u32 index = vmx_msr_index[i]; u32 data_low, data_high; - u64 data; int j = vmx->nmsrs; if (rdmsr_safe(index, &data_low, &data_high) < 0) continue; if (wrmsr_safe(index, data_low, data_high) < 0) continue; - data = data_low | ((u64)data_high << 32); vmx->guest_msrs[j].index = i; vmx->guest_msrs[j].data = 0; vmx->guest_msrs[j].mask = -1ull; @@ -2404,7 +2466,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); - vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; + if (enable_ept) + vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; + vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; rdtscll(tsc_this); @@ -2429,10 +2494,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 msr; - int ret; + int ret, idx; vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); if (!init_rmode(vmx->vcpu.kvm)) { ret = -ENOMEM; goto out; @@ -2526,7 +2591,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ + vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ vmx_set_cr4(&vmx->vcpu, 0); vmx_set_efer(&vmx->vcpu, 0); vmx_fpu_activate(&vmx->vcpu); @@ -2540,7 +2605,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->emulation_required = 0; out: - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); return ret; } @@ -2717,6 +2782,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, kvm_queue_exception(vcpu, vec); return 1; case BP_VECTOR: + /* + * Update instruction length as we may reinject the exception + * from user space while in guest debugging mode. + */ + to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) return 0; /* fall through */ @@ -2839,6 +2910,13 @@ static int handle_exception(struct kvm_vcpu *vcpu) kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); /* fall through */ case BP_VECTOR: + /* + * Update instruction length as we may reinject #BP from + * user space while in guest debugging mode. Reading it for + * #DB as well causes no harm, it is not used in that case. + */ + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; kvm_run->debug.arch.exception = ex_no; @@ -2940,11 +3018,10 @@ static int handle_cr(struct kvm_vcpu *vcpu) }; break; case 2: /* clts */ - vmx_fpu_deactivate(vcpu); - vcpu->arch.cr0 &= ~X86_CR0_TS; - vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); - vmx_fpu_activate(vcpu); + vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); + trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); skip_emulated_instruction(vcpu); + vmx_fpu_activate(vcpu); return 1; case 1: /*mov from cr*/ switch (cr) { @@ -2962,7 +3039,9 @@ static int handle_cr(struct kvm_vcpu *vcpu) } break; case 3: /* lmsw */ - kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); + val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; + trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); + kvm_lmsw(vcpu, val); skip_emulated_instruction(vcpu); return 1; @@ -2975,12 +3054,22 @@ static int handle_cr(struct kvm_vcpu *vcpu) return 0; } +static int check_dr_alias(struct kvm_vcpu *vcpu) +{ + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return -1; + } + return 0; +} + static int handle_dr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; unsigned long val; int dr, reg; + /* Do not handle if the CPL > 0, will trigger GP on re-entry */ if (!kvm_require_cpl(vcpu, 0)) return 1; dr = vmcs_readl(GUEST_DR7); @@ -3016,14 +3105,20 @@ static int handle_dr(struct kvm_vcpu *vcpu) case 0 ... 3: val = vcpu->arch.db[dr]; break; + case 4: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ case 6: val = vcpu->arch.dr6; break; - case 7: + case 5: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ + default: /* 7 */ val = vcpu->arch.dr7; break; - default: - val = 0; } kvm_register_write(vcpu, reg, val); } else { @@ -3034,21 +3129,25 @@ static int handle_dr(struct kvm_vcpu *vcpu) if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) vcpu->arch.eff_db[dr] = val; break; - case 4 ... 5: - if (vcpu->arch.cr4 & X86_CR4_DE) - kvm_queue_exception(vcpu, UD_VECTOR); - break; + case 4: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ case 6: if (val & 0xffffffff00000000ULL) { - kvm_queue_exception(vcpu, GP_VECTOR); - break; + kvm_inject_gp(vcpu, 0); + return 1; } vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; break; - case 7: + case 5: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ + default: /* 7 */ if (val & 0xffffffff00000000ULL) { - kvm_queue_exception(vcpu, GP_VECTOR); - break; + kvm_inject_gp(vcpu, 0); + return 1; } vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { @@ -3075,6 +3174,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) u64 data; if (vmx_get_msr(vcpu, ecx, &data)) { + trace_kvm_msr_read_ex(ecx); kvm_inject_gp(vcpu, 0); return 1; } @@ -3094,13 +3194,13 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); - trace_kvm_msr_write(ecx, data); - if (vmx_set_msr(vcpu, ecx, data) != 0) { + trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; } + trace_kvm_msr_write(ecx, data); skip_emulated_instruction(vcpu); return 1; } @@ -3385,7 +3485,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) } if (err != EMULATE_DONE) { - kvm_report_emulation_failure(vcpu, "emulation failure"); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; @@ -3416,6 +3515,12 @@ static int handle_pause(struct kvm_vcpu *vcpu) return 1; } +static int handle_invalid_op(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -3453,6 +3558,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, + [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, + [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, }; static const int kvm_vmx_max_exit_handlers = @@ -3686,9 +3793,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) */ vmcs_writel(HOST_CR0, read_cr0()); - if (vcpu->arch.switch_db_regs) - set_debugreg(vcpu->arch.dr6, 6); - asm( /* Store host registers */ "push %%"R"dx; push %%"R"bp;" @@ -3789,9 +3893,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) | (1 << VCPU_EXREG_PDPTR)); vcpu->arch.regs_dirty = 0; - if (vcpu->arch.switch_db_regs) - get_debugreg(vcpu->arch.dr6, 6); - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); if (vmx->rmode.irq.pending) fixup_rmode_irq(vmx); @@ -3920,7 +4021,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) * b. VT-d with snooping control feature: snooping control feature of * VT-d engine can guarantee the cache correctness. Just set it * to WB to keep consistent with host. So the same as item 3. - * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep + * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep * consistent with host MTRR */ if (is_mmio) @@ -3931,37 +4032,88 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) VMX_EPT_MT_EPTE_SHIFT; else ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) - | VMX_EPT_IGMT_BIT; + | VMX_EPT_IPAT_BIT; return ret; } +#define _ER(x) { EXIT_REASON_##x, #x } + static const struct trace_print_flags vmx_exit_reasons_str[] = { - { EXIT_REASON_EXCEPTION_NMI, "exception" }, - { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" }, - { EXIT_REASON_TRIPLE_FAULT, "triple_fault" }, - { EXIT_REASON_NMI_WINDOW, "nmi_window" }, - { EXIT_REASON_IO_INSTRUCTION, "io_instruction" }, - { EXIT_REASON_CR_ACCESS, "cr_access" }, - { EXIT_REASON_DR_ACCESS, "dr_access" }, - { EXIT_REASON_CPUID, "cpuid" }, - { EXIT_REASON_MSR_READ, "rdmsr" }, - { EXIT_REASON_MSR_WRITE, "wrmsr" }, - { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" }, - { EXIT_REASON_HLT, "halt" }, - { EXIT_REASON_INVLPG, "invlpg" }, - { EXIT_REASON_VMCALL, "hypercall" }, - { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" }, - { EXIT_REASON_APIC_ACCESS, "apic_access" }, - { EXIT_REASON_WBINVD, "wbinvd" }, - { EXIT_REASON_TASK_SWITCH, "task_switch" }, - { EXIT_REASON_EPT_VIOLATION, "ept_violation" }, + _ER(EXCEPTION_NMI), + _ER(EXTERNAL_INTERRUPT), + _ER(TRIPLE_FAULT), + _ER(PENDING_INTERRUPT), + _ER(NMI_WINDOW), + _ER(TASK_SWITCH), + _ER(CPUID), + _ER(HLT), + _ER(INVLPG), + _ER(RDPMC), + _ER(RDTSC), + _ER(VMCALL), + _ER(VMCLEAR), + _ER(VMLAUNCH), + _ER(VMPTRLD), + _ER(VMPTRST), + _ER(VMREAD), + _ER(VMRESUME), + _ER(VMWRITE), + _ER(VMOFF), + _ER(VMON), + _ER(CR_ACCESS), + _ER(DR_ACCESS), + _ER(IO_INSTRUCTION), + _ER(MSR_READ), + _ER(MSR_WRITE), + _ER(MWAIT_INSTRUCTION), + _ER(MONITOR_INSTRUCTION), + _ER(PAUSE_INSTRUCTION), + _ER(MCE_DURING_VMENTRY), + _ER(TPR_BELOW_THRESHOLD), + _ER(APIC_ACCESS), + _ER(EPT_VIOLATION), + _ER(EPT_MISCONFIG), + _ER(WBINVD), { -1, NULL } }; -static bool vmx_gb_page_enable(void) +#undef _ER + +static int vmx_get_lpage_level(void) { - return false; + if (enable_ept && !cpu_has_vmx_ept_1g_page()) + return PT_DIRECTORY_LEVEL; + else + /* For shadow and EPT supported 1GB page */ + return PT_PDPE_LEVEL; +} + +static inline u32 bit(int bitno) +{ + return 1 << (bitno & 31); +} + +static void vmx_cpuid_update(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exec_control; + + vmx->rdtscp_enabled = false; + if (vmx_rdtscp_supported()) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + if (exec_control & SECONDARY_EXEC_RDTSCP) { + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) + vmx->rdtscp_enabled = true; + else { + exec_control &= ~SECONDARY_EXEC_RDTSCP; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); + } + } + } } static struct kvm_x86_ops vmx_x86_ops = { @@ -3990,6 +4142,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_segment = vmx_set_segment, .get_cpl = vmx_get_cpl, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, + .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, .set_cr0 = vmx_set_cr0, .set_cr3 = vmx_set_cr3, @@ -4002,6 +4155,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + .fpu_activate = vmx_fpu_activate, + .fpu_deactivate = vmx_fpu_deactivate, .tlb_flush = vmx_flush_tlb, @@ -4027,7 +4182,11 @@ static struct kvm_x86_ops vmx_x86_ops = { .get_mt_mask = vmx_get_mt_mask, .exit_reasons_str = vmx_exit_reasons_str, - .gb_page_enable = vmx_gb_page_enable, + .get_lpage_level = vmx_get_lpage_level, + + .cpuid_update = vmx_cpuid_update, + + .rdtscp_supported = vmx_rdtscp_supported, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9d068966fb2..3c4ca98ad27 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -38,6 +38,8 @@ #include <linux/intel-iommu.h> #include <linux/cpufreq.h> #include <linux/user-return-notifier.h> +#include <linux/srcu.h> +#include <linux/slab.h> #include <trace/events/kvm.h> #undef TRACE_INCLUDE_FILE #define CREATE_TRACE_POINTS @@ -93,16 +95,16 @@ module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); struct kvm_shared_msrs_global { int nr; - struct kvm_shared_msr { - u32 msr; - u64 value; - } msrs[KVM_NR_SHARED_MSRS]; + u32 msrs[KVM_NR_SHARED_MSRS]; }; struct kvm_shared_msrs { struct user_return_notifier urn; bool registered; - u64 current_value[KVM_NR_SHARED_MSRS]; + struct kvm_shared_msr_values { + u64 host; + u64 curr; + } values[KVM_NR_SHARED_MSRS]; }; static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; @@ -147,53 +149,64 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; - struct kvm_shared_msr *global; struct kvm_shared_msrs *locals = container_of(urn, struct kvm_shared_msrs, urn); + struct kvm_shared_msr_values *values; for (slot = 0; slot < shared_msrs_global.nr; ++slot) { - global = &shared_msrs_global.msrs[slot]; - if (global->value != locals->current_value[slot]) { - wrmsrl(global->msr, global->value); - locals->current_value[slot] = global->value; + values = &locals->values[slot]; + if (values->host != values->curr) { + wrmsrl(shared_msrs_global.msrs[slot], values->host); + values->curr = values->host; } } locals->registered = false; user_return_notifier_unregister(urn); } -void kvm_define_shared_msr(unsigned slot, u32 msr) +static void shared_msr_update(unsigned slot, u32 msr) { - int cpu; + struct kvm_shared_msrs *smsr; u64 value; + smsr = &__get_cpu_var(shared_msrs); + /* only read, and nobody should modify it at this time, + * so don't need lock */ + if (slot >= shared_msrs_global.nr) { + printk(KERN_ERR "kvm: invalid MSR slot!"); + return; + } + rdmsrl_safe(msr, &value); + smsr->values[slot].host = value; + smsr->values[slot].curr = value; +} + +void kvm_define_shared_msr(unsigned slot, u32 msr) +{ if (slot >= shared_msrs_global.nr) shared_msrs_global.nr = slot + 1; - shared_msrs_global.msrs[slot].msr = msr; - rdmsrl_safe(msr, &value); - shared_msrs_global.msrs[slot].value = value; - for_each_online_cpu(cpu) - per_cpu(shared_msrs, cpu).current_value[slot] = value; + shared_msrs_global.msrs[slot] = msr; + /* we need ensured the shared_msr_global have been updated */ + smp_wmb(); } EXPORT_SYMBOL_GPL(kvm_define_shared_msr); static void kvm_shared_msr_cpu_online(void) { unsigned i; - struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs); for (i = 0; i < shared_msrs_global.nr; ++i) - locals->current_value[i] = shared_msrs_global.msrs[i].value; + shared_msr_update(i, shared_msrs_global.msrs[i]); } void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) { struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); - if (((value ^ smsr->current_value[slot]) & mask) == 0) + if (((value ^ smsr->values[slot].curr) & mask) == 0) return; - smsr->current_value[slot] = value; - wrmsrl(shared_msrs_global.msrs[slot].msr, value); + smsr->values[slot].curr = value; + wrmsrl(shared_msrs_global.msrs[slot], value); if (!smsr->registered) { smsr->urn.on_user_return = kvm_on_user_return; user_return_notifier_register(&smsr->urn); @@ -257,12 +270,68 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) } EXPORT_SYMBOL_GPL(kvm_set_apic_base); +#define EXCPT_BENIGN 0 +#define EXCPT_CONTRIBUTORY 1 +#define EXCPT_PF 2 + +static int exception_class(int vector) +{ + switch (vector) { + case PF_VECTOR: + return EXCPT_PF; + case DE_VECTOR: + case TS_VECTOR: + case NP_VECTOR: + case SS_VECTOR: + case GP_VECTOR: + return EXCPT_CONTRIBUTORY; + default: + break; + } + return EXCPT_BENIGN; +} + +static void kvm_multiple_exception(struct kvm_vcpu *vcpu, + unsigned nr, bool has_error, u32 error_code) +{ + u32 prev_nr; + int class1, class2; + + if (!vcpu->arch.exception.pending) { + queue: + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = has_error; + vcpu->arch.exception.nr = nr; + vcpu->arch.exception.error_code = error_code; + return; + } + + /* to check exception */ + prev_nr = vcpu->arch.exception.nr; + if (prev_nr == DF_VECTOR) { + /* triple fault -> shutdown */ + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + return; + } + class1 = exception_class(prev_nr); + class2 = exception_class(nr); + if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) + || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { + /* generate double fault per SDM Table 5-5 */ + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = true; + vcpu->arch.exception.nr = DF_VECTOR; + vcpu->arch.exception.error_code = 0; + } else + /* replace previous exception with a new one in a hope + that instruction re-execution will regenerate lost + exception */ + goto queue; +} + void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) { - WARN_ON(vcpu->arch.exception.pending); - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = false; - vcpu->arch.exception.nr = nr; + kvm_multiple_exception(vcpu, nr, false, 0); } EXPORT_SYMBOL_GPL(kvm_queue_exception); @@ -270,25 +339,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, u32 error_code) { ++vcpu->stat.pf_guest; - - if (vcpu->arch.exception.pending) { - switch(vcpu->arch.exception.nr) { - case DF_VECTOR: - /* triple fault -> shutdown */ - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); - return; - case PF_VECTOR: - vcpu->arch.exception.nr = DF_VECTOR; - vcpu->arch.exception.error_code = 0; - return; - default: - /* replace previous exception with a new one in a hope - that instruction re-execution will regenerate lost - exception */ - vcpu->arch.exception.pending = false; - break; - } - } vcpu->arch.cr2 = addr; kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); } @@ -301,11 +351,7 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { - WARN_ON(vcpu->arch.exception.pending); - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = true; - vcpu->arch.exception.nr = nr; - vcpu->arch.exception.error_code = error_code; + kvm_multiple_exception(vcpu, nr, true, error_code); } EXPORT_SYMBOL_GPL(kvm_queue_exception_e); @@ -383,41 +429,38 @@ out: void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { - if (cr0 & CR0_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", - cr0, vcpu->arch.cr0); + cr0 |= X86_CR0_ET; + +#ifdef CONFIG_X86_64 + if (cr0 & 0xffffffff00000000UL) { kvm_inject_gp(vcpu, 0); return; } +#endif + + cr0 &= ~CR0_RESERVED_BITS; if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { - printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); kvm_inject_gp(vcpu, 0); return; } if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { - printk(KERN_DEBUG "set_cr0: #GP, set PG flag " - "and a clear PE flag\n"); kvm_inject_gp(vcpu, 0); return; } if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { #ifdef CONFIG_X86_64 - if ((vcpu->arch.shadow_efer & EFER_LME)) { + if ((vcpu->arch.efer & EFER_LME)) { int cs_db, cs_l; if (!is_pae(vcpu)) { - printk(KERN_DEBUG "set_cr0: #GP, start paging " - "in long mode while PAE is disabled\n"); kvm_inject_gp(vcpu, 0); return; } kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); if (cs_l) { - printk(KERN_DEBUG "set_cr0: #GP, start paging " - "in long mode while CS.L == 1\n"); kvm_inject_gp(vcpu, 0); return; @@ -425,8 +468,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } else #endif if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - printk(KERN_DEBUG "set_cr0: #GP, pdptrs " - "reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } @@ -443,38 +484,33 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { - kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); + kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f)); } EXPORT_SYMBOL_GPL(kvm_lmsw); void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long old_cr4 = vcpu->arch.cr4; + unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; if (cr4 & CR4_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) { - printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " - "in long mode\n"); kvm_inject_gp(vcpu, 0); return; } } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } if (cr4 & X86_CR4_VMXE) { - printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); kvm_inject_gp(vcpu, 0); return; } @@ -495,21 +531,16 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (is_long_mode(vcpu)) { if (cr3 & CR3_L_MODE_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } } else { if (is_pae(vcpu)) { if (cr3 & CR3_PAE_RESERVED_BITS) { - printk(KERN_DEBUG - "set_cr3: #GP, reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { - printk(KERN_DEBUG "set_cr3: #GP, pdptrs " - "reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } @@ -541,7 +572,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr3); void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { if (cr8 & CR8_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); kvm_inject_gp(vcpu, 0); return; } @@ -575,9 +605,11 @@ static inline u32 bit(int bitno) * kvm-specific. Those are put in the beginning of the list. */ -#define KVM_SAVE_MSRS_BEGIN 2 +#define KVM_SAVE_MSRS_BEGIN 5 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + HV_X64_MSR_APIC_ASSIST_PAGE, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_K6_STAR, #ifdef CONFIG_X86_64 @@ -595,15 +627,12 @@ static u32 emulated_msrs[] = { static void set_efer(struct kvm_vcpu *vcpu, u64 efer) { if (efer & efer_reserved_bits) { - printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", - efer); kvm_inject_gp(vcpu, 0); return; } if (is_paging(vcpu) - && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { - printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); + && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) { kvm_inject_gp(vcpu, 0); return; } @@ -613,7 +642,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { - printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n"); kvm_inject_gp(vcpu, 0); return; } @@ -624,7 +652,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { - printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n"); kvm_inject_gp(vcpu, 0); return; } @@ -633,9 +660,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) kvm_x86_ops->set_efer(vcpu, efer); efer &= ~EFER_LMA; - efer |= vcpu->arch.shadow_efer & EFER_LMA; + efer |= vcpu->arch.efer & EFER_LMA; - vcpu->arch.shadow_efer = efer; + vcpu->arch.efer = efer; vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; kvm_mmu_reset_context(vcpu); @@ -670,7 +697,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { static int version; struct pvclock_wall_clock wc; - struct timespec now, sys, boot; + struct timespec boot; if (!wall_clock) return; @@ -685,9 +712,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) * wall clock specified here. guest system time equals host * system time for us, thus we must fill in host boot time here. */ - now = current_kernel_time(); - ktime_get_ts(&sys); - boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys)); + getboottime(&boot); wc.sec = boot.tv_sec; wc.nsec = boot.tv_nsec; @@ -762,6 +787,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) local_irq_save(flags); kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); ktime_get_ts(&ts); + monotonic_to_bootbased(&ts); local_irq_restore(flags); /* With all the info we got, fill in the values */ @@ -914,9 +940,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MC0_CTL + 4 * bank_num) { u32 offset = msr - MSR_IA32_MC0_CTL; - /* only 0 or all 1s can be written to IA32_MCi_CTL */ + /* only 0 or all 1s can be written to IA32_MCi_CTL + * some Linux kernels though clear bit 10 in bank 4 to + * workaround a BIOS/GART TBL issue on AMD K8s, ignore + * this to avoid an uncatched #GP in the guest + */ if ((offset & 0x3) == 0 && - data != 0 && data != ~(u64)0) + data != 0 && (data | (1 << 10)) != ~(u64)0) return -1; vcpu->arch.mce_banks[offset] = data; break; @@ -958,6 +988,100 @@ out: return r; } +static bool kvm_hv_hypercall_enabled(struct kvm *kvm) +{ + return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; +} + +static bool kvm_hv_msr_partition_wide(u32 msr) +{ + bool r = false; + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + case HV_X64_MSR_HYPERCALL: + r = true; + break; + } + + return r; +} + +static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + kvm->arch.hv_guest_os_id = data; + /* setting guest os id to zero disables hypercall page */ + if (!kvm->arch.hv_guest_os_id) + kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; + break; + case HV_X64_MSR_HYPERCALL: { + u64 gfn; + unsigned long addr; + u8 instructions[4]; + + /* if guest os id is not set hypercall should remain disabled */ + if (!kvm->arch.hv_guest_os_id) + break; + if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { + kvm->arch.hv_hypercall = data; + break; + } + gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return 1; + kvm_x86_ops->patch_hypercall(vcpu, instructions); + ((unsigned char *)instructions)[3] = 0xc3; /* ret */ + if (copy_to_user((void __user *)addr, instructions, 4)) + return 1; + kvm->arch.hv_hypercall = data; + break; + } + default: + pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); + return 1; + } + return 0; +} + +static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + switch (msr) { + case HV_X64_MSR_APIC_ASSIST_PAGE: { + unsigned long addr; + + if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { + vcpu->arch.hv_vapic = data; + break; + } + addr = gfn_to_hva(vcpu->kvm, data >> + HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); + if (kvm_is_error_hva(addr)) + return 1; + if (clear_user((void __user *)addr, PAGE_SIZE)) + return 1; + vcpu->arch.hv_vapic = data; + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); + default: + pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); + return 1; + } + + return 0; +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { @@ -1072,6 +1196,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " "0x%x data 0x%llx\n", msr, data); break; + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_lock(&vcpu->kvm->lock); + r = set_msr_hyperv_pw(vcpu, msr, data); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return set_msr_hyperv(vcpu, msr, data); + break; default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -1171,6 +1305,54 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } +static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + data = kvm->arch.hv_guest_os_id; + break; + case HV_X64_MSR_HYPERCALL: + data = kvm->arch.hv_hypercall; + break; + default: + pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + + *pdata = data; + return 0; +} + +static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + + switch (msr) { + case HV_X64_MSR_VP_INDEX: { + int r; + struct kvm_vcpu *v; + kvm_for_each_vcpu(r, v, vcpu->kvm) + if (v == vcpu) + data = r; + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); + default: + pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + *pdata = data; + return 0; +} + int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { u64 data; @@ -1222,7 +1404,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data |= (((uint64_t)4ULL) << 40); break; case MSR_EFER: - data = vcpu->arch.shadow_efer; + data = vcpu->arch.efer; break; case MSR_KVM_WALL_CLOCK: data = vcpu->kvm->arch.wall_clock; @@ -1237,6 +1419,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: return get_msr_mce(vcpu, msr, pdata); + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_lock(&vcpu->kvm->lock); + r = get_msr_hyperv_pw(vcpu, msr, pdata); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return get_msr_hyperv(vcpu, msr, pdata); + break; default: if (!ignore_msrs) { pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); @@ -1262,15 +1454,15 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data)) { - int i; + int i, idx; vcpu_load(vcpu); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); for (i = 0; i < msrs->nmsrs; ++i) if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); vcpu_put(vcpu); @@ -1352,6 +1544,11 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_XEN_HVM: case KVM_CAP_ADJUST_CLOCK: case KVM_CAP_VCPU_EVENTS: + case KVM_CAP_HYPERV: + case KVM_CAP_HYPERV_VAPIC: + case KVM_CAP_HYPERV_SPIN: + case KVM_CAP_PCI_SEGMENT: + case KVM_CAP_X86_ROBUST_SINGLESTEP: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1465,8 +1662,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); + kvm_x86_ops->vcpu_put(vcpu); } static int is_efer_nx(void) @@ -1531,6 +1728,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, cpuid_fix_nx_cap(vcpu); r = 0; kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); out_free: vfree(cpuid_entries); @@ -1553,6 +1751,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, goto out; vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); return 0; out: @@ -1595,12 +1794,15 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, u32 index, int *nent, int maxnent) { unsigned f_nx = is_efer_nx() ? F(NX) : 0; - unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0; #ifdef CONFIG_X86_64 + unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) + ? F(GBPAGES) : 0; unsigned f_lm = F(LM); #else + unsigned f_gbpages = 0; unsigned f_lm = 0; #endif + unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; /* cpuid 1.edx */ const u32 kvm_supported_word0_x86_features = @@ -1620,7 +1822,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | F(PAT) | F(PSE36) | 0 /* Reserved */ | f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | - F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ | + F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ const u32 kvm_supported_word4_x86_features = @@ -1867,7 +2069,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, return 0; if (mce->status & MCI_STATUS_UC) { if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || - !(vcpu->arch.cr4 & X86_CR4_MCE)) { + !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { printk(KERN_DEBUG "kvm: set_mce: " "injects mce exception while " "previous one is in progress!\n"); @@ -1913,7 +2115,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->sipi_vector = vcpu->arch.sipi_vector; - events->flags = 0; + events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING + | KVM_VCPUEVENT_VALID_SIPI_VECTOR); vcpu_put(vcpu); } @@ -1921,7 +2124,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { - if (events->flags) + if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING + | KVM_VCPUEVENT_VALID_SIPI_VECTOR)) return -EINVAL; vcpu_load(vcpu); @@ -1938,10 +2142,12 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, kvm_pic_clear_isr_ack(vcpu->kvm); vcpu->arch.nmi_injected = events->nmi.injected; - vcpu->arch.nmi_pending = events->nmi.pending; + if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) + vcpu->arch.nmi_pending = events->nmi.pending; kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); - vcpu->arch.sipi_vector = events->sipi_vector; + if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) + vcpu->arch.sipi_vector = events->sipi_vector; vcpu_put(vcpu); @@ -2157,14 +2363,14 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) return -EINVAL; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); spin_lock(&kvm->mmu_lock); kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; spin_unlock(&kvm->mmu_lock); - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return 0; } @@ -2173,13 +2379,35 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) return kvm->arch.n_alloc_mmu_pages; } +gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; + + aliases = rcu_dereference(kvm->arch.aliases); + + for (i = 0; i < aliases->naliases; ++i) { + alias = &aliases->aliases[i]; + if (alias->flags & KVM_ALIAS_INVALID) + continue; + if (gfn >= alias->base_gfn + && gfn < alias->base_gfn + alias->npages) + return alias->target_gfn + gfn - alias->base_gfn; + } + return gfn; +} + gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) { int i; struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; - for (i = 0; i < kvm->arch.naliases; ++i) { - alias = &kvm->arch.aliases[i]; + aliases = rcu_dereference(kvm->arch.aliases); + + for (i = 0; i < aliases->naliases; ++i) { + alias = &aliases->aliases[i]; if (gfn >= alias->base_gfn && gfn < alias->base_gfn + alias->npages) return alias->target_gfn + gfn - alias->base_gfn; @@ -2197,6 +2425,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, { int r, n; struct kvm_mem_alias *p; + struct kvm_mem_aliases *aliases, *old_aliases; r = -EINVAL; /* General sanity checks */ @@ -2213,26 +2442,48 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, < alias->target_phys_addr) goto out; - down_write(&kvm->slots_lock); - spin_lock(&kvm->mmu_lock); + r = -ENOMEM; + aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!aliases) + goto out; + + mutex_lock(&kvm->slots_lock); - p = &kvm->arch.aliases[alias->slot]; + /* invalidate any gfn reference in case of deletion/shrinking */ + memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); + aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID; + old_aliases = kvm->arch.aliases; + rcu_assign_pointer(kvm->arch.aliases, aliases); + synchronize_srcu_expedited(&kvm->srcu); + kvm_mmu_zap_all(kvm); + kfree(old_aliases); + + r = -ENOMEM; + aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!aliases) + goto out_unlock; + + memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); + + p = &aliases->aliases[alias->slot]; p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; p->npages = alias->memory_size >> PAGE_SHIFT; p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; + p->flags &= ~(KVM_ALIAS_INVALID); for (n = KVM_ALIAS_SLOTS; n > 0; --n) - if (kvm->arch.aliases[n - 1].npages) + if (aliases->aliases[n - 1].npages) break; - kvm->arch.naliases = n; + aliases->naliases = n; - spin_unlock(&kvm->mmu_lock); - kvm_mmu_zap_all(kvm); - - up_write(&kvm->slots_lock); - - return 0; + old_aliases = kvm->arch.aliases; + rcu_assign_pointer(kvm->arch.aliases, aliases); + synchronize_srcu_expedited(&kvm->srcu); + kfree(old_aliases); + r = 0; +out_unlock: + mutex_unlock(&kvm->slots_lock); out: return r; } @@ -2270,18 +2521,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - spin_lock(&pic_irqchip(kvm)->lock); + raw_spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + raw_spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_PIC_SLAVE: - spin_lock(&pic_irqchip(kvm)->lock); + raw_spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + raw_spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_IOAPIC: r = kvm_set_ioapic(kvm, &chip->chip.ioapic); @@ -2361,29 +2612,63 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - int r; - int n; + int r, i; struct kvm_memory_slot *memslot; - int is_dirty = 0; + unsigned long n; + unsigned long is_dirty = 0; + unsigned long *dirty_bitmap = NULL; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); - r = kvm_get_dirty_log(kvm, log, &is_dirty); - if (r) + r = -EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) + goto out; + + memslot = &kvm->memslots->memslots[log->slot]; + r = -ENOENT; + if (!memslot->dirty_bitmap) + goto out; + + n = kvm_dirty_bitmap_bytes(memslot); + + r = -ENOMEM; + dirty_bitmap = vmalloc(n); + if (!dirty_bitmap) goto out; + memset(dirty_bitmap, 0, n); + + for (i = 0; !is_dirty && i < n/sizeof(long); i++) + is_dirty = memslot->dirty_bitmap[i]; /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { + struct kvm_memslots *slots, *old_slots; + spin_lock(&kvm->mmu_lock); kvm_mmu_slot_remove_write_access(kvm, log->slot); spin_unlock(&kvm->mmu_lock); - memslot = &kvm->memslots[log->slot]; - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; - memset(memslot->dirty_bitmap, 0, n); + + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) + goto out_free; + + memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); + slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; + + old_slots = kvm->memslots; + rcu_assign_pointer(kvm->memslots, slots); + synchronize_srcu_expedited(&kvm->srcu); + dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; + kfree(old_slots); } + r = 0; + if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) + r = -EFAULT; +out_free: + vfree(dirty_bitmap); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -2466,6 +2751,8 @@ long kvm_arch_vm_ioctl(struct file *filp, if (vpic) { r = kvm_ioapic_init(kvm); if (r) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &vpic->dev); kfree(vpic); goto create_irqchip_unlock; } @@ -2477,10 +2764,8 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_setup_default_irq_routing(kvm); if (r) { mutex_lock(&kvm->irq_lock); - kfree(kvm->arch.vpic); - kfree(kvm->arch.vioapic); - kvm->arch.vpic = NULL; - kvm->arch.vioapic = NULL; + kvm_ioapic_destroy(kvm); + kvm_destroy_pic(kvm); mutex_unlock(&kvm->irq_lock); } create_irqchip_unlock: @@ -2496,7 +2781,7 @@ long kvm_arch_vm_ioctl(struct file *filp, sizeof(struct kvm_pit_config))) goto out; create_pit: - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); r = -EEXIST; if (kvm->arch.vpit) goto create_pit_unlock; @@ -2505,7 +2790,7 @@ long kvm_arch_vm_ioctl(struct file *filp, if (kvm->arch.vpit) r = 0; create_pit_unlock: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); break; case KVM_IRQ_LINE_STATUS: case KVM_IRQ_LINE: { @@ -2722,7 +3007,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) return 0; - return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); + return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); } static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) @@ -2731,17 +3016,44 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) return 0; - return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); + return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); } -static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu) +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + + gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + access |= PFERR_FETCH_MASK; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + access |= PFERR_WRITE_MASK; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + +/* uses this to access any guest's mapped memory without checking CPL */ +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); +} + +static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 access, + u32 *error) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); unsigned offset = addr & (PAGE_SIZE-1); unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -2764,14 +3076,37 @@ out: return r; } +/* used for instruction fetching */ +static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, + access | PFERR_FETCH_MASK, error); +} + +static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, + error); +} + +static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); +} + static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu) + struct kvm_vcpu *vcpu, u32 *error) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error); unsigned offset = addr & (PAGE_SIZE-1); unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -2801,6 +3136,7 @@ static int emulator_read_emulated(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + u32 error_code; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -2810,17 +3146,20 @@ static int emulator_read_emulated(unsigned long addr, return X86EMUL_CONTINUE; } - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); + + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, error_code); + return X86EMUL_PROPAGATE_FAULT; + } /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto mmio; - if (kvm_read_guest_virt(addr, val, bytes, vcpu) + if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; mmio: /* @@ -2859,11 +3198,12 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + u32 error_code; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, 2); + kvm_inject_page_fault(vcpu, addr, error_code); return X86EMUL_PROPAGATE_FAULT; } @@ -2927,7 +3267,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, char *kaddr; u64 val; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); if (gpa == UNMAPPED_GVA || (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -2964,35 +3304,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) int emulate_clts(struct kvm_vcpu *vcpu) { - kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); + kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); + kvm_x86_ops->fpu_activate(vcpu); return X86EMUL_CONTINUE; } int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) { - struct kvm_vcpu *vcpu = ctxt->vcpu; - - switch (dr) { - case 0 ... 3: - *dest = kvm_x86_ops->get_dr(vcpu, dr); - return X86EMUL_CONTINUE; - default: - pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr); - return X86EMUL_UNHANDLEABLE; - } + return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest); } int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) { unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; - int exception; - kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); - if (exception) { - /* FIXME: better handling */ - return X86EMUL_UNHANDLEABLE; - } - return X86EMUL_CONTINUE; + return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask); } void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) @@ -3006,7 +3332,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); + kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL); printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); @@ -3014,7 +3340,8 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); static struct x86_emulate_ops emulate_ops = { - .read_std = kvm_read_guest_virt, + .read_std = kvm_read_guest_virt_system, + .fetch = kvm_fetch_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated, @@ -3057,8 +3384,9 @@ int emulate_instruction(struct kvm_vcpu *vcpu, vcpu->arch.emulate_ctxt.vcpu = vcpu; vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); vcpu->arch.emulate_ctxt.mode = + (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_REAL : cs_l + ? X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; @@ -3150,12 +3478,17 @@ static int pio_copy_data(struct kvm_vcpu *vcpu) gva_t q = vcpu->arch.pio.guest_gva; unsigned bytes; int ret; + u32 error_code; bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; if (vcpu->arch.pio.in) - ret = kvm_write_guest_virt(q, p, bytes, vcpu); + ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code); else - ret = kvm_read_guest_virt(q, p, bytes, vcpu); + ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code); + + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, q, error_code); + return ret; } @@ -3176,7 +3509,7 @@ int complete_pio(struct kvm_vcpu *vcpu) if (io->in) { r = pio_copy_data(vcpu); if (r) - return r; + goto out; } delta = 1; @@ -3203,7 +3536,7 @@ int complete_pio(struct kvm_vcpu *vcpu) kvm_register_write(vcpu, VCPU_REGS_RSI, val); } } - +out: io->count -= io->cur_count; io->cur_count = 0; @@ -3216,11 +3549,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) int r; if (vcpu->arch.pio.in) - r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, + r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, vcpu->arch.pio.size, pd); else - r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); + r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, + vcpu->arch.pio.port, vcpu->arch.pio.size, + pd); return r; } @@ -3231,7 +3565,7 @@ static int pio_string_write(struct kvm_vcpu *vcpu) int i, r = 0; for (i = 0; i < io->cur_count; i++) { - if (kvm_io_bus_write(&vcpu->kvm->pio_bus, + if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, io->port, io->size, pd)) { r = -EOPNOTSUPP; break; @@ -3245,6 +3579,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) { unsigned long val; + trace_kvm_pio(!in, port, size, 1); + vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = vcpu->arch.pio.size = size; @@ -3256,11 +3592,10 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) vcpu->arch.pio.down = 0; vcpu->arch.pio.rep = 0; - trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, - size, 1); - - val = kvm_register_read(vcpu, VCPU_REGS_RAX); - memcpy(vcpu->arch.pio_data, &val, 4); + if (!vcpu->arch.pio.in) { + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(vcpu->arch.pio_data, &val, 4); + } if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { complete_pio(vcpu); @@ -3277,6 +3612,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, unsigned now, in_page; int ret = 0; + trace_kvm_pio(!in, port, size, count); + vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = vcpu->arch.pio.size = size; @@ -3288,9 +3625,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, vcpu->arch.pio.down = down; vcpu->arch.pio.rep = rep; - trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, - size, count); - if (!count) { kvm_x86_ops->skip_emulated_instruction(vcpu); return 1; @@ -3322,10 +3656,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, if (!vcpu->arch.pio.in) { /* string PIO write */ ret = pio_copy_data(vcpu); - if (ret == X86EMUL_PROPAGATE_FAULT) { - kvm_inject_gp(vcpu, 0); + if (ret == X86EMUL_PROPAGATE_FAULT) return 1; - } if (ret == 0 && !pio_string_write(vcpu)) { complete_pio(vcpu); if (vcpu->arch.pio.count == 0) @@ -3484,11 +3816,76 @@ static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, return a0 | ((gpa_t)a1 << 32); } +int kvm_hv_hypercall(struct kvm_vcpu *vcpu) +{ + u64 param, ingpa, outgpa, ret; + uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; + bool fast, longmode; + int cs_db, cs_l; + + /* + * hypercall generates UD from non zero cpl and real mode + * per HYPER-V spec + */ + if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 0; + } + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + longmode = is_long_mode(vcpu) && cs_l == 1; + + if (!longmode) { + param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); + ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); + outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); + } +#ifdef CONFIG_X86_64 + else { + param = kvm_register_read(vcpu, VCPU_REGS_RCX); + ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); + outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); + } +#endif + + code = param & 0xffff; + fast = (param >> 16) & 0x1; + rep_cnt = (param >> 32) & 0xfff; + rep_idx = (param >> 48) & 0xfff; + + trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); + + switch (code) { + case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: + kvm_vcpu_on_spin(vcpu); + break; + default: + res = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + } + + ret = res | (((u64)rep_done & 0xfff) << 32); + if (longmode) { + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); + } else { + kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); + kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); + } + + return 1; +} + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; int r = 1; + if (kvm_hv_hypercall_enabled(vcpu->kvm)) + return kvm_hv_hypercall(vcpu); + nr = kvm_register_read(vcpu, VCPU_REGS_RAX); a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); @@ -3531,10 +3928,8 @@ EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); int kvm_fix_hypercall(struct kvm_vcpu *vcpu) { char instruction[3]; - int ret = 0; unsigned long rip = kvm_rip_read(vcpu); - /* * Blow out the MMU to ensure that no other VCPU has an active mapping * to ensure that the updated hypercall appears atomically across all @@ -3543,11 +3938,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) kvm_mmu_zap_all(vcpu->kvm); kvm_x86_ops->patch_hypercall(vcpu, instruction); - if (emulator_write_emulated(rip, instruction, 3, vcpu) - != X86EMUL_CONTINUE) - ret = -EFAULT; - return ret; + return emulator_write_emulated(rip, instruction, 3, vcpu); } static u64 mk_cr_64(u64 curr_cr, u32 new_val) @@ -3580,10 +3972,9 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) { unsigned long value; - kvm_x86_ops->decache_cr4_guest_bits(vcpu); switch (cr) { case 0: - value = vcpu->arch.cr0; + value = kvm_read_cr0(vcpu); break; case 2: value = vcpu->arch.cr2; @@ -3592,7 +3983,7 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) value = vcpu->arch.cr3; break; case 4: - value = vcpu->arch.cr4; + value = kvm_read_cr4(vcpu); break; case 8: value = kvm_get_cr8(vcpu); @@ -3610,7 +4001,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, { switch (cr) { case 0: - kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); + kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); *rflags = kvm_get_rflags(vcpu); break; case 2: @@ -3620,7 +4011,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, kvm_set_cr3(vcpu, val); break; case 4: - kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); + kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); break; case 8: kvm_set_cr8(vcpu, val & 0xfUL); @@ -3687,6 +4078,7 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, } return best; } +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) { @@ -3770,14 +4162,15 @@ static void vapic_enter(struct kvm_vcpu *vcpu) static void vapic_exit(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; + int idx; if (!apic || !apic->vapic_addr) return; - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_release_page_dirty(apic->vapic_page); mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); } static void update_cr8_intercept(struct kvm_vcpu *vcpu) @@ -3873,12 +4266,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 0; goto out; } + if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { + vcpu->fpu_active = 0; + kvm_x86_ops->fpu_deactivate(vcpu); + } } preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); - kvm_load_guest_fpu(vcpu); + if (vcpu->fpu_active) + kvm_load_guest_fpu(vcpu); local_irq_disable(); @@ -3906,7 +4304,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_lapic_sync_to_vapic(vcpu); } - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); kvm_guest_enter(); @@ -3948,7 +4346,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_enable(); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); /* * Profile KVM exit RIPs: @@ -3970,6 +4368,7 @@ out: static int __vcpu_run(struct kvm_vcpu *vcpu) { int r; + struct kvm *kvm = vcpu->kvm; if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { pr_debug("vcpu %d received sipi with vector # %x\n", @@ -3981,7 +4380,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); vapic_enter(vcpu); r = 1; @@ -3989,9 +4388,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) r = vcpu_enter_guest(vcpu); else { - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) { switch(vcpu->arch.mp_state) { @@ -4026,13 +4425,13 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) ++vcpu->stat.signal_exits; } if (need_resched()) { - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_resched(vcpu); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); } } - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); post_kvm_run_save(vcpu); vapic_exit(vcpu); @@ -4062,7 +4461,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_set_cr8(vcpu, kvm_run->cr8); if (vcpu->arch.pio.cur_count) { + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = complete_pio(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (r) goto out; } @@ -4071,10 +4472,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->mmio_read_completed = 1; vcpu->mmio_needed = 0; - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, EMULTYPE_NO_DECODE); - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (r == EMULATE_DO_MMIO) { /* * Read-modify-write. Back to userspace. @@ -4201,13 +4602,12 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs->gdt.limit = dt.limit; sregs->gdt.base = dt.base; - kvm_x86_ops->decache_cr4_guest_bits(vcpu); - sregs->cr0 = vcpu->arch.cr0; + sregs->cr0 = kvm_read_cr0(vcpu); sregs->cr2 = vcpu->arch.cr2; sregs->cr3 = vcpu->arch.cr3; - sregs->cr4 = vcpu->arch.cr4; + sregs->cr4 = kvm_read_cr4(vcpu); sregs->cr8 = kvm_get_cr8(vcpu); - sregs->efer = vcpu->arch.shadow_efer; + sregs->efer = vcpu->arch.efer; sregs->apic_base = kvm_get_apic_base(vcpu); memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); @@ -4295,14 +4695,23 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, { struct descriptor_table dtable; u16 index = selector >> 3; + int ret; + u32 err; + gva_t addr; get_segment_descriptor_dtable(vcpu, selector, &dtable); if (dtable.limit < index * 8 + 7) { kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); - return 1; + return X86EMUL_PROPAGATE_FAULT; } - return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); + addr = dtable.base + index * 8; + ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc), + vcpu, &err); + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, addr, err); + + return ret; } /* allowed just for 8 bytes segments */ @@ -4316,15 +4725,23 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, if (dtable.limit < index * 8 + 7) return 1; - return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); + return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL); +} + +static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu, + struct desc_struct *seg_desc) +{ + u32 base_addr = get_desc_base(seg_desc); + + return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL); } -static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu, +static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu, struct desc_struct *seg_desc) { u32 base_addr = get_desc_base(seg_desc); - return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); + return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL); } static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) @@ -4335,18 +4752,6 @@ static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) return kvm_seg.selector; } -static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, - u16 selector, - struct kvm_segment *kvm_seg) -{ - struct desc_struct seg_desc; - - if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) - return 1; - seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); - return 0; -} - static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) { struct kvm_segment segvar = { @@ -4364,7 +4769,7 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se .unusable = 0, }; kvm_x86_ops->set_segment(vcpu, &segvar, seg); - return 0; + return X86EMUL_CONTINUE; } static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) @@ -4374,24 +4779,112 @@ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); } -int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, - int type_bits, int seg) +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg) { struct kvm_segment kvm_seg; + struct desc_struct seg_desc; + u8 dpl, rpl, cpl; + unsigned err_vec = GP_VECTOR; + u32 err_code = 0; + bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ + int ret; - if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) + if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu)) return kvm_load_realmode_segment(vcpu, selector, seg); - if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) - return 1; - kvm_seg.type |= type_bits; - if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && - seg != VCPU_SREG_LDTR) - if (!kvm_seg.s) - kvm_seg.unusable = 1; + /* NULL selector is not valid for TR, CS and SS */ + if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) + && null_selector) + goto exception; + + /* TR should be in GDT only */ + if (seg == VCPU_SREG_TR && (selector & (1 << 2))) + goto exception; + + ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc); + if (ret) + return ret; + + seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg); + + if (null_selector) { /* for NULL selector skip all following checks */ + kvm_seg.unusable = 1; + goto load; + } + + err_code = selector & 0xfffc; + err_vec = GP_VECTOR; + + /* can't load system descriptor into segment selecor */ + if (seg <= VCPU_SREG_GS && !kvm_seg.s) + goto exception; + if (!kvm_seg.present) { + err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; + goto exception; + } + + rpl = selector & 3; + dpl = kvm_seg.dpl; + cpl = kvm_x86_ops->get_cpl(vcpu); + + switch (seg) { + case VCPU_SREG_SS: + /* + * segment is not a writable data segment or segment + * selector's RPL != CPL or segment selector's RPL != CPL + */ + if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl) + goto exception; + break; + case VCPU_SREG_CS: + if (!(kvm_seg.type & 8)) + goto exception; + + if (kvm_seg.type & 4) { + /* conforming */ + if (dpl > cpl) + goto exception; + } else { + /* nonconforming */ + if (rpl > cpl || dpl != cpl) + goto exception; + } + /* CS(RPL) <- CPL */ + selector = (selector & 0xfffc) | cpl; + break; + case VCPU_SREG_TR: + if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9)) + goto exception; + break; + case VCPU_SREG_LDTR: + if (kvm_seg.s || kvm_seg.type != 2) + goto exception; + break; + default: /* DS, ES, FS, or GS */ + /* + * segment is not a data or readable code segment or + * ((segment is a data or nonconforming code segment) + * and (both RPL and CPL > DPL)) + */ + if ((kvm_seg.type & 0xa) == 0x8 || + (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl))) + goto exception; + break; + } + + if (!kvm_seg.unusable && kvm_seg.s) { + /* mark segment as accessed */ + kvm_seg.type |= 1; + seg_desc.type |= 1; + save_guest_segment_descriptor(vcpu, selector, &seg_desc); + } +load: kvm_set_segment(vcpu, &kvm_seg, seg); - return 0; + return X86EMUL_CONTINUE; +exception: + kvm_queue_exception_e(vcpu, err_vec, err_code); + return X86EMUL_PROPAGATE_FAULT; } static void save_state_to_tss32(struct kvm_vcpu *vcpu, @@ -4417,6 +4910,14 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu, tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); } +static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg) +{ + struct kvm_segment kvm_seg; + kvm_get_segment(vcpu, &kvm_seg, seg); + kvm_seg.selector = sel; + kvm_set_segment(vcpu, &kvm_seg, seg); +} + static int load_state_from_tss32(struct kvm_vcpu *vcpu, struct tss_segment_32 *tss) { @@ -4434,25 +4935,41 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); - if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS); + kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS); + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) + if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) + if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) + if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) + if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) + if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) + if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS)) return 1; return 0; } @@ -4492,19 +5009,33 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu, kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); - if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) + if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) + if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) + if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) + if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) return 1; return 0; } @@ -4526,7 +5057,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, sizeof tss_segment_16)) goto out; - if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), &tss_segment_16, sizeof tss_segment_16)) goto out; @@ -4534,7 +5065,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, tss_segment_16.prev_task_link = old_tss_sel; if (kvm_write_guest(vcpu->kvm, - get_tss_base_addr(vcpu, nseg_desc), + get_tss_base_addr_write(vcpu, nseg_desc), &tss_segment_16.prev_task_link, sizeof tss_segment_16.prev_task_link)) goto out; @@ -4565,7 +5096,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, sizeof tss_segment_32)) goto out; - if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), &tss_segment_32, sizeof tss_segment_32)) goto out; @@ -4573,7 +5104,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, tss_segment_32.prev_task_link = old_tss_sel; if (kvm_write_guest(vcpu->kvm, - get_tss_base_addr(vcpu, nseg_desc), + get_tss_base_addr_write(vcpu, nseg_desc), &tss_segment_32.prev_task_link, sizeof tss_segment_32.prev_task_link)) goto out; @@ -4595,8 +5126,9 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) int ret = 0; u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); + u32 desc_limit; - old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); + old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL); /* FIXME: Handle errors. Failure to read either TSS or their * descriptors should generate a pagefault. @@ -4617,7 +5149,10 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) } } - if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) { + desc_limit = get_desc_limit(&nseg_desc); + if (!nseg_desc.p || + ((desc_limit < 0x67 && (nseg_desc.type & 8)) || + desc_limit < 0x2b)) { kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); return 1; } @@ -4655,7 +5190,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) &nseg_desc); } - kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); + kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS); seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); tr_seg.type = 11; kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); @@ -4686,17 +5221,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_set_cr8(vcpu, sregs->cr8); - mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; + mmu_reset_needed |= vcpu->arch.efer != sregs->efer; kvm_x86_ops->set_efer(vcpu, sregs->efer); kvm_set_apic_base(vcpu, sregs->apic_base); - kvm_x86_ops->decache_cr4_guest_bits(vcpu); - - mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; + mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; kvm_x86_ops->set_cr0(vcpu, sregs->cr0); vcpu->arch.cr0 = sregs->cr0; - mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; + mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); if (!is_long_mode(vcpu) && is_pae(vcpu)) { load_pdptrs(vcpu, vcpu->arch.cr3); @@ -4731,7 +5264,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, /* Older userspace won't unhalt the vcpu on reset. */ if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && - !(vcpu->arch.cr0 & X86_CR0_PE)) + !is_protmode(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; vcpu_put(vcpu); @@ -4829,11 +5362,12 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, { unsigned long vaddr = tr->linear_address; gpa_t gpa; + int idx; vcpu_load(vcpu); - down_read(&vcpu->kvm->slots_lock); - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); - up_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); + gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); + srcu_read_unlock(&vcpu->kvm->srcu, idx); tr->physical_address = gpa; tr->valid = gpa != UNMAPPED_GVA; tr->writeable = 1; @@ -4914,14 +5448,14 @@ EXPORT_SYMBOL_GPL(fx_init); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) + if (vcpu->guest_fpu_loaded) return; vcpu->guest_fpu_loaded = 1; kvm_fx_save(&vcpu->arch.host_fx_image); kvm_fx_restore(&vcpu->arch.guest_fx_image); + trace_kvm_fpu(1); } -EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { @@ -4932,8 +5466,9 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) kvm_fx_save(&vcpu->arch.guest_fx_image); kvm_fx_restore(&vcpu->arch.host_fx_image); ++vcpu->stat.fpu_reload; + set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); + trace_kvm_fpu(0); } -EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { @@ -5068,12 +5603,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) GFP_KERNEL); if (!vcpu->arch.mce_banks) { r = -ENOMEM; - goto fail_mmu_destroy; + goto fail_free_lapic; } vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; return 0; - +fail_free_lapic: + kvm_free_lapic(vcpu); fail_mmu_destroy: kvm_mmu_destroy(vcpu); fail_free_pio_data: @@ -5084,10 +5620,13 @@ fail: void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { + int idx; + + kfree(vcpu->arch.mce_banks); kvm_free_lapic(vcpu); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_mmu_destroy(vcpu); - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); } @@ -5098,6 +5637,12 @@ struct kvm *kvm_arch_create_vm(void) if (!kvm) return ERR_PTR(-ENOMEM); + kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!kvm->arch.aliases) { + kfree(kvm); + return ERR_PTR(-ENOMEM); + } + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); @@ -5154,16 +5699,18 @@ void kvm_arch_destroy_vm(struct kvm *kvm) put_page(kvm->arch.apic_access_page); if (kvm->arch.ept_identity_pagetable) put_page(kvm->arch.ept_identity_pagetable); + cleanup_srcu_struct(&kvm->srcu); + kfree(kvm->arch.aliases); kfree(kvm); } -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, int user_alloc) { - int npages = mem->memory_size >> PAGE_SHIFT; - struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; + int npages = memslot->npages; /*To keep backward compatibility with older userspace, *x86 needs to hanlde !user_alloc case. @@ -5183,26 +5730,35 @@ int kvm_arch_set_memory_region(struct kvm *kvm, if (IS_ERR((void *)userspace_addr)) return PTR_ERR((void *)userspace_addr); - /* set userspace_addr atomically for kvm_hva_to_rmapp */ - spin_lock(&kvm->mmu_lock); memslot->userspace_addr = userspace_addr; - spin_unlock(&kvm->mmu_lock); - } else { - if (!old.user_alloc && old.rmap) { - int ret; - - down_write(¤t->mm->mmap_sem); - ret = do_munmap(current->mm, old.userspace_addr, - old.npages * PAGE_SIZE); - up_write(¤t->mm->mmap_sem); - if (ret < 0) - printk(KERN_WARNING - "kvm_vm_ioctl_set_memory_region: " - "failed to munmap memory\n"); - } } } + + return 0; +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + + int npages = mem->memory_size >> PAGE_SHIFT; + + if (!user_alloc && !old.user_alloc && old.rmap && !npages) { + int ret; + + down_write(¤t->mm->mmap_sem); + ret = do_munmap(current->mm, old.userspace_addr, + old.npages * PAGE_SIZE); + up_write(¤t->mm->mmap_sem); + if (ret < 0) + printk(KERN_WARNING + "kvm_vm_ioctl_set_memory_region: " + "failed to munmap memory\n"); + } + spin_lock(&kvm->mmu_lock); if (!kvm->arch.n_requested_mmu_pages) { unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); @@ -5211,8 +5767,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm, kvm_mmu_slot_remove_write_access(kvm, mem->slot); spin_unlock(&kvm->mmu_lock); - - return 0; } void kvm_arch_flush_shadow(struct kvm *kvm) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 5eadea585d2..2d101639bd8 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -2,6 +2,7 @@ #define ARCH_X86_KVM_X86_H #include <linux/kvm_host.h> +#include "kvm_cache_regs.h" static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) { @@ -35,4 +36,33 @@ static inline bool kvm_exception_is_soft(unsigned int nr) struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); +static inline bool is_protmode(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, X86_CR0_PE); +} + +static inline int is_long_mode(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + return vcpu->arch.efer & EFER_LMA; +#else + return 0; +#endif +} + +static inline int is_pae(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); +} + +static inline int is_pse(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, X86_CR4_PSE); +} + +static inline int is_paging(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, X86_CR0_PG); +} + #endif diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 7e59dc1d3fc..2bdf628066b 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -115,7 +115,7 @@ static void async_hcall(unsigned long call, unsigned long arg1, local_irq_save(flags); if (lguest_data.hcall_status[next_call] != 0xFF) { /* Table full, so do normal hcall which will flush table. */ - kvm_hypercall4(call, arg1, arg2, arg3, arg4); + hcall(call, arg1, arg2, arg3, arg4); } else { lguest_data.hcalls[next_call].arg0 = call; lguest_data.hcalls[next_call].arg1 = arg1; @@ -145,46 +145,45 @@ static void async_hcall(unsigned long call, unsigned long arg1, * So, when we're in lazy mode, we call async_hcall() to store the call for * future processing: */ -static void lazy_hcall1(unsigned long call, - unsigned long arg1) +static void lazy_hcall1(unsigned long call, unsigned long arg1) { if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - kvm_hypercall1(call, arg1); + hcall(call, arg1, 0, 0, 0); else async_hcall(call, arg1, 0, 0, 0); } /* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ static void lazy_hcall2(unsigned long call, - unsigned long arg1, - unsigned long arg2) + unsigned long arg1, + unsigned long arg2) { if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - kvm_hypercall2(call, arg1, arg2); + hcall(call, arg1, arg2, 0, 0); else async_hcall(call, arg1, arg2, 0, 0); } static void lazy_hcall3(unsigned long call, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3) + unsigned long arg1, + unsigned long arg2, + unsigned long arg3) { if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - kvm_hypercall3(call, arg1, arg2, arg3); + hcall(call, arg1, arg2, arg3, 0); else async_hcall(call, arg1, arg2, arg3, 0); } #ifdef CONFIG_X86_PAE static void lazy_hcall4(unsigned long call, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, - unsigned long arg4) + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4) { if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - kvm_hypercall4(call, arg1, arg2, arg3, arg4); + hcall(call, arg1, arg2, arg3, arg4); else async_hcall(call, arg1, arg2, arg3, arg4); } @@ -196,13 +195,13 @@ static void lazy_hcall4(unsigned long call, :*/ static void lguest_leave_lazy_mmu_mode(void) { - kvm_hypercall0(LHCALL_FLUSH_ASYNC); + hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); paravirt_leave_lazy_mmu(); } static void lguest_end_context_switch(struct task_struct *next) { - kvm_hypercall0(LHCALL_FLUSH_ASYNC); + hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); paravirt_end_context_switch(next); } @@ -286,7 +285,7 @@ static void lguest_write_idt_entry(gate_desc *dt, /* Keep the local copy up to date. */ native_write_idt_entry(dt, entrynum, g); /* Tell Host about this new entry. */ - kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); + hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0); } /* @@ -300,7 +299,7 @@ static void lguest_load_idt(const struct desc_ptr *desc) struct desc_struct *idt = (void *)desc->address; for (i = 0; i < (desc->size+1)/8; i++) - kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b); + hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0); } /* @@ -321,7 +320,7 @@ static void lguest_load_gdt(const struct desc_ptr *desc) struct desc_struct *gdt = (void *)desc->address; for (i = 0; i < (desc->size+1)/8; i++) - kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); + hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0); } /* @@ -334,8 +333,8 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, { native_write_gdt_entry(dt, entrynum, desc, type); /* Tell Host about this new entry. */ - kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, entrynum, - dt[entrynum].a, dt[entrynum].b); + hcall(LHCALL_LOAD_GDT_ENTRY, entrynum, + dt[entrynum].a, dt[entrynum].b, 0); } /* @@ -931,7 +930,7 @@ static int lguest_clockevent_set_next_event(unsigned long delta, } /* Please wake us this far in the future. */ - kvm_hypercall1(LHCALL_SET_CLOCKEVENT, delta); + hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0); return 0; } @@ -942,7 +941,7 @@ static void lguest_clockevent_set_mode(enum clock_event_mode mode, case CLOCK_EVT_MODE_UNUSED: case CLOCK_EVT_MODE_SHUTDOWN: /* A 0 argument shuts the clock down. */ - kvm_hypercall0(LHCALL_SET_CLOCKEVENT); + hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0); break; case CLOCK_EVT_MODE_ONESHOT: /* This is what we expect. */ @@ -1100,7 +1099,7 @@ static void set_lguest_basic_apic_ops(void) /* STOP! Until an interrupt comes in. */ static void lguest_safe_halt(void) { - kvm_hypercall0(LHCALL_HALT); + hcall(LHCALL_HALT, 0, 0, 0, 0); } /* @@ -1112,8 +1111,8 @@ static void lguest_safe_halt(void) */ static void lguest_power_off(void) { - kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), - LGUEST_SHUTDOWN_POWEROFF); + hcall(LHCALL_SHUTDOWN, __pa("Power down"), + LGUEST_SHUTDOWN_POWEROFF, 0, 0); } /* @@ -1123,7 +1122,7 @@ static void lguest_power_off(void) */ static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) { - kvm_hypercall2(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF); + hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0); /* The hcall won't return, but to keep gcc happy, we're "done". */ return NOTIFY_DONE; } @@ -1162,7 +1161,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) len = sizeof(scratch) - 1; scratch[len] = '\0'; memcpy(scratch, buf, len); - kvm_hypercall1(LHCALL_NOTIFY, __pa(scratch)); + hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0, 0); /* This routine returns the number of bytes actually written. */ return len; @@ -1174,7 +1173,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) */ static void lguest_restart(char *reason) { - kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); + hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0); } /*G:050 diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index 27eac0faee4..4f420c2f2d5 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -32,7 +32,7 @@ ENTRY(lguest_entry) */ movl $LHCALL_LGUEST_INIT, %eax movl $lguest_data - __PAGE_OFFSET, %ebx - .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ + int $LGUEST_TRAP_ENTRY /* Set up the initial stack so we can run C code. */ movl $(init_thread_union+THREAD_SIZE),%esp diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 05d686bbbe9..3ac4a8ade62 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -14,7 +14,7 @@ $(obj)/inat.o: $(obj)/inat-tables.c clean-files := inat-tables.c -obj-$(CONFIG_SMP) += msr-smp.o +obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o lib-y := delay.o lib-y += thunk_$(BITS).o @@ -35,9 +35,10 @@ ifneq ($(CONFIG_X86_CMPXCHG64),y) endif lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o else - obj-y += io_64.o iomap_copy_64.o + obj-y += iomap_copy_64.o lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o lib-y += thunk_64.o clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o + lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o endif diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c new file mode 100644 index 00000000000..a3c66887503 --- /dev/null +++ b/arch/x86/lib/cache-smp.c @@ -0,0 +1,19 @@ +#include <linux/smp.h> +#include <linux/module.h> + +static void __wbinvd(void *dummy) +{ + wbinvd(); +} + +void wbinvd_on_cpu(int cpu) +{ + smp_call_function_single(cpu, __wbinvd, NULL, 1); +} +EXPORT_SYMBOL(wbinvd_on_cpu); + +int wbinvd_on_all_cpus(void) +{ + return on_each_cpu(__wbinvd, NULL, 1); +} +EXPORT_SYMBOL(wbinvd_on_all_cpus); diff --git a/arch/x86/lib/io_64.c b/arch/x86/lib/io_64.c deleted file mode 100644 index 3f1eb59b5f0..00000000000 --- a/arch/x86/lib/io_64.c +++ /dev/null @@ -1,25 +0,0 @@ -#include <linux/string.h> -#include <linux/module.h> -#include <asm/io.h> - -void __memcpy_toio(unsigned long dst, const void *src, unsigned len) -{ - __inline_memcpy((void *)dst, src, len); -} -EXPORT_SYMBOL(__memcpy_toio); - -void __memcpy_fromio(void *dst, unsigned long src, unsigned len) -{ - __inline_memcpy(dst, (const void *)src, len); -} -EXPORT_SYMBOL(__memcpy_fromio); - -void memset_io(volatile void __iomem *a, int b, size_t c) -{ - /* - * TODO: memset can mangle the IO patterns quite a bit. - * perhaps it would be better to use a dumb one: - */ - memset((void *)a, b, c); -} -EXPORT_SYMBOL(memset_io); diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S new file mode 100644 index 00000000000..15acecf0d7a --- /dev/null +++ b/arch/x86/lib/rwsem_64.S @@ -0,0 +1,81 @@ +/* + * x86-64 rwsem wrappers + * + * This interfaces the inline asm code to the slow-path + * C routines. We need to save the call-clobbered regs + * that the asm does not mark as clobbered, and move the + * argument from %rax to %rdi. + * + * NOTE! We don't need to save %rax, because the functions + * will always return the semaphore pointer in %rax (which + * is also the input argument to these helpers) + * + * The following can clobber %rdx because the asm clobbers it: + * call_rwsem_down_write_failed + * call_rwsem_wake + * but %rdi, %rsi, %rcx, %r8-r11 always need saving. + */ + +#include <linux/linkage.h> +#include <asm/rwlock.h> +#include <asm/alternative-asm.h> +#include <asm/frame.h> +#include <asm/dwarf2.h> + +#define save_common_regs \ + pushq %rdi; \ + pushq %rsi; \ + pushq %rcx; \ + pushq %r8; \ + pushq %r9; \ + pushq %r10; \ + pushq %r11 + +#define restore_common_regs \ + popq %r11; \ + popq %r10; \ + popq %r9; \ + popq %r8; \ + popq %rcx; \ + popq %rsi; \ + popq %rdi + +/* Fix up special calling conventions */ +ENTRY(call_rwsem_down_read_failed) + save_common_regs + pushq %rdx + movq %rax,%rdi + call rwsem_down_read_failed + popq %rdx + restore_common_regs + ret + ENDPROC(call_rwsem_down_read_failed) + +ENTRY(call_rwsem_down_write_failed) + save_common_regs + movq %rax,%rdi + call rwsem_down_write_failed + restore_common_regs + ret + ENDPROC(call_rwsem_down_write_failed) + +ENTRY(call_rwsem_wake) + decw %dx /* do nothing if still outstanding active readers */ + jnz 1f + save_common_regs + movq %rax,%rdi + call rwsem_wake + restore_common_regs +1: ret + ENDPROC(call_rwsem_wake) + +/* Fix up special calling conventions */ +ENTRY(call_rwsem_downgrade_wake) + save_common_regs + pushq %rdx + movq %rax,%rdi + call rwsem_downgrade_wake + popq %rdx + restore_common_regs + ret + ENDPROC(call_rwsem_downgrade_wake) diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 71da1bca13c..738e6593799 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -18,7 +18,7 @@ static inline pte_t gup_get_pte(pte_t *ptep) #else /* * With get_user_pages_fast, we walk down the pagetables without taking - * any locks. For this we would like to load the pointers atoimcally, + * any locks. For this we would like to load the pointers atomically, * but that is not possible (without expensive cmpxchg8b) on PAE. What * we do have is the guarantee that a pte will only either go from not * present to present, or present to not present or both -- it will not diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index f46c340727b..069ce7c37c0 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -9,7 +9,6 @@ #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/pagemap.h> -#include <linux/slab.h> #include <linux/err.h> #include <linux/sysctl.h> #include <asm/mman.h> diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index d406c523901..b278535b14a 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1,3 +1,4 @@ +#include <linux/gfp.h> #include <linux/initrd.h> #include <linux/ioport.h> #include <linux/swap.h> @@ -266,16 +267,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, if (!after_bootmem) find_early_table_space(end, use_pse, use_gbpages); -#ifdef CONFIG_X86_32 - for (i = 0; i < nr_range; i++) - kernel_physical_mapping_init(mr[i].start, mr[i].end, - mr[i].page_size_mask); - ret = end; -#else /* CONFIG_X86_64 */ for (i = 0; i < nr_range; i++) ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, mr[i].page_size_mask); -#endif #ifdef CONFIG_X86_32 early_ioremap_page_table_range_init(); @@ -338,11 +332,23 @@ int devmem_is_allowed(unsigned long pagenr) void free_init_pages(char *what, unsigned long begin, unsigned long end) { - unsigned long addr = begin; + unsigned long addr; + unsigned long begin_aligned, end_aligned; + + /* Make sure boundaries are page aligned */ + begin_aligned = PAGE_ALIGN(begin); + end_aligned = end & PAGE_MASK; - if (addr >= end) + if (WARN_ON(begin_aligned != begin || end_aligned != end)) { + begin = begin_aligned; + end = end_aligned; + } + + if (begin >= end) return; + addr = begin; + /* * If debugging page accesses then do not free this memory but * mark them not present - any buggy init-section access will @@ -350,7 +356,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) */ #ifdef CONFIG_DEBUG_PAGEALLOC printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", - begin, PAGE_ALIGN(end)); + begin, end); set_memory_np(begin, (end - begin) >> PAGE_SHIFT); #else /* @@ -365,8 +371,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) for (; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); - memset((void *)(addr & ~(PAGE_SIZE-1)), - POISON_FREE_INITMEM, PAGE_SIZE); + memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); free_page(addr); totalram_pages++; } @@ -383,6 +388,15 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_init_pages("initrd memory", start, end); + /* + * end could be not aligned, and We can not align that, + * decompresser could be confused by aligned initrd_end + * We already reserve the end partial page before in + * - i386_start_kernel() + * - x86_64_start_kernel() + * - relocate_initrd() + * So here We can do PAGE_ALIGN() safely to get partial page to be freed + */ + free_init_pages("initrd memory", start, PAGE_ALIGN(end)); } #endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c973f8e2a6c..bca79091b9d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -25,11 +25,11 @@ #include <linux/pfn.h> #include <linux/poison.h> #include <linux/bootmem.h> -#include <linux/slab.h> #include <linux/proc_fs.h> #include <linux/memory_hotplug.h> #include <linux/initrd.h> #include <linux/cpumask.h> +#include <linux/gfp.h> #include <asm/asm.h> #include <asm/bios_ebda.h> @@ -241,6 +241,7 @@ kernel_physical_mapping_init(unsigned long start, unsigned long page_size_mask) { int use_pse = page_size_mask == (1<<PG_LEVEL_2M); + unsigned long last_map_addr = end; unsigned long start_pfn, end_pfn; pgd_t *pgd_base = swapper_pg_dir; int pgd_idx, pmd_idx, pte_ofs; @@ -341,9 +342,10 @@ repeat: prot = PAGE_KERNEL_EXEC; pages_4k++; - if (mapping_iter == 1) + if (mapping_iter == 1) { set_pte(pte, pfn_pte(pfn, init_prot)); - else + last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE; + } else set_pte(pte, pfn_pte(pfn, prot)); } } @@ -368,7 +370,7 @@ repeat: mapping_iter = 2; goto repeat; } - return 0; + return last_map_addr; } pte_t *kmap_pte; @@ -748,6 +750,7 @@ static void __init zone_sizes_init(void) free_area_init_nodes(max_zone_pfns); } +#ifndef CONFIG_NO_BOOTMEM static unsigned long __init setup_node_bootmem(int nodeid, unsigned long start_pfn, unsigned long end_pfn, @@ -764,13 +767,14 @@ static unsigned long __init setup_node_bootmem(int nodeid, printk(KERN_INFO " node %d bootmap %08lx - %08lx\n", nodeid, bootmap, bootmap + bootmap_size); free_bootmem_with_active_regions(nodeid, end_pfn); - early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); return bootmap + bootmap_size; } +#endif void __init setup_bootmem_allocator(void) { +#ifndef CONFIG_NO_BOOTMEM int nodeid; unsigned long bootmap_size, bootmap; /* @@ -782,11 +786,13 @@ void __init setup_bootmem_allocator(void) if (bootmap == -1L) panic("Cannot find bootmem map of size %ld\n", bootmap_size); reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); +#endif printk(KERN_INFO " mapped low ram: 0 - %08lx\n", max_pfn_mapped<<PAGE_SHIFT); printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); +#ifndef CONFIG_NO_BOOTMEM for_each_online_node(nodeid) { unsigned long start_pfn, end_pfn; @@ -804,6 +810,7 @@ void __init setup_bootmem_allocator(void) bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn, bootmap); } +#endif after_bootmem = 1; } @@ -892,8 +899,7 @@ void __init mem_init(void) reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10, - (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) - ); + totalhigh_pages << (PAGE_SHIFT-10)); printk(KERN_INFO "virtual kernel memory layout:\n" " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5198b9bb34e..ee41bba315d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -29,6 +29,7 @@ #include <linux/module.h> #include <linux/memory_hotplug.h> #include <linux/nmi.h> +#include <linux/gfp.h> #include <asm/processor.h> #include <asm/bios_ebda.h> @@ -49,6 +50,7 @@ #include <asm/numa.h> #include <asm/cacheflush.h> #include <asm/init.h> +#include <linux/bootmem.h> static unsigned long dma_reserve __initdata; @@ -571,6 +573,7 @@ kernel_physical_mapping_init(unsigned long start, void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, int acpi, int k8) { +#ifndef CONFIG_NO_BOOTMEM unsigned long bootmap_size, bootmap; bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; @@ -578,13 +581,15 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, PAGE_SIZE); if (bootmap == -1L) panic("Cannot find bootmem map of size %ld\n", bootmap_size); + reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); /* don't touch min_low_pfn */ bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT, 0, end_pfn); e820_register_active_regions(0, start_pfn, end_pfn); free_bootmem_with_active_regions(0, end_pfn); - early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT); - reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); +#else + e820_register_active_regions(0, start_pfn, end_pfn); +#endif } #endif @@ -616,6 +621,21 @@ void __init paging_init(void) */ #ifdef CONFIG_MEMORY_HOTPLUG /* + * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need + * updating. + */ +static void update_end_of_memory_vars(u64 start, u64 size) +{ + unsigned long end_pfn = PFN_UP(start + size); + + if (end_pfn > max_pfn) { + max_pfn = end_pfn; + max_low_pfn = end_pfn; + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; + } +} + +/* * Memory is added always to NORMAL zone. This means you will never get * additional DMA/DMA32 memory. */ @@ -634,6 +654,9 @@ int arch_add_memory(int nid, u64 start, u64 size) ret = __add_pages(nid, zone, start_pfn, nr_pages); WARN_ON_ONCE(ret); + /* update max_pfn, max_low_pfn and high_memory */ + update_end_of_memory_vars(start, size); + return ret; } EXPORT_SYMBOL_GPL(arch_add_memory); @@ -955,7 +978,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) if (pmd_none(*pmd)) { pte_t entry; - p = vmemmap_alloc_block(PMD_SIZE, node); + p = vmemmap_alloc_block_buf(PMD_SIZE, node); if (!p) return -ENOMEM; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 03c75ffd5c2..5eb1ba74a3a 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -24,43 +24,6 @@ #include "physaddr.h" -int page_is_ram(unsigned long pagenr) -{ - resource_size_t addr, end; - int i; - - /* - * A special case is the first 4Kb of memory; - * This is a BIOS owned area, not kernel ram, but generally - * not listed as such in the E820 table. - */ - if (pagenr == 0) - return 0; - - /* - * Second special case: Some BIOSen report the PC BIOS - * area (640->1Mb) as ram even though it is not. - */ - if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) && - pagenr < (BIOS_END >> PAGE_SHIFT)) - return 0; - - for (i = 0; i < e820.nr_map; i++) { - /* - * Not usable memory: - */ - if (e820.map[i].type != E820_RAM) - continue; - addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT; - end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT; - - - if ((pagenr >= addr) && (pagenr < end)) - return 1; - } - return 0; -} - /* * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c index 4901d0dafda..af3b6c8a436 100644 --- a/arch/x86/mm/kmemcheck/error.c +++ b/arch/x86/mm/kmemcheck/error.c @@ -106,26 +106,25 @@ void kmemcheck_error_recall(void) switch (e->type) { case KMEMCHECK_ERROR_INVALID_ACCESS: - printk(KERN_ERR "WARNING: kmemcheck: Caught %d-bit read " - "from %s memory (%p)\n", + printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n", 8 * e->size, e->state < ARRAY_SIZE(desc) ? desc[e->state] : "(invalid shadow state)", (void *) e->address); - printk(KERN_INFO); + printk(KERN_WARNING); for (i = 0; i < SHADOW_COPY_SIZE; ++i) - printk("%02x", e->memory_copy[i]); - printk("\n"); + printk(KERN_CONT "%02x", e->memory_copy[i]); + printk(KERN_CONT "\n"); - printk(KERN_INFO); + printk(KERN_WARNING); for (i = 0; i < SHADOW_COPY_SIZE; ++i) { if (e->shadow_copy[i] < ARRAY_SIZE(short_desc)) - printk(" %c", short_desc[e->shadow_copy[i]]); + printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]); else - printk(" ?"); + printk(KERN_CONT " ?"); } - printk("\n"); - printk(KERN_INFO "%*c\n", 2 + 2 + printk(KERN_CONT "\n"); + printk(KERN_WARNING "%*c\n", 2 + 2 * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^'); break; case KMEMCHECK_ERROR_BUG: diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index 8cc18334414..b3b531a4f8e 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -337,7 +337,7 @@ bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size) if (!shadow) return true; - status = kmemcheck_shadow_test(shadow, size); + status = kmemcheck_shadow_test_all(shadow, size); return status == KMEMCHECK_SHADOW_INITIALIZED; } diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c index 3f66b82076a..aec124214d9 100644 --- a/arch/x86/mm/kmemcheck/shadow.c +++ b/arch/x86/mm/kmemcheck/shadow.c @@ -125,12 +125,12 @@ void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n) enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) { +#ifdef CONFIG_KMEMCHECK_PARTIAL_OK uint8_t *x; unsigned int i; x = shadow; -#ifdef CONFIG_KMEMCHECK_PARTIAL_OK /* * Make sure _some_ bytes are initialized. Gcc frequently generates * code to access neighboring bytes. @@ -139,13 +139,25 @@ enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) if (x[i] == KMEMCHECK_SHADOW_INITIALIZED) return x[i]; } + + return x[0]; #else + return kmemcheck_shadow_test_all(shadow, size); +#endif +} + +enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size) +{ + uint8_t *x; + unsigned int i; + + x = shadow; + /* All bytes must be initialized. */ for (i = 0; i < size; ++i) { if (x[i] != KMEMCHECK_SHADOW_INITIALIZED) return x[i]; } -#endif return x[0]; } diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h index af46d9ab9d8..ff0b2f70fbc 100644 --- a/arch/x86/mm/kmemcheck/shadow.h +++ b/arch/x86/mm/kmemcheck/shadow.h @@ -11,6 +11,8 @@ enum kmemcheck_shadow { void *kmemcheck_shadow_lookup(unsigned long address); enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size); +enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, + unsigned int size); void kmemcheck_shadow_set(void *shadow, unsigned int size); #endif diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index c0f6198565e..5d0e67fff1a 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -21,6 +21,7 @@ #include <linux/kdebug.h> #include <linux/mutex.h> #include <linux/io.h> +#include <linux/slab.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <linux/errno.h> @@ -538,14 +539,15 @@ static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) { struct die_args *arg = args; + unsigned long* dr6_p = (unsigned long *)ERR_PTR(arg->err); - if (val == DIE_DEBUG && (arg->err & DR_STEP)) - if (post_kmmio_handler(arg->err, arg->regs) == 1) { + if (val == DIE_DEBUG && (*dr6_p & DR_STEP)) + if (post_kmmio_handler(*dr6_p, arg->regs) == 1) { /* * Reset the BS bit in dr6 (pointed by args->err) to * denote completion of processing */ - (*(unsigned long *)ERR_PTR(arg->err)) &= ~DR_STEP; + *dr6_p &= ~DR_STEP; return NOTIFY_STOP; } diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index c8191defc38..1dab5194fd9 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -71,7 +71,7 @@ static int mmap_is_legacy(void) if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) + if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; @@ -96,7 +96,7 @@ static unsigned long mmap_rnd(void) static unsigned long mmap_base(void) { - unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; + unsigned long gap = rlimit(RLIMIT_STACK); if (gap < MIN_GAP) gap = MIN_GAP; diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 34a3291ca10..3adff7dcc14 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -26,6 +26,7 @@ #include <linux/module.h> #include <linux/debugfs.h> +#include <linux/slab.h> #include <linux/uaccess.h> #include <linux/io.h> #include <linux/version.h> diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index b20760ca724..809baaaf48b 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -418,7 +418,10 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, for_each_online_node(nid) { memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); + NODE_DATA(nid)->node_id = nid; +#ifndef CONFIG_NO_BOOTMEM NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; +#endif } setup_bootmem_allocator(); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 83bbc70d11b..8948f47fde0 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -163,30 +163,48 @@ static void * __init early_node_mem(int nodeid, unsigned long start, unsigned long end, unsigned long size, unsigned long align) { - unsigned long mem = find_e820_area(start, end, size, align); - void *ptr; + unsigned long mem; + /* + * put it on high as possible + * something will go with NODE_DATA + */ + if (start < (MAX_DMA_PFN<<PAGE_SHIFT)) + start = MAX_DMA_PFN<<PAGE_SHIFT; + if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) && + end > (MAX_DMA32_PFN<<PAGE_SHIFT)) + start = MAX_DMA32_PFN<<PAGE_SHIFT; + mem = find_e820_area(start, end, size, align); + if (mem != -1L) + return __va(mem); + + /* extend the search scope */ + end = max_pfn_mapped << PAGE_SHIFT; + if (end > (MAX_DMA32_PFN<<PAGE_SHIFT)) + start = MAX_DMA32_PFN<<PAGE_SHIFT; + else + start = MAX_DMA_PFN<<PAGE_SHIFT; + mem = find_e820_area(start, end, size, align); if (mem != -1L) return __va(mem); - ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); - if (ptr == NULL) { - printk(KERN_ERR "Cannot find %lu bytes in node %d\n", + printk(KERN_ERR "Cannot find %lu bytes in node %d\n", size, nodeid); - return NULL; - } - return ptr; + + return NULL; } /* Initialize bootmem allocator for a node */ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) { - unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; + unsigned long start_pfn, last_pfn, nodedata_phys; const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); - unsigned long bootmap_start, nodedata_phys; - void *bootmap; int nid; +#ifndef CONFIG_NO_BOOTMEM + unsigned long bootmap_start, bootmap_pages, bootmap_size; + void *bootmap; +#endif if (!end) return; @@ -200,7 +218,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) start = roundup(start, ZONE_ALIGN); - printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, + printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid, start, end); start_pfn = start >> PAGE_SHIFT; @@ -211,14 +229,21 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) if (node_data[nodeid] == NULL) return; nodedata_phys = __pa(node_data[nodeid]); + reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA"); printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, nodedata_phys + pgdat_size - 1); + nid = phys_to_nid(nodedata_phys); + if (nid != nodeid) + printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid); memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); - NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid]; + NODE_DATA(nodeid)->node_id = nodeid; NODE_DATA(nodeid)->node_start_pfn = start_pfn; NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; +#ifndef CONFIG_NO_BOOTMEM + NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid]; + /* * Find a place for the bootmem map * nodedata_phys could be on other nodes by alloc_bootmem, @@ -227,11 +252,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) * of alloc_bootmem, that could clash with reserved range */ bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); - nid = phys_to_nid(nodedata_phys); - if (nid == nodeid) - bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE); - else - bootmap_start = roundup(start, PAGE_SIZE); + bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE); /* * SMP_CACHE_BYTES could be enough, but init_bootmem_node like * to use that to align to PAGE_SIZE @@ -239,18 +260,13 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) bootmap = early_node_mem(nodeid, bootmap_start, end, bootmap_pages<<PAGE_SHIFT, PAGE_SIZE); if (bootmap == NULL) { - if (nodedata_phys < start || nodedata_phys >= end) { - /* - * only need to free it if it is from other node - * bootmem - */ - if (nid != nodeid) - free_bootmem(nodedata_phys, pgdat_size); - } + free_early(nodedata_phys, nodedata_phys + pgdat_size); node_data[nodeid] = NULL; return; } bootmap_start = __pa(bootmap); + reserve_early(bootmap_start, bootmap_start+(bootmap_pages<<PAGE_SHIFT), + "BOOTMAP"); bootmap_size = init_bootmem_node(NODE_DATA(nodeid), bootmap_start >> PAGE_SHIFT, @@ -259,31 +275,12 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", bootmap_start, bootmap_start + bootmap_size - 1, bootmap_pages); - - free_bootmem_with_active_regions(nodeid, end); - - /* - * convert early reserve to bootmem reserve earlier - * otherwise early_node_mem could use early reserved mem - * on previous node - */ - early_res_to_bootmem(start, end); - - /* - * in some case early_node_mem could use alloc_bootmem - * to get range on other node, don't reserve that again - */ - if (nid != nodeid) - printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid); - else - reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, - pgdat_size, BOOTMEM_DEFAULT); nid = phys_to_nid(bootmap_start); if (nid != nodeid) printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid); - else - reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, - bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT); + + free_bootmem_with_active_regions(nodeid, end); +#endif node_set_online(nodeid); } @@ -427,7 +424,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, * Calculate the number of big nodes that can be allocated as a result * of consolidating the remainder. */ - big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) / + big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / FAKE_NODE_MIN_SIZE; size &= FAKE_NODE_MIN_HASH_MASK; @@ -502,77 +499,99 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, } /* - * Splits num_nodes nodes up equally starting at node_start. The return value - * is the number of nodes split up and addr is adjusted to be at the end of the - * last node allocated. + * Returns the end address of a node so that there is at least `size' amount of + * non-reserved memory or `max_addr' is reached. */ -static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start, - int num_nodes) +static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) { - unsigned int big; - u64 size; - int i; + u64 end = start + size; - if (num_nodes <= 0) - return -1; - if (num_nodes > MAX_NUMNODES) - num_nodes = MAX_NUMNODES; - size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) / - num_nodes; - /* - * Calculate the number of big nodes that can be allocated as a result - * of consolidating the leftovers. - */ - big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) / - FAKE_NODE_MIN_SIZE; - - /* Round down to nearest FAKE_NODE_MIN_SIZE. */ - size &= FAKE_NODE_MIN_HASH_MASK; - if (!size) { - printk(KERN_ERR "Not enough memory for each node. " - "NUMA emulation disabled.\n"); - return -1; - } - - for (i = node_start; i < num_nodes + node_start; i++) { - u64 end = *addr + size; - - if (i < big) - end += FAKE_NODE_MIN_SIZE; - /* - * The final node can have the remaining system RAM. Other - * nodes receive roughly the same amount of available pages. - */ - if (i == num_nodes + node_start - 1) + while (end - start - e820_hole_size(start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > max_addr) { end = max_addr; - else - while (end - *addr - e820_hole_size(*addr, end) < - size) { - end += FAKE_NODE_MIN_SIZE; - if (end > max_addr) { - end = max_addr; - break; - } - } - if (setup_node_range(i, addr, end - *addr, max_addr) < 0) break; + } } - return i - node_start + 1; + return end; } /* - * Splits the remaining system RAM into chunks of size. The remaining memory is - * always assigned to a final node and can be asymmetric. Returns the number of - * nodes split. + * Sets up fake nodes of `size' interleaved over physical nodes ranging from + * `addr' to `max_addr'. The return value is the number of nodes allocated. */ -static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start, - u64 size) +static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) { - int i = node_start; - size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; - while (!setup_node_range(i++, addr, size, max_addr)) - ; - return i - node_start; + nodemask_t physnode_mask = NODE_MASK_NONE; + u64 min_size; + int ret = 0; + int i; + + if (!size) + return -1; + /* + * The limit on emulated nodes is MAX_NUMNODES, so the size per node is + * increased accordingly if the requested size is too small. This + * creates a uniform distribution of node sizes across the entire + * machine (but not necessarily over physical nodes). + */ + min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) / + MAX_NUMNODES; + min_size = max(min_size, FAKE_NODE_MIN_SIZE); + if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) + min_size = (min_size + FAKE_NODE_MIN_SIZE) & + FAKE_NODE_MIN_HASH_MASK; + if (size < min_size) { + pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", + size >> 20, min_size >> 20); + size = min_size; + } + size &= FAKE_NODE_MIN_HASH_MASK; + + for (i = 0; i < MAX_NUMNODES; i++) + if (physnodes[i].start != physnodes[i].end) + node_set(i, physnode_mask); + /* + * Fill physical nodes with fake nodes of size until there is no memory + * left on any of them. + */ + while (nodes_weight(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; + u64 end; + + end = find_end_of_node(physnodes[i].start, + physnodes[i].end, size); + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. + */ + if (physnodes[i].end - end - + e820_hole_size(end, physnodes[i].end) < size) + end = physnodes[i].end; + + /* + * Setup the fake node that will be allocated as bootmem + * later. If setup_node_range() returns non-zero, there + * is no more memory available on this physical node. + */ + if (setup_node_range(ret++, &physnodes[i].start, + end - physnodes[i].start, + physnodes[i].end) < 0) + node_clear(i, physnode_mask); + } + } + return ret; } /* @@ -582,87 +601,32 @@ static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start, static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn, int acpi, int k8) { - u64 size, addr = start_pfn << PAGE_SHIFT; + u64 addr = start_pfn << PAGE_SHIFT; u64 max_addr = last_pfn << PAGE_SHIFT; - int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; int num_phys_nodes; + int num_nodes; + int i; num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); /* - * If the numa=fake command-line is just a single number N, split the - * system RAM into N fake nodes. + * If the numa=fake command-line contains a 'M' or 'G', it represents + * the fixed node size. Otherwise, if it is just a single number N, + * split the system RAM into N fake nodes. */ - if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { - long n = simple_strtol(cmdline, NULL, 0); - - num_nodes = split_nodes_interleave(addr, max_addr, - num_phys_nodes, n); - if (num_nodes < 0) - return num_nodes; - goto out; - } + if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) { + u64 size; - /* Parse the command line. */ - for (coeff_flag = 0; ; cmdline++) { - if (*cmdline && isdigit(*cmdline)) { - num = num * 10 + *cmdline - '0'; - continue; - } - if (*cmdline == '*') { - if (num > 0) - coeff = num; - coeff_flag = 1; - } - if (!*cmdline || *cmdline == ',') { - if (!coeff_flag) - coeff = 1; - /* - * Round down to the nearest FAKE_NODE_MIN_SIZE. - * Command-line coefficients are in megabytes. - */ - size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; - if (size) - for (i = 0; i < coeff; i++, num_nodes++) - if (setup_node_range(num_nodes, &addr, - size, max_addr) < 0) - goto done; - if (!*cmdline) - break; - coeff_flag = 0; - coeff = -1; - } - num = 0; - } -done: - if (!num_nodes) - return -1; - /* Fill remainder of system RAM, if appropriate. */ - if (addr < max_addr) { - if (coeff_flag && coeff < 0) { - /* Split remaining nodes into num-sized chunks */ - num_nodes += split_nodes_by_size(&addr, max_addr, - num_nodes, num); - goto out; - } - switch (*(cmdline - 1)) { - case '*': - /* Split remaining nodes into coeff chunks */ - if (coeff <= 0) - break; - num_nodes += split_nodes_equally(&addr, max_addr, - num_nodes, coeff); - break; - case ',': - /* Do not allocate remaining system RAM */ - break; - default: - /* Give one final node */ - setup_node_range(num_nodes, &addr, max_addr - addr, - max_addr); - num_nodes++; - } + size = memparse(cmdline, &cmdline); + num_nodes = split_nodes_size_interleave(addr, max_addr, size); + } else { + unsigned long n; + + n = simple_strtoul(cmdline, NULL, 0); + num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n); } -out: + + if (num_nodes < 0) + return num_nodes; memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); if (memnode_shift < 0) { memnode_shift = 0; @@ -742,6 +706,10 @@ unsigned long __init numa_free_all_bootmem(void) for_each_online_node(i) pages += free_all_bootmem_node(NODE_DATA(i)); +#ifdef CONFIG_NO_BOOTMEM + pages += free_all_memory_core_early(MAX_NUMNODES); +#endif + return pages; } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1d4eb93d333..28195c350b9 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -6,13 +6,13 @@ #include <linux/bootmem.h> #include <linux/module.h> #include <linux/sched.h> -#include <linux/slab.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/seq_file.h> #include <linux/debugfs.h> #include <linux/pfn.h> #include <linux/percpu.h> +#include <linux/gfp.h> #include <asm/e820.h> #include <asm/processor.h> @@ -291,8 +291,29 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, */ if (kernel_set_to_readonly && within(address, (unsigned long)_text, - (unsigned long)__end_rodata_hpage_align)) - pgprot_val(forbidden) |= _PAGE_RW; + (unsigned long)__end_rodata_hpage_align)) { + unsigned int level; + + /* + * Don't enforce the !RW mapping for the kernel text mapping, + * if the current mapping is already using small page mapping. + * No need to work hard to preserve large page mappings in this + * case. + * + * This also fixes the Linux Xen paravirt guest boot failure + * (because of unexpected read-only mappings for kernel identity + * mappings). In this paravirt guest case, the kernel text + * mapping and the kernel identity mapping share the same + * page-table pages. Thus we can't really use different + * protections for the kernel text and identity mappings. Also, + * these shared mappings are made of small page mappings. + * Thus this don't enforce !RW mapping for small page kernel + * text mapping logic will help Linux Xen parvirt guest boot + * aswell. + */ + if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) + pgprot_val(forbidden) |= _PAGE_RW; + } #endif prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index ae9648eb1c7..edc8b95afc1 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -12,7 +12,7 @@ #include <linux/debugfs.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/gfp.h> +#include <linux/slab.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/rbtree.h> diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index ed34f5e3599..5c4ee422590 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -1,4 +1,5 @@ #include <linux/mm.h> +#include <linux/gfp.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> #include <asm/tlb.h> @@ -6,6 +7,14 @@ #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO +#ifdef CONFIG_HIGHPTE +#define PGALLOC_USER_GFP __GFP_HIGHMEM +#else +#define PGALLOC_USER_GFP 0 +#endif + +gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP; + pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { return (pte_t *)__get_free_page(PGALLOC_GFP); @@ -15,16 +24,29 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { struct page *pte; -#ifdef CONFIG_HIGHPTE - pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0); -#else - pte = alloc_pages(PGALLOC_GFP, 0); -#endif + pte = alloc_pages(__userpte_alloc_gfp, 0); if (pte) pgtable_page_ctor(pte); return pte; } +static int __init setup_userpte(char *arg) +{ + if (!arg) + return -EINVAL; + + /* + * "userpte=nohigh" disables allocation of user pagetables in + * high memory. + */ + if (strcmp(arg, "nohigh") == 0) + __userpte_alloc_gfp &= ~__GFP_HIGHMEM; + else + return -EINVAL; + return 0; +} +early_param("userpte", setup_userpte); + void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pgtable_page_dtor(pte); diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 46c8834aedc..1a8faf09afe 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -6,7 +6,6 @@ #include <linux/swap.h> #include <linux/smp.h> #include <linux/highmem.h> -#include <linux/slab.h> #include <linux/pagemap.h> #include <linux/spinlock.h> #include <linux/module.h> diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index a27124185fc..28c68762648 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -229,9 +229,11 @@ update_nodes_add(int node, unsigned long start, unsigned long end) printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); } - if (changed) + if (changed) { + node_set(node, cpu_nodes_parsed); printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); + } } /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 65b58e4b0b8..426f3a1a64d 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -41,7 +41,7 @@ union smp_flush_state { struct { struct mm_struct *flush_mm; unsigned long flush_va; - spinlock_t tlbstate_lock; + raw_spinlock_t tlbstate_lock; DECLARE_BITMAP(flush_cpumask, NR_CPUS); }; char pad[INTERNODE_CACHE_BYTES]; @@ -181,7 +181,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is * probably not worth checking this for a cache-hot lock. */ - spin_lock(&f->tlbstate_lock); + raw_spin_lock(&f->tlbstate_lock); f->flush_mm = mm; f->flush_va = va; @@ -199,7 +199,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, f->flush_mm = NULL; f->flush_va = 0; - spin_unlock(&f->tlbstate_lock); + raw_spin_unlock(&f->tlbstate_lock); } void native_flush_tlb_others(const struct cpumask *cpumask, @@ -223,7 +223,7 @@ static int __cpuinit init_smp_flush(void) int i; for (i = 0; i < ARRAY_SIZE(flush_state); i++) - spin_lock_init(&flush_state[i].tlbstate_lock); + raw_spin_lock_init(&flush_state[i].tlbstate_lock); return 0; } diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index cb88b1a0bd5..2c505ee7101 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -159,7 +159,7 @@ static int nmi_setup_mux(void) for_each_possible_cpu(i) { per_cpu(cpu_msrs, i).multiplex = - kmalloc(multiplex_size, GFP_KERNEL); + kzalloc(multiplex_size, GFP_KERNEL); if (!per_cpu(cpu_msrs, i).multiplex) return 0; } @@ -179,7 +179,6 @@ static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) if (counter_config[i].enabled) { multiplex[i].saved = -(u64)counter_config[i].count; } else { - multiplex[i].addr = 0; multiplex[i].saved = 0; } } @@ -189,25 +188,27 @@ static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) { + struct op_msr *counters = msrs->counters; struct op_msr *multiplex = msrs->multiplex; int i; for (i = 0; i < model->num_counters; ++i) { int virt = op_x86_phys_to_virt(i); - if (multiplex[virt].addr) - rdmsrl(multiplex[virt].addr, multiplex[virt].saved); + if (counters[i].addr) + rdmsrl(counters[i].addr, multiplex[virt].saved); } } static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) { + struct op_msr *counters = msrs->counters; struct op_msr *multiplex = msrs->multiplex; int i; for (i = 0; i < model->num_counters; ++i) { int virt = op_x86_phys_to_virt(i); - if (multiplex[virt].addr) - wrmsrl(multiplex[virt].addr, multiplex[virt].saved); + if (counters[i].addr) + wrmsrl(counters[i].addr, multiplex[virt].saved); } } @@ -222,7 +223,7 @@ static void nmi_cpu_switch(void *dummy) /* move to next set */ si += model->num_counters; - if ((si > model->num_virt_counters) || (counter_config[si].count == 0)) + if ((si >= model->num_virt_counters) || (counter_config[si].count == 0)) per_cpu(switch_index, cpu) = 0; else per_cpu(switch_index, cpu) = si; @@ -303,11 +304,11 @@ static int allocate_msrs(void) int i; for_each_possible_cpu(i) { - per_cpu(cpu_msrs, i).counters = kmalloc(counters_size, + per_cpu(cpu_msrs, i).counters = kzalloc(counters_size, GFP_KERNEL); if (!per_cpu(cpu_msrs, i).counters) return 0; - per_cpu(cpu_msrs, i).controls = kmalloc(controls_size, + per_cpu(cpu_msrs, i).controls = kzalloc(controls_size, GFP_KERNEL); if (!per_cpu(cpu_msrs, i).controls) return 0; @@ -598,6 +599,7 @@ static int __init ppro_init(char **cpu_type) case 15: case 23: *cpu_type = "i386/core_2"; break; + case 0x2e: case 26: spec = &op_arch_perfmon_spec; *cpu_type = "i386/core_i7"; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 39686c29f03..090cbbec7db 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -22,6 +22,9 @@ #include <asm/ptrace.h> #include <asm/msr.h> #include <asm/nmi.h> +#include <asm/apic.h> +#include <asm/processor.h> +#include <asm/cpufeature.h> #include "op_x86_model.h" #include "op_counter.h" @@ -43,23 +46,10 @@ static unsigned long reset_value[NUM_VIRT_COUNTERS]; -#ifdef CONFIG_OPROFILE_IBS - -/* IbsFetchCtl bits/masks */ -#define IBS_FETCH_RAND_EN (1ULL<<57) -#define IBS_FETCH_VAL (1ULL<<49) -#define IBS_FETCH_ENABLE (1ULL<<48) -#define IBS_FETCH_CNT_MASK 0xFFFF0000ULL - -/*IbsOpCtl bits */ -#define IBS_OP_CNT_CTL (1ULL<<19) -#define IBS_OP_VAL (1ULL<<18) -#define IBS_OP_ENABLE (1ULL<<17) - #define IBS_FETCH_SIZE 6 #define IBS_OP_SIZE 12 -static int has_ibs; /* AMD Family10h and later */ +static u32 ibs_caps; struct op_ibs_config { unsigned long op_enabled; @@ -71,24 +61,52 @@ struct op_ibs_config { }; static struct op_ibs_config ibs_config; +static u64 ibs_op_ctl; -#endif +/* + * IBS cpuid feature detection + */ -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX +#define IBS_CPUID_FEATURES 0x8000001b + +/* + * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but + * bit 0 is used to indicate the existence of IBS. + */ +#define IBS_CAPS_AVAIL (1LL<<0) +#define IBS_CAPS_RDWROPCNT (1LL<<3) +#define IBS_CAPS_OPCNT (1LL<<4) -static void op_mux_fill_in_addresses(struct op_msrs * const msrs) +/* + * IBS randomization macros + */ +#define IBS_RANDOM_BITS 12 +#define IBS_RANDOM_MASK ((1ULL << IBS_RANDOM_BITS) - 1) +#define IBS_RANDOM_MAXCNT_OFFSET (1ULL << (IBS_RANDOM_BITS - 5)) + +static u32 get_ibs_caps(void) { - int i; + u32 ibs_caps; + unsigned int max_level; - for (i = 0; i < NUM_VIRT_COUNTERS; i++) { - int hw_counter = op_x86_virt_to_phys(i); - if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) - msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter; - else - msrs->multiplex[i].addr = 0; - } + if (!boot_cpu_has(X86_FEATURE_IBS)) + return 0; + + /* check IBS cpuid feature flags */ + max_level = cpuid_eax(0x80000000); + if (max_level < IBS_CPUID_FEATURES) + return IBS_CAPS_AVAIL; + + ibs_caps = cpuid_eax(IBS_CPUID_FEATURES); + if (!(ibs_caps & IBS_CAPS_AVAIL)) + /* cpuid flags not valid */ + return IBS_CAPS_AVAIL; + + return ibs_caps; } +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, struct op_msrs const * const msrs) { @@ -98,7 +116,7 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, /* enable active counters */ for (i = 0; i < NUM_COUNTERS; ++i) { int virt = op_x86_phys_to_virt(i); - if (!counter_config[virt].enabled) + if (!reset_value[virt]) continue; rdmsrl(msrs->controls[i].addr, val); val &= model->reserved; @@ -107,10 +125,6 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, } } -#else - -static inline void op_mux_fill_in_addresses(struct op_msrs * const msrs) { } - #endif /* functions for op_amd_spec */ @@ -122,18 +136,12 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) for (i = 0; i < NUM_COUNTERS; i++) { if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; - else - msrs->counters[i].addr = 0; } for (i = 0; i < NUM_CONTROLS; i++) { if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; - else - msrs->controls[i].addr = 0; } - - op_mux_fill_in_addresses(msrs); } static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, @@ -144,7 +152,8 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, /* setup reset_value */ for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { - if (counter_config[i].enabled) + if (counter_config[i].enabled + && msrs->counters[op_x86_virt_to_phys(i)].addr) reset_value[i] = counter_config[i].count; else reset_value[i] = 0; @@ -152,9 +161,18 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, /* clear all counters */ for (i = 0; i < NUM_CONTROLS; ++i) { - if (unlikely(!msrs->controls[i].addr)) + if (unlikely(!msrs->controls[i].addr)) { + if (counter_config[i].enabled && !smp_processor_id()) + /* + * counter is reserved, this is on all + * cpus, so report only for cpu #0 + */ + op_x86_warn_reserved(i); continue; + } rdmsrl(msrs->controls[i].addr, val); + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) + op_x86_warn_in_use(i); val &= model->reserved; wrmsrl(msrs->controls[i].addr, val); } @@ -169,9 +187,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, /* enable active counters */ for (i = 0; i < NUM_COUNTERS; ++i) { int virt = op_x86_phys_to_virt(i); - if (!counter_config[virt].enabled) - continue; - if (!msrs->counters[i].addr) + if (!reset_value[virt]) continue; /* setup counter registers */ @@ -185,7 +201,60 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, } } -#ifdef CONFIG_OPROFILE_IBS +/* + * 16-bit Linear Feedback Shift Register (LFSR) + * + * 16 14 13 11 + * Feedback polynomial = X + X + X + X + 1 + */ +static unsigned int lfsr_random(void) +{ + static unsigned int lfsr_value = 0xF00D; + unsigned int bit; + + /* Compute next bit to shift in */ + bit = ((lfsr_value >> 0) ^ + (lfsr_value >> 2) ^ + (lfsr_value >> 3) ^ + (lfsr_value >> 5)) & 0x0001; + + /* Advance to next register value */ + lfsr_value = (lfsr_value >> 1) | (bit << 15); + + return lfsr_value; +} + +/* + * IBS software randomization + * + * The IBS periodic op counter is randomized in software. The lower 12 + * bits of the 20 bit counter are randomized. IbsOpCurCnt is + * initialized with a 12 bit random value. + */ +static inline u64 op_amd_randomize_ibs_op(u64 val) +{ + unsigned int random = lfsr_random(); + + if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) + /* + * Work around if the hw can not write to IbsOpCurCnt + * + * Randomize the lower 8 bits of the 16 bit + * IbsOpMaxCnt [15:0] value in the range of -128 to + * +127 by adding/subtracting an offset to the + * maximum count (IbsOpMaxCnt). + * + * To avoid over or underflows and protect upper bits + * starting at bit 16, the initial value for + * IbsOpMaxCnt must fit in the range from 0x0081 to + * 0xff80. + */ + val += (s8)(random >> 4); + else + val |= (u64)(random & IBS_RANDOM_MASK) << 32; + + return val; +} static inline void op_amd_handle_ibs(struct pt_regs * const regs, @@ -194,7 +263,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, u64 val, ctl; struct op_entry entry; - if (!has_ibs) + if (!ibs_caps) return; if (ibs_config.fetch_enabled) { @@ -210,7 +279,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, oprofile_write_commit(&entry); /* reenable the IRQ */ - ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT_MASK); + ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT); ctl |= IBS_FETCH_ENABLE; wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl); } @@ -236,8 +305,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, oprofile_write_commit(&entry); /* reenable the IRQ */ - ctl &= ~IBS_OP_VAL & 0xFFFFFFFF; - ctl |= IBS_OP_ENABLE; + ctl = op_amd_randomize_ibs_op(ibs_op_ctl); wrmsrl(MSR_AMD64_IBSOPCTL, ctl); } } @@ -246,41 +314,57 @@ op_amd_handle_ibs(struct pt_regs * const regs, static inline void op_amd_start_ibs(void) { u64 val; - if (has_ibs && ibs_config.fetch_enabled) { - val = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; + + if (!ibs_caps) + return; + + if (ibs_config.fetch_enabled) { + val = (ibs_config.max_cnt_fetch >> 4) & IBS_FETCH_MAX_CNT; val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; val |= IBS_FETCH_ENABLE; wrmsrl(MSR_AMD64_IBSFETCHCTL, val); } - if (has_ibs && ibs_config.op_enabled) { - val = (ibs_config.max_cnt_op >> 4) & 0xFFFF; - val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0; - val |= IBS_OP_ENABLE; + if (ibs_config.op_enabled) { + ibs_op_ctl = ibs_config.max_cnt_op >> 4; + if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) { + /* + * IbsOpCurCnt not supported. See + * op_amd_randomize_ibs_op() for details. + */ + ibs_op_ctl = clamp(ibs_op_ctl, 0x0081ULL, 0xFF80ULL); + } else { + /* + * The start value is randomized with a + * positive offset, we need to compensate it + * with the half of the randomized range. Also + * avoid underflows. + */ + ibs_op_ctl = min(ibs_op_ctl + IBS_RANDOM_MAXCNT_OFFSET, + IBS_OP_MAX_CNT); + } + if (ibs_caps & IBS_CAPS_OPCNT && ibs_config.dispatched_ops) + ibs_op_ctl |= IBS_OP_CNT_CTL; + ibs_op_ctl |= IBS_OP_ENABLE; + val = op_amd_randomize_ibs_op(ibs_op_ctl); wrmsrl(MSR_AMD64_IBSOPCTL, val); } } static void op_amd_stop_ibs(void) { - if (has_ibs && ibs_config.fetch_enabled) + if (!ibs_caps) + return; + + if (ibs_config.fetch_enabled) /* clear max count and enable */ wrmsrl(MSR_AMD64_IBSFETCHCTL, 0); - if (has_ibs && ibs_config.op_enabled) + if (ibs_config.op_enabled) /* clear max count and enable */ wrmsrl(MSR_AMD64_IBSOPCTL, 0); } -#else - -static inline void op_amd_handle_ibs(struct pt_regs * const regs, - struct op_msrs const * const msrs) { } -static inline void op_amd_start_ibs(void) { } -static inline void op_amd_stop_ibs(void) { } - -#endif - static int op_amd_check_ctrs(struct pt_regs * const regs, struct op_msrs const * const msrs) { @@ -314,7 +398,7 @@ static void op_amd_start(struct op_msrs const * const msrs) if (!reset_value[op_x86_phys_to_virt(i)]) continue; rdmsrl(msrs->controls[i].addr, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(msrs->controls[i].addr, val); } @@ -334,7 +418,7 @@ static void op_amd_stop(struct op_msrs const * const msrs) if (!reset_value[op_x86_phys_to_virt(i)]) continue; rdmsrl(msrs->controls[i].addr, val); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(msrs->controls[i].addr, val); } @@ -355,8 +439,6 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) } } -#ifdef CONFIG_OPROFILE_IBS - static u8 ibs_eilvt_off; static inline void apic_init_ibs_nmi_per_cpu(void *arg) @@ -405,45 +487,36 @@ static int init_ibs_nmi(void) return 1; } -#ifdef CONFIG_NUMA - /* Sanity check */ - /* Works only for 64bit with proper numa implementation. */ - if (nodes != num_possible_nodes()) { - printk(KERN_DEBUG "Failed to setup CPU node(s) for IBS, " - "found: %d, expected %d", - nodes, num_possible_nodes()); - return 1; - } -#endif return 0; } /* uninitialize the APIC for the IBS interrupts if needed */ static void clear_ibs_nmi(void) { - if (has_ibs) + if (ibs_caps) on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); } /* initialize the APIC for the IBS interrupts if available */ static void ibs_init(void) { - has_ibs = boot_cpu_has(X86_FEATURE_IBS); + ibs_caps = get_ibs_caps(); - if (!has_ibs) + if (!ibs_caps) return; if (init_ibs_nmi()) { - has_ibs = 0; + ibs_caps = 0; return; } - printk(KERN_INFO "oprofile: AMD IBS detected\n"); + printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", + (unsigned)ibs_caps); } static void ibs_exit(void) { - if (!has_ibs) + if (!ibs_caps) return; clear_ibs_nmi(); @@ -463,7 +536,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) if (ret) return ret; - if (!has_ibs) + if (!ibs_caps) return ret; /* model specific files */ @@ -473,7 +546,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) ibs_config.fetch_enabled = 0; ibs_config.max_cnt_op = 250000; ibs_config.op_enabled = 0; - ibs_config.dispatched_ops = 1; + ibs_config.dispatched_ops = 0; dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); oprofilefs_create_ulong(sb, dir, "enable", @@ -488,8 +561,9 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) &ibs_config.op_enabled); oprofilefs_create_ulong(sb, dir, "max_count", &ibs_config.max_cnt_op); - oprofilefs_create_ulong(sb, dir, "dispatched_ops", - &ibs_config.dispatched_ops); + if (ibs_caps & IBS_CAPS_OPCNT) + oprofilefs_create_ulong(sb, dir, "dispatched_ops", + &ibs_config.dispatched_ops); return 0; } @@ -507,19 +581,6 @@ static void op_amd_exit(void) ibs_exit(); } -#else - -/* no IBS support */ - -static int op_amd_init(struct oprofile_operations *ops) -{ - return 0; -} - -static void op_amd_exit(void) {} - -#endif /* CONFIG_OPROFILE_IBS */ - struct op_x86_model_spec op_amd_spec = { .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index ac6b354becd..e6a160a4684 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -394,12 +394,6 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs) setup_num_counters(); stag = get_stagger(); - /* initialize some registers */ - for (i = 0; i < num_counters; ++i) - msrs->counters[i].addr = 0; - for (i = 0; i < num_controls; ++i) - msrs->controls[i].addr = 0; - /* the counter & cccr registers we pay attention to */ for (i = 0; i < num_counters; ++i) { addr = p4_counters[VIRT_CTR(stag, i)].counter_address; diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 8eb05878554..2bf90fafa7b 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -37,15 +37,11 @@ static void ppro_fill_in_addresses(struct op_msrs * const msrs) for (i = 0; i < num_counters; i++) { if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; - else - msrs->counters[i].addr = 0; } for (i = 0; i < num_counters; i++) { if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; - else - msrs->controls[i].addr = 0; } } @@ -57,7 +53,7 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, int i; if (!reset_value) { - reset_value = kmalloc(sizeof(reset_value[0]) * num_counters, + reset_value = kzalloc(sizeof(reset_value[0]) * num_counters, GFP_ATOMIC); if (!reset_value) return; @@ -82,9 +78,18 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, /* clear all counters */ for (i = 0; i < num_counters; ++i) { - if (unlikely(!msrs->controls[i].addr)) + if (unlikely(!msrs->controls[i].addr)) { + if (counter_config[i].enabled && !smp_processor_id()) + /* + * counter is reserved, this is on all + * cpus, so report only for cpu #0 + */ + op_x86_warn_reserved(i); continue; + } rdmsrl(msrs->controls[i].addr, val); + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) + op_x86_warn_in_use(i); val &= model->reserved; wrmsrl(msrs->controls[i].addr, val); } @@ -161,7 +166,7 @@ static void ppro_start(struct op_msrs const * const msrs) for (i = 0; i < num_counters; ++i) { if (reset_value[i]) { rdmsrl(msrs->controls[i].addr, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(msrs->controls[i].addr, val); } } @@ -179,7 +184,7 @@ static void ppro_stop(struct op_msrs const * const msrs) if (!reset_value[i]) continue; rdmsrl(msrs->controls[i].addr, val); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(msrs->controls[i].addr, val); } } diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 7b8e75d1608..ff82a755edd 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -57,6 +57,26 @@ struct op_x86_model_spec { struct op_counter_config; +static inline void op_x86_warn_in_use(int counter) +{ + /* + * The warning indicates an already running counter. If + * oprofile doesn't collect data, then try using a different + * performance counter on your platform to monitor the desired + * event. Delete counter #%d from the desired event by editing + * the /usr/share/oprofile/%s/<cpu>/events file. If the event + * cannot be monitored by any other counter, contact your + * hardware or BIOS vendor. + */ + pr_warning("oprofile: counter #%d on cpu #%d may already be used\n", + counter, smp_processor_id()); +} + +static inline void op_x86_warn_reserved(int counter) +{ + pr_warning("oprofile: counter #%d is already reserved\n", counter); +} + extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, struct op_counter_config *counter_config); extern int op_x86_phys_to_virt(int phys); diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 564b008a51c..b110d97fb92 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -13,9 +13,10 @@ obj-$(CONFIG_X86_VISWS) += visws.o obj-$(CONFIG_X86_NUMAQ) += numaq_32.o +obj-$(CONFIG_X86_MRST) += mrst.o + obj-y += common.o early.o -obj-y += amd_bus.o -obj-$(CONFIG_X86_64) += bus_numa.o intel_bus.o +obj-y += amd_bus.o bus_numa.o ifeq ($(CONFIG_PCI_DEBUG),y) EXTRA_CFLAGS += -DDEBUG diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 959e548a703..44f83ce0247 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -3,6 +3,7 @@ #include <linux/init.h> #include <linux/irq.h> #include <linux/dmi.h> +#include <linux/slab.h> #include <asm/numa.h> #include <asm/pci_x86.h> @@ -15,19 +16,94 @@ struct pci_root_info { int busnum; }; +static bool pci_use_crs = true; + +static int __init set_use_crs(const struct dmi_system_id *id) +{ + pci_use_crs = true; + return 0; +} + +static const struct dmi_system_id pci_use_crs_table[] __initconst = { + /* http://bugzilla.kernel.org/show_bug.cgi?id=14183 */ + { + .callback = set_use_crs, + .ident = "IBM System x3800", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_PRODUCT_NAME, "x3800"), + }, + }, + {} +}; + +void __init pci_acpi_crs_quirks(void) +{ + int year; + + if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && year < 2008) + pci_use_crs = false; + + dmi_check_system(pci_use_crs_table); + + /* + * If the user specifies "pci=use_crs" or "pci=nocrs" explicitly, that + * takes precedence over anything we figured out above. + */ + if (pci_probe & PCI_ROOT_NO_CRS) + pci_use_crs = false; + else if (pci_probe & PCI_USE__CRS) + pci_use_crs = true; + + printk(KERN_INFO "PCI: %s host bridge windows from ACPI; " + "if necessary, use \"pci=%s\" and report a bug\n", + pci_use_crs ? "Using" : "Ignoring", + pci_use_crs ? "nocrs" : "use_crs"); +} + static acpi_status resource_to_addr(struct acpi_resource *resource, struct acpi_resource_address64 *addr) { acpi_status status; - - status = acpi_resource_to_address64(resource, addr); - if (ACPI_SUCCESS(status) && - (addr->resource_type == ACPI_MEMORY_RANGE || - addr->resource_type == ACPI_IO_RANGE) && - addr->address_length > 0 && - addr->producer_consumer == ACPI_PRODUCER) { + struct acpi_resource_memory24 *memory24; + struct acpi_resource_memory32 *memory32; + struct acpi_resource_fixed_memory32 *fixed_memory32; + + memset(addr, 0, sizeof(*addr)); + switch (resource->type) { + case ACPI_RESOURCE_TYPE_MEMORY24: + memory24 = &resource->data.memory24; + addr->resource_type = ACPI_MEMORY_RANGE; + addr->minimum = memory24->minimum; + addr->address_length = memory24->address_length; + addr->maximum = addr->minimum + addr->address_length - 1; return AE_OK; + case ACPI_RESOURCE_TYPE_MEMORY32: + memory32 = &resource->data.memory32; + addr->resource_type = ACPI_MEMORY_RANGE; + addr->minimum = memory32->minimum; + addr->address_length = memory32->address_length; + addr->maximum = addr->minimum + addr->address_length - 1; + return AE_OK; + case ACPI_RESOURCE_TYPE_FIXED_MEMORY32: + fixed_memory32 = &resource->data.fixed_memory32; + addr->resource_type = ACPI_MEMORY_RANGE; + addr->minimum = fixed_memory32->address; + addr->address_length = fixed_memory32->address_length; + addr->maximum = addr->minimum + addr->address_length - 1; + return AE_OK; + case ACPI_RESOURCE_TYPE_ADDRESS16: + case ACPI_RESOURCE_TYPE_ADDRESS32: + case ACPI_RESOURCE_TYPE_ADDRESS64: + status = acpi_resource_to_address64(resource, addr); + if (ACPI_SUCCESS(status) && + (addr->resource_type == ACPI_MEMORY_RANGE || + addr->resource_type == ACPI_IO_RANGE) && + addr->address_length > 0) { + return AE_OK; + } + break; } return AE_ERROR; } @@ -45,20 +121,6 @@ count_resource(struct acpi_resource *acpi_res, void *data) return AE_OK; } -static int -bus_has_transparent_bridge(struct pci_bus *bus) -{ - struct pci_dev *dev; - - list_for_each_entry(dev, &bus->devices, bus_list) { - u16 class = dev->class >> 8; - - if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent) - return true; - } - return false; -} - static void align_resource(struct acpi_device *bridge, struct resource *res) { @@ -91,12 +153,8 @@ setup_resource(struct acpi_resource *acpi_res, void *data) struct acpi_resource_address64 addr; acpi_status status; unsigned long flags; - struct resource *root; - int max_root_bus_resources = PCI_BUS_NUM_RESOURCES; - u64 start, end; - - if (bus_has_transparent_bridge(info->bus)) - max_root_bus_resources -= 3; + struct resource *root, *conflict; + u64 start, end, max_len; status = resource_to_addr(acpi_res, &addr); if (!ACPI_SUCCESS(status)) @@ -113,17 +171,19 @@ setup_resource(struct acpi_resource *acpi_res, void *data) } else return AE_OK; + max_len = addr.maximum - addr.minimum + 1; + if (addr.address_length > max_len) { + dev_printk(KERN_DEBUG, &info->bridge->dev, + "host bridge window length %#llx doesn't fit in " + "%#llx-%#llx, trimming\n", + (unsigned long long) addr.address_length, + (unsigned long long) addr.minimum, + (unsigned long long) addr.maximum); + addr.address_length = max_len; + } + start = addr.minimum + addr.translation_offset; end = start + addr.address_length - 1; - if (info->res_num >= max_root_bus_resources) { - if (pci_probe & PCI_USE__CRS) - printk(KERN_WARNING "PCI: Failed to allocate " - "0x%lx-0x%lx from %s for %s due to _CRS " - "returning more than %d resource descriptors\n", - (unsigned long) start, (unsigned long) end, - root->name, info->name, max_root_bus_resources); - return AE_OK; - } res = &info->res[info->res_num]; res->name = info->name; @@ -133,17 +193,20 @@ setup_resource(struct acpi_resource *acpi_res, void *data) res->child = NULL; align_resource(info->bridge, res); - if (!(pci_probe & PCI_USE__CRS)) { + if (!pci_use_crs) { dev_printk(KERN_DEBUG, &info->bridge->dev, "host bridge window %pR (ignored)\n", res); return AE_OK; } - if (insert_resource(root, res)) { + conflict = insert_resource_conflict(root, res); + if (conflict) { dev_err(&info->bridge->dev, - "can't allocate host bridge window %pR\n", res); + "address space collision: host bridge window %pR " + "conflicts with %s %pR\n", + res, conflict->name, conflict); } else { - info->bus->resource[info->res_num] = res; + pci_bus_add_resource(info->bus, res, 0); info->res_num++; if (addr.translation_offset) dev_info(&info->bridge->dev, "host bridge window %pR " @@ -164,10 +227,8 @@ get_current_resources(struct acpi_device *device, int busnum, struct pci_root_info info; size_t size; - if (!(pci_probe & PCI_USE__CRS)) - dev_info(&device->dev, - "ignoring host bridge windows from ACPI; " - "boot with \"pci=use_crs\" to use them\n"); + if (pci_use_crs) + pci_bus_remove_resources(bus); info.bridge = device; info.bus = bus; @@ -282,17 +343,14 @@ int __init pci_acpi_init(void) { struct pci_dev *dev = NULL; - if (pcibios_scanned) - return 0; - if (acpi_noirq) - return 0; + return -ENODEV; printk(KERN_INFO "PCI: Using ACPI for IRQ routing\n"); acpi_irq_penalty_init(); - pcibios_scanned++; pcibios_enable_irq = acpi_pci_irq_enable; pcibios_disable_irq = acpi_pci_irq_disable; + x86_init.pci.init_irq = x86_init_noop; if (pci_routeirq) { /* diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 95ecbd49595..fc1e8fe07e5 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -2,11 +2,11 @@ #include <linux/pci.h> #include <linux/topology.h> #include <linux/cpu.h> +#include <linux/range.h> + #include <asm/pci_x86.h> -#ifdef CONFIG_X86_64 #include <asm/pci-direct.h> -#endif #include "bus_numa.h" @@ -15,60 +15,6 @@ * also get peer root bus resource for io,mmio */ -#ifdef CONFIG_X86_64 - -#define RANGE_NUM 16 - -struct res_range { - size_t start; - size_t end; -}; - -static void __init update_range(struct res_range *range, size_t start, - size_t end) -{ - int i; - int j; - - for (j = 0; j < RANGE_NUM; j++) { - if (!range[j].end) - continue; - - if (start <= range[j].start && end >= range[j].end) { - range[j].start = 0; - range[j].end = 0; - continue; - } - - if (start <= range[j].start && end < range[j].end && range[j].start < end + 1) { - range[j].start = end + 1; - continue; - } - - - if (start > range[j].start && end >= range[j].end && range[j].end > start - 1) { - range[j].end = start - 1; - continue; - } - - if (start > range[j].start && end < range[j].end) { - /* find the new spare */ - for (i = 0; i < RANGE_NUM; i++) { - if (range[i].end == 0) - break; - } - if (i < RANGE_NUM) { - range[i].end = range[j].end; - range[i].start = end + 1; - } else { - printk(KERN_ERR "run of slot in ranges\n"); - } - range[j].end = start - 1; - continue; - } - } -} - struct pci_hostbridge_probe { u32 bus; u32 slot; @@ -111,6 +57,8 @@ static void __init get_pci_mmcfg_amd_fam10h_range(void) fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1; } +#define RANGE_NUM 16 + /** * early_fill_mp_bus_to_node() * called before pcibios_scan_root and pci_scan_bus @@ -130,16 +78,17 @@ static int __init early_fill_mp_bus_info(void) struct pci_root_info *info; u32 reg; struct resource *res; - size_t start; - size_t end; - struct res_range range[RANGE_NUM]; + u64 start; + u64 end; + struct range range[RANGE_NUM]; u64 val; u32 address; + bool found; if (!early_pci_allowed()) return -1; - found_all_numa_early = 0; + found = false; for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { u32 id; u16 device; @@ -153,12 +102,12 @@ static int __init early_fill_mp_bus_info(void) device = (id>>16) & 0xffff; if (pci_probes[i].vendor == vendor && pci_probes[i].device == device) { - found_all_numa_early = 1; + found = true; break; } } - if (!found_all_numa_early) + if (!found) return 0; pci_root_num = 0; @@ -196,7 +145,7 @@ static int __init early_fill_mp_bus_info(void) def_link = (reg >> 8) & 0x03; memset(range, 0, sizeof(range)); - range[0].end = 0xffff; + add_range(range, RANGE_NUM, 0, 0, 0xffff + 1); /* io port resource */ for (i = 0; i < 4; i++) { reg = read_pci_config(bus, slot, 1, 0xc0 + (i << 3)); @@ -220,13 +169,13 @@ static int __init early_fill_mp_bus_info(void) info = &pci_root_info[j]; printk(KERN_DEBUG "node %d link %d: io port [%llx, %llx]\n", - node, link, (u64)start, (u64)end); + node, link, start, end); /* kernel only handle 16 bit only */ if (end > 0xffff) end = 0xffff; update_res(info, start, end, IORESOURCE_IO, 1); - update_range(range, start, end); + subtract_range(range, RANGE_NUM, start, end + 1); } /* add left over io port range to def node/link, [0, 0xffff] */ /* find the position */ @@ -241,29 +190,32 @@ static int __init early_fill_mp_bus_info(void) if (!range[i].end) continue; - update_res(info, range[i].start, range[i].end, + update_res(info, range[i].start, range[i].end - 1, IORESOURCE_IO, 1); } } memset(range, 0, sizeof(range)); /* 0xfd00000000-0xffffffffff for HT */ - range[0].end = (0xfdULL<<32) - 1; + end = cap_resource((0xfdULL<<32) - 1); + end++; + add_range(range, RANGE_NUM, 0, 0, end); /* need to take out [0, TOM) for RAM*/ address = MSR_K8_TOP_MEM1; rdmsrl(address, val); end = (val & 0xffffff800000ULL); - printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20); + printk(KERN_INFO "TOM: %016llx aka %lldM\n", end, end>>20); if (end < (1ULL<<32)) - update_range(range, 0, end - 1); + subtract_range(range, RANGE_NUM, 0, end); /* get mmconfig */ get_pci_mmcfg_amd_fam10h_range(); /* need to take out mmconf range */ if (fam10h_mmconf_end) { printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end); - update_range(range, fam10h_mmconf_start, fam10h_mmconf_end); + subtract_range(range, RANGE_NUM, fam10h_mmconf_start, + fam10h_mmconf_end + 1); } /* mmio resource */ @@ -293,7 +245,7 @@ static int __init early_fill_mp_bus_info(void) info = &pci_root_info[j]; printk(KERN_DEBUG "node %d link %d: mmio [%llx, %llx]", - node, link, (u64)start, (u64)end); + node, link, start, end); /* * some sick allocation would have range overlap with fam10h * mmconf range, so need to update start and end. @@ -318,14 +270,15 @@ static int __init early_fill_mp_bus_info(void) /* we got a hole */ endx = fam10h_mmconf_start - 1; update_res(info, start, endx, IORESOURCE_MEM, 0); - update_range(range, start, endx); - printk(KERN_CONT " ==> [%llx, %llx]", (u64)start, endx); + subtract_range(range, RANGE_NUM, start, + endx + 1); + printk(KERN_CONT " ==> [%llx, %llx]", start, endx); start = fam10h_mmconf_end + 1; changed = 1; } if (changed) { if (start <= end) { - printk(KERN_CONT " %s [%llx, %llx]", endx?"and":"==>", (u64)start, (u64)end); + printk(KERN_CONT " %s [%llx, %llx]", endx ? "and" : "==>", start, end); } else { printk(KERN_CONT "%s\n", endx?"":" ==> none"); continue; @@ -333,8 +286,9 @@ static int __init early_fill_mp_bus_info(void) } } - update_res(info, start, end, IORESOURCE_MEM, 1); - update_range(range, start, end); + update_res(info, cap_resource(start), cap_resource(end), + IORESOURCE_MEM, 1); + subtract_range(range, RANGE_NUM, start, end + 1); printk(KERN_CONT "\n"); } @@ -348,8 +302,8 @@ static int __init early_fill_mp_bus_info(void) address = MSR_K8_TOP_MEM2; rdmsrl(address, val); end = (val & 0xffffff800000ULL); - printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20); - update_range(range, 1ULL<<32, end - 1); + printk(KERN_INFO "TOM2: %016llx aka %lldM\n", end, end>>20); + subtract_range(range, RANGE_NUM, 1ULL<<32, end); } /* @@ -368,7 +322,8 @@ static int __init early_fill_mp_bus_info(void) if (!range[i].end) continue; - update_res(info, range[i].start, range[i].end, + update_res(info, cap_resource(range[i].start), + cap_resource(range[i].end - 1), IORESOURCE_MEM, 1); } } @@ -384,24 +339,14 @@ static int __init early_fill_mp_bus_info(void) info->bus_min, info->bus_max, info->node, info->link); for (j = 0; j < res_num; j++) { res = &info->res[j]; - printk(KERN_DEBUG "bus: %02x index %x %s: [%llx, %llx]\n", - busnum, j, - (res->flags & IORESOURCE_IO)?"io port":"mmio", - res->start, res->end); + printk(KERN_DEBUG "bus: %02x index %x %pR\n", + busnum, j, res); } } return 0; } -#else /* !CONFIG_X86_64 */ - -static int __init early_fill_mp_bus_info(void) { return 0; } - -#endif /* !CONFIG_X86_64 */ - -/* common 32/64 bit code */ - #define ENABLE_CF8_EXT_CFG (1ULL << 46) static void enable_pci_io_ecs(void *unused) diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c index 145df00e038..64a12288389 100644 --- a/arch/x86/pci/bus_numa.c +++ b/arch/x86/pci/bus_numa.c @@ -1,11 +1,11 @@ #include <linux/init.h> #include <linux/pci.h> +#include <linux/range.h> #include "bus_numa.h" int pci_root_num; struct pci_root_info pci_root_info[PCI_ROOT_NR]; -int found_all_numa_early; void x86_pci_root_bus_res_quirks(struct pci_bus *b) { @@ -21,10 +21,6 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b) if (!pci_root_num) return; - /* for amd, if only one root bus, don't need to do anything */ - if (pci_root_num < 2 && found_all_numa_early) - return; - for (i = 0; i < pci_root_num; i++) { if (pci_root_info[i].bus_min == b->number) break; @@ -36,13 +32,14 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b) printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n", b->number); + pci_bus_remove_resources(b); info = &pci_root_info[i]; for (j = 0; j < info->res_num; j++) { struct resource *res; struct resource *root; res = &info->res[j]; - b->resource[j] = res; + pci_bus_add_resource(b, res, 0); if (res->flags & IORESOURCE_IO) root = &ioport_resource; else @@ -51,8 +48,8 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b) } } -void __init update_res(struct pci_root_info *info, size_t start, - size_t end, unsigned long flags, int merge) +void __devinit update_res(struct pci_root_info *info, resource_size_t start, + resource_size_t end, unsigned long flags, int merge) { int i; struct resource *res; @@ -60,25 +57,28 @@ void __init update_res(struct pci_root_info *info, size_t start, if (start > end) return; + if (start == MAX_RESOURCE) + return; + if (!merge) goto addit; /* try to merge it with old one */ for (i = 0; i < info->res_num; i++) { - size_t final_start, final_end; - size_t common_start, common_end; + resource_size_t final_start, final_end; + resource_size_t common_start, common_end; res = &info->res[i]; if (res->flags != flags) continue; - common_start = max((size_t)res->start, start); - common_end = min((size_t)res->end, end); + common_start = max(res->start, start); + common_end = min(res->end, end); if (common_start > common_end + 1) continue; - final_start = min((size_t)res->start, start); - final_end = max((size_t)res->end, end); + final_start = min(res->start, start); + final_end = max(res->end, end); res->start = final_start; res->end = final_end; diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h index adbc23fe82a..804a4b40c31 100644 --- a/arch/x86/pci/bus_numa.h +++ b/arch/x86/pci/bus_numa.h @@ -1,9 +1,8 @@ -#ifdef CONFIG_X86_64 - +#ifndef __BUS_NUMA_H +#define __BUS_NUMA_H /* * sub bus (transparent) will use entres from 3 to store extra from - * root, so need to make sure we have enough slot there, Should we - * increase PCI_BUS_NUM_RESOURCES? + * root, so need to make sure we have enough slot there. */ #define RES_NUM 16 struct pci_root_info { @@ -20,8 +19,7 @@ struct pci_root_info { #define PCI_ROOT_NR 4 extern int pci_root_num; extern struct pci_root_info pci_root_info[PCI_ROOT_NR]; -extern int found_all_numa_early; -extern void update_res(struct pci_root_info *info, size_t start, - size_t end, unsigned long flags, int merge); +extern void update_res(struct pci_root_info *info, resource_size_t start, + resource_size_t end, unsigned long flags, int merge); #endif diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index d2552c68e94..cf2e93869c4 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -9,6 +9,7 @@ #include <linux/ioport.h> #include <linux/init.h> #include <linux/dmi.h> +#include <linux/slab.h> #include <asm/acpi.h> #include <asm/segment.h> @@ -72,12 +73,6 @@ struct pci_ops pci_root_ops = { }; /* - * legacy, numa, and acpi all want to call pcibios_scan_root - * from their initcalls. This flag prevents that. - */ -int pcibios_scanned; - -/* * This interrupt-safe spinlock protects all accesses to PCI * configuration space. */ @@ -520,6 +515,9 @@ char * __devinit pcibios_setup(char *str) } else if (!strcmp(str, "use_crs")) { pci_probe |= PCI_USE__CRS; return NULL; + } else if (!strcmp(str, "nocrs")) { + pci_probe |= PCI_ROOT_NO_CRS; + return NULL; } else if (!strcmp(str, "earlydump")) { pci_early_dump_regs = 1; return NULL; diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 5dc9e8c63fc..46fd43f7910 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -60,22 +60,20 @@ skip_isa_ioresource_align(struct pci_dev *dev) { * but we want to try to avoid allocating at 0x2900-0x2bff * which might have be mirrored at 0x0100-0x03ff.. */ -void -pcibios_align_resource(void *data, struct resource *res, +resource_size_t +pcibios_align_resource(void *data, const struct resource *res, resource_size_t size, resource_size_t align) { struct pci_dev *dev = data; + resource_size_t start = res->start; if (res->flags & IORESOURCE_IO) { - resource_size_t start = res->start; - if (skip_isa_ioresource_align(dev)) - return; - if (start & 0x300) { + return start; + if (start & 0x300) start = (start + 0x3ff) & ~0x3ff; - res->start = start; - } } + return start; } EXPORT_SYMBOL(pcibios_align_resource); @@ -129,9 +127,6 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) continue; if (!r->start || pci_claim_resource(dev, idx) < 0) { - dev_info(&dev->dev, - "can't reserve window %pR\n", - r); /* * Something is wrong with the region. * Invalidate the resource to prevent @@ -183,8 +178,6 @@ static void __init pcibios_allocate_resources(int pass) "BAR %d: reserving %pr (d=%d, p=%d)\n", idx, r, disabled, pass); if (pci_claim_resource(dev, idx) < 0) { - dev_info(&dev->dev, - "can't reserve %pR\n", r); /* We'll assign a new address later */ r->end -= r->start; r->start = 0; @@ -257,10 +250,6 @@ void __init pcibios_resource_survey(void) */ fs_initcall(pcibios_assign_resources); -void __weak x86_pci_root_bus_res_quirks(struct pci_bus *b) -{ -} - /* * If we set up a device for bus mastering, we need to check the latency * timer as certain crappy BIOSes forget to set it properly. diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c index 25a1f8efed4..adb62aaa7ec 100644 --- a/arch/x86/pci/init.c +++ b/arch/x86/pci/init.c @@ -1,6 +1,7 @@ #include <linux/pci.h> #include <linux/init.h> #include <asm/pci_x86.h> +#include <asm/x86_init.h> /* arch_initcall has too random ordering, so call the initializers in the right sequence from here. */ @@ -15,10 +16,9 @@ static __init int pci_arch_init(void) if (!(pci_probe & PCI_PROBE_NOEARLY)) pci_mmcfg_early_init(); -#ifdef CONFIG_PCI_OLPC - if (!pci_olpc_init()) - return 0; /* skip additional checks if it's an XO */ -#endif + if (x86_init.pci.arch_init && !x86_init.pci.arch_init()) + return 0; + #ifdef CONFIG_PCI_BIOS pci_pcbios_init(); #endif diff --git a/arch/x86/pci/intel_bus.c b/arch/x86/pci/intel_bus.c deleted file mode 100644 index b7a55dc55d1..00000000000 --- a/arch/x86/pci/intel_bus.c +++ /dev/null @@ -1,90 +0,0 @@ -/* - * to read io range from IOH pci conf, need to do it after mmconfig is there - */ - -#include <linux/delay.h> -#include <linux/dmi.h> -#include <linux/pci.h> -#include <linux/init.h> -#include <asm/pci_x86.h> - -#include "bus_numa.h" - -static inline void print_ioh_resources(struct pci_root_info *info) -{ - int res_num; - int busnum; - int i; - - printk(KERN_DEBUG "IOH bus: [%02x, %02x]\n", - info->bus_min, info->bus_max); - res_num = info->res_num; - busnum = info->bus_min; - for (i = 0; i < res_num; i++) { - struct resource *res; - - res = &info->res[i]; - printk(KERN_DEBUG "IOH bus: %02x index %x %s: [%llx, %llx]\n", - busnum, i, - (res->flags & IORESOURCE_IO) ? "io port" : - "mmio", - res->start, res->end); - } -} - -#define IOH_LIO 0x108 -#define IOH_LMMIOL 0x10c -#define IOH_LMMIOH 0x110 -#define IOH_LMMIOH_BASEU 0x114 -#define IOH_LMMIOH_LIMITU 0x118 -#define IOH_LCFGBUS 0x11c - -static void __devinit pci_root_bus_res(struct pci_dev *dev) -{ - u16 word; - u32 dword; - struct pci_root_info *info; - u16 io_base, io_end; - u32 mmiol_base, mmiol_end; - u64 mmioh_base, mmioh_end; - int bus_base, bus_end; - - if (pci_root_num >= PCI_ROOT_NR) { - printk(KERN_DEBUG "intel_bus.c: PCI_ROOT_NR is too small\n"); - return; - } - - info = &pci_root_info[pci_root_num]; - pci_root_num++; - - pci_read_config_word(dev, IOH_LCFGBUS, &word); - bus_base = (word & 0xff); - bus_end = (word & 0xff00) >> 8; - sprintf(info->name, "PCI Bus #%02x", bus_base); - info->bus_min = bus_base; - info->bus_max = bus_end; - - pci_read_config_word(dev, IOH_LIO, &word); - io_base = (word & 0xf0) << (12 - 4); - io_end = (word & 0xf000) | 0xfff; - update_res(info, io_base, io_end, IORESOURCE_IO, 0); - - pci_read_config_dword(dev, IOH_LMMIOL, &dword); - mmiol_base = (dword & 0xff00) << (24 - 8); - mmiol_end = (dword & 0xff000000) | 0xffffff; - update_res(info, mmiol_base, mmiol_end, IORESOURCE_MEM, 0); - - pci_read_config_dword(dev, IOH_LMMIOH, &dword); - mmioh_base = ((u64)(dword & 0xfc00)) << (26 - 10); - mmioh_end = ((u64)(dword & 0xfc000000) | 0x3ffffff); - pci_read_config_dword(dev, IOH_LMMIOH_BASEU, &dword); - mmioh_base |= ((u64)(dword & 0x7ffff)) << 32; - pci_read_config_dword(dev, IOH_LMMIOH_LIMITU, &dword); - mmioh_end |= ((u64)(dword & 0x7ffff)) << 32; - update_res(info, mmioh_base, mmioh_end, IORESOURCE_MEM, 0); - - print_ioh_resources(info); -} - -/* intel IOH */ -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x342e, pci_root_bus_res); diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 0696d506c4a..5d362b5ba06 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -8,7 +8,6 @@ #include <linux/kernel.h> #include <linux/pci.h> #include <linux/init.h> -#include <linux/slab.h> #include <linux/interrupt.h> #include <linux/dmi.h> #include <linux/io.h> @@ -53,7 +52,7 @@ struct irq_router_handler { int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device); }; -int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL; +int (*pcibios_enable_irq)(struct pci_dev *dev) = pirq_enable_irq; void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL; /* @@ -590,6 +589,8 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route case PCI_DEVICE_ID_INTEL_ICH10_1: case PCI_DEVICE_ID_INTEL_ICH10_2: case PCI_DEVICE_ID_INTEL_ICH10_3: + case PCI_DEVICE_ID_INTEL_CPT_LPC1: + case PCI_DEVICE_ID_INTEL_CPT_LPC2: r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; @@ -1016,7 +1017,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) return 1; } -static void __init pcibios_fixup_irqs(void) +void __init pcibios_fixup_irqs(void) { struct pci_dev *dev = NULL; u8 pin; @@ -1110,12 +1111,12 @@ static struct dmi_system_id __initdata pciirq_dmi_table[] = { { } }; -int __init pcibios_irq_init(void) +void __init pcibios_irq_init(void) { DBG(KERN_DEBUG "PCI: IRQ init\n"); - if (pcibios_enable_irq || raw_pci_ops == NULL) - return 0; + if (raw_pci_ops == NULL) + return; dmi_check_system(pciirq_dmi_table); @@ -1142,9 +1143,7 @@ int __init pcibios_irq_init(void) pirq_table = NULL; } - pcibios_enable_irq = pirq_enable_irq; - - pcibios_fixup_irqs(); + x86_init.pci.fixup_irqs(); if (io_apic_assign_pci_irqs && pci_routeirq) { struct pci_dev *dev = NULL; @@ -1157,8 +1156,6 @@ int __init pcibios_irq_init(void) for_each_pci_dev(dev) pirq_enable_irq(dev); } - - return 0; } static void pirq_penalize_isa_irq(int irq, int active) diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index 4061bb0f267..0db5eaf5456 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -35,16 +35,13 @@ static void __devinit pcibios_fixup_peer_bridges(void) } } -static int __init pci_legacy_init(void) +int __init pci_legacy_init(void) { if (!raw_pci_ops) { printk("PCI: System does not support PCI\n"); return 0; } - if (pcibios_scanned++) - return 0; - printk("PCI: Probing PCI hardware\n"); pci_root_bus = pcibios_scan_root(0); if (pci_root_bus) @@ -55,18 +52,15 @@ static int __init pci_legacy_init(void) int __init pci_subsys_init(void) { -#ifdef CONFIG_X86_NUMAQ - pci_numaq_init(); -#endif -#ifdef CONFIG_ACPI - pci_acpi_init(); -#endif -#ifdef CONFIG_X86_VISWS - pci_visws_init(); -#endif - pci_legacy_init(); + /* + * The init function returns an non zero value when + * pci_legacy_init should be invoked. + */ + if (x86_init.pci.init()) + pci_legacy_init(); + pcibios_fixup_peer_bridges(); - pcibios_irq_init(); + x86_init.pci.init_irq(); pcibios_init(); return 0; diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index b19d1e54201..39b9ebe8f88 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -16,6 +16,7 @@ #include <linux/sfi_acpi.h> #include <linux/bitmap.h> #include <linux/dmi.h> +#include <linux/slab.h> #include <asm/e820.h> #include <asm/pci_x86.h> #include <asm/acpi.h> @@ -303,22 +304,17 @@ static void __init pci_mmcfg_check_end_bus_number(void) { struct pci_mmcfg_region *cfg, *cfgx; - /* last one*/ - cfg = list_entry(pci_mmcfg_list.prev, typeof(*cfg), list); - if (cfg) - if (cfg->end_bus < cfg->start_bus) - cfg->end_bus = 255; - - if (list_is_singular(&pci_mmcfg_list)) - return; - - /* don't overlap please */ + /* Fixup overlaps */ list_for_each_entry(cfg, &pci_mmcfg_list, list) { if (cfg->end_bus < cfg->start_bus) cfg->end_bus = 255; + /* Don't access the list head ! */ + if (cfg->list.next == &pci_mmcfg_list) + break; + cfgx = list_entry(cfg->list.next, typeof(*cfg), list); - if (cfg != cfgx && cfg->end_bus >= cfgx->start_bus) + if (cfg->end_bus >= cfgx->start_bus) cfg->end_bus = cfgx->start_bus - 1; } } diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c new file mode 100644 index 00000000000..8bf2fcb88d0 --- /dev/null +++ b/arch/x86/pci/mrst.c @@ -0,0 +1,262 @@ +/* + * Moorestown PCI support + * Copyright (c) 2008 Intel Corporation + * Jesse Barnes <jesse.barnes@intel.com> + * + * Moorestown has an interesting PCI implementation: + * - configuration space is memory mapped (as defined by MCFG) + * - Lincroft devices also have a real, type 1 configuration space + * - Early Lincroft silicon has a type 1 access bug that will cause + * a hang if non-existent devices are accessed + * - some devices have the "fixed BAR" capability, which means + * they can't be relocated or modified; check for that during + * BAR sizing + * + * So, we use the MCFG space for all reads and writes, but also send + * Lincroft writes to type 1 space. But only read/write if the device + * actually exists, otherwise return all 1s for reads and bit bucket + * the writes. + */ + +#include <linux/sched.h> +#include <linux/pci.h> +#include <linux/ioport.h> +#include <linux/init.h> +#include <linux/dmi.h> + +#include <asm/acpi.h> +#include <asm/segment.h> +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/pci_x86.h> +#include <asm/hw_irq.h> +#include <asm/io_apic.h> + +#define PCIE_CAP_OFFSET 0x100 + +/* Fixed BAR fields */ +#define PCIE_VNDR_CAP_ID_FIXED_BAR 0x00 /* Fixed BAR (TBD) */ +#define PCI_FIXED_BAR_0_SIZE 0x04 +#define PCI_FIXED_BAR_1_SIZE 0x08 +#define PCI_FIXED_BAR_2_SIZE 0x0c +#define PCI_FIXED_BAR_3_SIZE 0x10 +#define PCI_FIXED_BAR_4_SIZE 0x14 +#define PCI_FIXED_BAR_5_SIZE 0x1c + +/** + * fixed_bar_cap - return the offset of the fixed BAR cap if found + * @bus: PCI bus + * @devfn: device in question + * + * Look for the fixed BAR cap on @bus and @devfn, returning its offset + * if found or 0 otherwise. + */ +static int fixed_bar_cap(struct pci_bus *bus, unsigned int devfn) +{ + int pos; + u32 pcie_cap = 0, cap_data; + + pos = PCIE_CAP_OFFSET; + + if (!raw_pci_ext_ops) + return 0; + + while (pos) { + if (raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number, + devfn, pos, 4, &pcie_cap)) + return 0; + + if (pcie_cap == 0xffffffff) + return 0; + + if (PCI_EXT_CAP_ID(pcie_cap) == PCI_EXT_CAP_ID_VNDR) { + raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number, + devfn, pos + 4, 4, &cap_data); + if ((cap_data & 0xffff) == PCIE_VNDR_CAP_ID_FIXED_BAR) + return pos; + } + + pos = pcie_cap >> 20; + } + + return 0; +} + +static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn, + int reg, int len, u32 val, int offset) +{ + u32 size; + unsigned int domain, busnum; + int bar = (reg - PCI_BASE_ADDRESS_0) >> 2; + + domain = pci_domain_nr(bus); + busnum = bus->number; + + if (val == ~0 && len == 4) { + unsigned long decode; + + raw_pci_ext_ops->read(domain, busnum, devfn, + offset + 8 + (bar * 4), 4, &size); + + /* Turn the size into a decode pattern for the sizing code */ + if (size) { + decode = size - 1; + decode |= decode >> 1; + decode |= decode >> 2; + decode |= decode >> 4; + decode |= decode >> 8; + decode |= decode >> 16; + decode++; + decode = ~(decode - 1); + } else { + decode = ~0; + } + + /* + * If val is all ones, the core code is trying to size the reg, + * so update the mmconfig space with the real size. + * + * Note: this assumes the fixed size we got is a power of two. + */ + return raw_pci_ext_ops->write(domain, busnum, devfn, reg, 4, + decode); + } + + /* This is some other kind of BAR write, so just do it. */ + return raw_pci_ext_ops->write(domain, busnum, devfn, reg, len, val); +} + +/** + * type1_access_ok - check whether to use type 1 + * @bus: bus number + * @devfn: device & function in question + * + * If the bus is on a Lincroft chip and it exists, or is not on a Lincroft at + * all, the we can go ahead with any reads & writes. If it's on a Lincroft, + * but doesn't exist, avoid the access altogether to keep the chip from + * hanging. + */ +static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg) +{ + /* This is a workaround for A0 LNC bug where PCI status register does + * not have new CAP bit set. can not be written by SW either. + * + * PCI header type in real LNC indicates a single function device, this + * will prevent probing other devices under the same function in PCI + * shim. Therefore, use the header type in shim instead. + */ + if (reg >= 0x100 || reg == PCI_STATUS || reg == PCI_HEADER_TYPE) + return 0; + if (bus == 0 && (devfn == PCI_DEVFN(2, 0) || devfn == PCI_DEVFN(0, 0))) + return 1; + return 0; /* langwell on others */ +} + +static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, + int size, u32 *value) +{ + if (type1_access_ok(bus->number, devfn, where)) + return pci_direct_conf1.read(pci_domain_nr(bus), bus->number, + devfn, where, size, value); + return raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number, + devfn, where, size, value); +} + +static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, + int size, u32 value) +{ + int offset; + + /* On MRST, there is no PCI ROM BAR, this will cause a subsequent read + * to ROM BAR return 0 then being ignored. + */ + if (where == PCI_ROM_ADDRESS) + return 0; + + /* + * Devices with fixed BARs need special handling: + * - BAR sizing code will save, write ~0, read size, restore + * - so writes to fixed BARs need special handling + * - other writes to fixed BAR devices should go through mmconfig + */ + offset = fixed_bar_cap(bus, devfn); + if (offset && + (where >= PCI_BASE_ADDRESS_0 && where <= PCI_BASE_ADDRESS_5)) { + return pci_device_update_fixed(bus, devfn, where, size, value, + offset); + } + + /* + * On Moorestown update both real & mmconfig space + * Note: early Lincroft silicon can't handle type 1 accesses to + * non-existent devices, so just eat the write in that case. + */ + if (type1_access_ok(bus->number, devfn, where)) + return pci_direct_conf1.write(pci_domain_nr(bus), bus->number, + devfn, where, size, value); + return raw_pci_ext_ops->write(pci_domain_nr(bus), bus->number, devfn, + where, size, value); +} + +static int mrst_pci_irq_enable(struct pci_dev *dev) +{ + u8 pin; + struct io_apic_irq_attr irq_attr; + + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + + /* MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to + * IOAPIC RTE entries, so we just enable RTE for the device. + */ + irq_attr.ioapic = mp_find_ioapic(dev->irq); + irq_attr.ioapic_pin = dev->irq; + irq_attr.trigger = 1; /* level */ + irq_attr.polarity = 1; /* active low */ + io_apic_set_pci_routing(&dev->dev, dev->irq, &irq_attr); + + return 0; +} + +struct pci_ops pci_mrst_ops = { + .read = pci_read, + .write = pci_write, +}; + +/** + * pci_mrst_init - installs pci_mrst_ops + * + * Moorestown has an interesting PCI implementation (see above). + * Called when the early platform detection installs it. + */ +int __init pci_mrst_init(void) +{ + printk(KERN_INFO "Moorestown platform detected, using MRST PCI ops\n"); + pci_mmcfg_late_init(); + pcibios_enable_irq = mrst_pci_irq_enable; + pci_root_ops = pci_mrst_ops; + /* Continue with standard init */ + return 1; +} + +/* + * Langwell devices reside at fixed offsets, don't try to move them. + */ +static void __devinit pci_fixed_bar_fixup(struct pci_dev *dev) +{ + unsigned long offset; + u32 size; + int i; + + /* Fixup the BAR sizes for fixed BAR devices and make them unmoveable */ + offset = fixed_bar_cap(dev->bus, dev->devfn); + if (!offset || PCI_DEVFN(2, 0) == dev->devfn || + PCI_DEVFN(2, 2) == dev->devfn) + return; + + for (i = 0; i < PCI_ROM_RESOURCE; i++) { + pci_read_config_dword(dev, offset + 8 + (i * 4), &size); + dev->resource[i].end = dev->resource[i].start + size - 1; + dev->resource[i].flags |= IORESOURCE_PCI_FIXED; + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_fixed_bar_fixup); diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c index 8eb295e116f..8223738ad80 100644 --- a/arch/x86/pci/numaq_32.c +++ b/arch/x86/pci/numaq_32.c @@ -8,9 +8,7 @@ #include <asm/apic.h> #include <asm/mpspec.h> #include <asm/pci_x86.h> - -#define XQUAD_PORTIO_BASE 0xfe400000 -#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ +#include <asm/numaq.h> #define BUS2QUAD(global) (mp_bus_id_to_node[global]) @@ -18,8 +16,6 @@ #define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local]) -#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port) - #define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \ (0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3)) @@ -152,14 +148,8 @@ int __init pci_numaq_init(void) { int quad; - if (!found_numaq) - return 0; - raw_pci_ops = &pci_direct_conf1_mq; - if (pcibios_scanned++) - return 0; - pci_root_bus = pcibios_scan_root(0); if (pci_root_bus) pci_bus_add_devices(pci_root_bus); diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c index b889d824f7c..b34815408f5 100644 --- a/arch/x86/pci/olpc.c +++ b/arch/x86/pci/olpc.c @@ -304,9 +304,6 @@ static struct pci_raw_ops pci_olpc_conf = { int __init pci_olpc_init(void) { - if (!machine_is_olpc() || olpc_has_vsa()) - return -ENODEV; - printk(KERN_INFO "PCI: Using configuration type OLPC\n"); raw_pci_ops = &pci_olpc_conf; is_lx = is_geode_lx(); diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index 1c975cc9839..59a225c17b8 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -4,6 +4,7 @@ #include <linux/pci.h> #include <linux/init.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/uaccess.h> #include <asm/pci_x86.h> diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c index bcead7a4687..03008f72eb0 100644 --- a/arch/x86/pci/visws.c +++ b/arch/x86/pci/visws.c @@ -69,9 +69,6 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq) int __init pci_visws_init(void) { - if (!is_visws_box()) - return -1; - pcibios_enable_irq = &pci_visws_enable_irq; pcibios_disable_irq = &pci_visws_disable_irq; @@ -90,5 +87,6 @@ int __init pci_visws_init(void) pci_scan_bus_with_sysdata(pci_bus1); pci_fixup_irqs(pci_common_swizzle, visws_map_irq); pcibios_resource_survey(); - return 0; + /* Request bus scan */ + return 1; } diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 81197c62d5b..3769079874d 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -6,6 +6,7 @@ * Copyright (c) 2006 Rafael J. Wysocki <rjw@sisk.pl> */ +#include <linux/gfp.h> #include <linux/suspend.h> #include <linux/bootmem.h> diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 65fdc86e923..d24f983ba1e 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -8,6 +8,7 @@ * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> */ +#include <linux/gfp.h> #include <linux/smp.h> #include <linux/suspend.h> #include <asm/proto.h> diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S index b641388d828..ad47daeafa4 100644 --- a/arch/x86/power/hibernate_asm_32.S +++ b/arch/x86/power/hibernate_asm_32.S @@ -27,10 +27,17 @@ ENTRY(swsusp_arch_suspend) ret ENTRY(restore_image) + movl mmu_cr4_features, %ecx movl resume_pg_dir, %eax subl $__PAGE_OFFSET, %eax movl %eax, %cr3 + jecxz 1f # cr4 Pentium and higher, skip if zero + andl $~(X86_CR4_PGE), %ecx + movl %ecx, %cr4; # turn off PGE + movl %cr3, %eax; # flush TLB + movl %eax, %cr3 +1: movl restore_pblist, %edx .p2align 4,,7 @@ -54,16 +61,8 @@ done: movl $swapper_pg_dir, %eax subl $__PAGE_OFFSET, %eax movl %eax, %cr3 - /* Flush TLB, including "global" things (vmalloc) */ movl mmu_cr4_features, %ecx jecxz 1f # cr4 Pentium and higher, skip if zero - movl %ecx, %edx - andl $~(X86_CR4_PGE), %edx - movl %edx, %cr4; # turn off PGE -1: - movl %cr3, %eax; # flush TLB - movl %eax, %cr3 - jecxz 1f # cr4 Pentium and higher, skip if zero movl %ecx, %cr4; # turn PGE back on 1: diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk index 5bbb5a33f22..fd1ab80be0d 100644 --- a/arch/x86/tools/chkobjdump.awk +++ b/arch/x86/tools/chkobjdump.awk @@ -8,14 +8,24 @@ BEGIN { od_sver = 19; } -/^GNU/ { - split($3, ver, "."); +/^GNU objdump/ { + verstr = "" + for (i = 3; i <= NF; i++) + if (match($(i), "^[0-9]")) { + verstr = $(i); + break; + } + if (verstr == "") { + printf("Warning: Failed to find objdump version number.\n"); + exit 0; + } + split(verstr, ver, "."); if (ver[1] > od_ver || (ver[1] == od_ver && ver[2] >= od_sver)) { exit 1; } else { printf("Warning: objdump version %s is older than %d.%d\n", - $4, od_ver, od_sver); + verstr, od_ver, od_sver); print("Warning: Skipping posttest."); # Logic is inverted, because we just skip test without error. exit 0; diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c index bee8d6ac269..13403fc95a9 100644 --- a/arch/x86/tools/test_get_len.c +++ b/arch/x86/tools/test_get_len.c @@ -43,7 +43,7 @@ static int x86_64; static void usage(void) { fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |" - " %s [-y|-n] [-v] \n", prog); + " %s [-y|-n] [-v]\n", prog); fprintf(stderr, "\t-y 64bit mode\n"); fprintf(stderr, "\t-n 32bit mode\n"); fprintf(stderr, "\t-v verbose mode\n"); @@ -69,7 +69,7 @@ static void dump_field(FILE *fp, const char *name, const char *indent, static void dump_insn(FILE *fp, struct insn *insn) { - fprintf(fp, "Instruction = { \n"); + fprintf(fp, "Instruction = {\n"); dump_field(fp, "prefixes", "\t", &insn->prefixes); dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix); dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix); diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 21e1aeb9f3e..ac74869b814 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -6,6 +6,7 @@ #include <linux/mm.h> #include <linux/err.h> #include <linux/sched.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/random.h> #include <linux/elf.h> diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c index e133ce25e29..1304bcec8ee 100644 --- a/arch/x86/xen/debugfs.c +++ b/arch/x86/xen/debugfs.c @@ -1,5 +1,6 @@ #include <linux/init.h> #include <linux/debugfs.h> +#include <linux/slab.h> #include <linux/module.h> #include "debugfs.h" diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 2b26dd5930c..65d8d79b46a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -28,6 +28,7 @@ #include <linux/highmem.h> #include <linux/console.h> #include <linux/pci.h> +#include <linux/gfp.h> #include <xen/xen.h> #include <xen/interface/xen.h> @@ -50,6 +51,7 @@ #include <asm/traps.h> #include <asm/setup.h> #include <asm/desc.h> +#include <asm/pgalloc.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/reboot.h> @@ -1094,6 +1096,12 @@ asmlinkage void __init xen_start_kernel(void) __supported_pte_mask |= _PAGE_IOMAP; + /* + * Prevent page tables from being allocated in highmem, even + * if CONFIG_HIGHPTE is enabled. + */ + __userpte_alloc_gfp &= ~__GFP_HIGHMEM; + /* Work out if we support NX */ x86_configure_nx(); @@ -1151,9 +1159,13 @@ asmlinkage void __init xen_start_kernel(void) /* keep using Xen gdt for now; no urgent need to change it */ +#ifdef CONFIG_X86_32 pv_info.kernel_rpl = 1; if (xen_feature(XENFEAT_supervisor_mode_kernel)) pv_info.kernel_rpl = 0; +#else + pv_info.kernel_rpl = 0; +#endif /* set the limit of our address space */ xen_reserve_top(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index bf4cd6bfe95..914f04695ce 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -43,6 +43,7 @@ #include <linux/debugfs.h> #include <linux/bug.h> #include <linux/module.h> +#include <linux/gfp.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -1427,23 +1428,6 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) #endif } -#ifdef CONFIG_HIGHPTE -static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) -{ - pgprot_t prot = PAGE_KERNEL; - - if (PagePinned(page)) - prot = PAGE_KERNEL_RO; - - if (0 && PageHighMem(page)) - printk("mapping highpte %lx type %d prot %s\n", - page_to_pfn(page), type, - (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ"); - - return kmap_atomic_prot(page, type, prot); -} -#endif - #ifdef CONFIG_X86_32 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) { @@ -1902,10 +1886,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .alloc_pmd_clone = paravirt_nop, .release_pmd = xen_release_pmd_init, -#ifdef CONFIG_HIGHPTE - .kmap_atomic_pte = xen_kmap_atomic_pte, -#endif - #ifdef CONFIG_X86_64 .set_pte = xen_set_pte, #else diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 563d2050498..a29693fd313 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -14,6 +14,7 @@ */ #include <linux/sched.h> #include <linux/err.h> +#include <linux/slab.h> #include <linux/smp.h> #include <asm/paravirt.h> @@ -361,7 +362,7 @@ static void xen_cpu_die(unsigned int cpu) alternatives_smp_switch(0); } -static void __cpuinit xen_play_dead(void) /* used only with CPU_HOTPLUG */ +static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */ { play_dead_common(); HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 24ded31b5ae..e0500646585 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -6,6 +6,7 @@ #include <linux/spinlock.h> #include <linux/debugfs.h> #include <linux/log2.h> +#include <linux/gfp.h> #include <asm/paravirt.h> diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 0d3f07cd1b5..32764b8880b 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -13,6 +13,7 @@ #include <linux/clockchips.h> #include <linux/kernel_stat.h> #include <linux/math64.h> +#include <linux/gfp.h> #include <asm/pvclock.h> #include <asm/xen/hypervisor.h> diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index 88e15deb8b8..22a2093b586 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -90,9 +90,9 @@ ENTRY(xen_iret) GET_THREAD_INFO(%eax) movl TI_cpu(%eax), %eax movl __per_cpu_offset(,%eax,4), %eax - mov per_cpu__xen_vcpu(%eax), %eax + mov xen_vcpu(%eax), %eax #else - movl per_cpu__xen_vcpu, %eax + movl xen_vcpu, %eax #endif /* check IF state we're restoring */ |