diff options
Diffstat (limited to 'arch/x86')
294 files changed, 13561 insertions, 7664 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fc20fdc0f7f..178084b4377 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,7 +24,7 @@ config X86 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE - select HAVE_PERF_COUNTERS if (!M386 && !M486) + select HAVE_PERF_EVENTS if (!M386 && !M486) select HAVE_IOREMAP_PROT select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB @@ -49,6 +49,7 @@ config X86 select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA + select HAVE_HW_BREAKPOINT select HAVE_ARCH_KMEMCHECK config OUTPUT_FORMAT @@ -86,10 +87,6 @@ config STACKTRACE_SUPPORT config HAVE_LATENCYTOP_SUPPORT def_bool y -config FAST_CMPXCHG_LOCAL - bool - default y - config MMU def_bool y @@ -150,7 +147,10 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y -config HAVE_DYNAMIC_PER_CPU_AREA +config NEED_PER_CPU_EMBED_FIRST_CHUNK + def_bool y + +config NEED_PER_CPU_PAGE_FIRST_CHUNK def_bool y config HAVE_CPUMASK_OF_CPU_MAP @@ -179,6 +179,10 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y +config HAVE_INTEL_TXT + def_bool y + depends on EXPERIMENTAL && DMAR && ACPI + # Use the generic interrupt handling code in kernel/irq/: config GENERIC_HARDIRQS bool @@ -318,6 +322,7 @@ config X86_EXTENDED_PLATFORM SGI 320/540 (Visual Workstation) Summit/EXA (IBM x440) Unisys ES7000 IA32 series + Moorestown MID devices If you have one of these systems, or if you want to build a generic distribution kernel, say Y here - otherwise say N. @@ -377,6 +382,18 @@ config X86_ELAN If unsure, choose "PC-compatible" instead. +config X86_MRST + bool "Moorestown MID platform" + depends on X86_32 + depends on X86_EXTENDED_PLATFORM + ---help--- + Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin + Internet Device(MID) platform. Moorestown consists of two chips: + Lincroft (CPU core, graphics, and memory controller) and Langwell IOH. + Unlike standard x86 PCs, Moorestown does not have many legacy devices + nor standard legacy replacement devices/features. e.g. Moorestown does + not contain i8259, i8254, HPET, legacy BIOS, most of the io ports. + config X86_RDC321X bool "RDC R-321x SoC" depends on X86_32 @@ -412,6 +429,17 @@ config X86_NUMAQ of Flat Logical. You will need a new lynxer.elf file to flash your firmware with - send email to <Martin.Bligh@us.ibm.com>. +config X86_SUPPORTS_MEMORY_FAILURE + bool + # MCE code calls memory_failure(): + depends on X86_MCE + # On 32-bit this adds too big of NODES_SHIFT and we run out of page flags: + depends on !X86_NUMAQ + # On 32-bit SPARSEMEM adds too big of SECTIONS_WIDTH: + depends on X86_64 || !SPARSEMEM + select ARCH_SUPPORTS_MEMORY_FAILURE + default y + config X86_VISWS bool "SGI 320/540 (Visual Workstation)" depends on X86_32 && PCI && X86_MPPARSE && PCI_GODIRECT @@ -464,7 +492,7 @@ if PARAVIRT_GUEST source "arch/x86/xen/Kconfig" config VMI - bool "VMI Guest support" + bool "VMI Guest support (DEPRECATED)" select PARAVIRT depends on X86_32 ---help--- @@ -473,6 +501,15 @@ config VMI at the moment), by linking the kernel to a GPL-ed ROM module provided by the hypervisor. + As of September 2009, VMware has started a phased retirement + of this feature from VMware's products. Please see + feature-removal-schedule.txt for details. If you are + planning to enable this option, please note that you cannot + live migrate a VMI enabled VM to a future VMware product, + which doesn't support VMI. So if you expect your kernel to + seamlessly migrate to newer VMware products, keep this + disabled. + config KVM_CLOCK bool "KVM paravirtualized clock" select PARAVIRT @@ -776,41 +813,17 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS increased on these systems. config X86_MCE - bool "Machine Check Exception" + bool "Machine Check / overheating reporting" ---help--- - Machine Check Exception support allows the processor to notify the - kernel if it detects a problem (e.g. overheating, component failure). + Machine Check support allows the processor to notify the + kernel if it detects a problem (e.g. overheating, data corruption). The action the kernel takes depends on the severity of the problem, - ranging from a warning message on the console, to halting the machine. - Your processor must be a Pentium or newer to support this - check the - flags in /proc/cpuinfo for mce. Note that some older Pentium systems - have a design flaw which leads to false MCE events - hence MCE is - disabled on all P5 processors, unless explicitly enabled with "mce" - as a boot argument. Similarly, if MCE is built in and creates a - problem on some new non-standard machine, you can boot with "nomce" - to disable it. MCE support simply ignores non-MCE processors like - the 386 and 486, so nearly everyone can say Y here. - -config X86_OLD_MCE - depends on X86_32 && X86_MCE - bool "Use legacy machine check code (will go away)" - default n - select X86_ANCIENT_MCE - ---help--- - Use the old i386 machine check code. This is merely intended for - testing in a transition period. Try this if you run into any machine - check related software problems, but report the problem to - linux-kernel. When in doubt say no. - -config X86_NEW_MCE - depends on X86_MCE - bool - default y if (!X86_OLD_MCE && X86_32) || X86_64 + ranging from warning messages to halting the machine. config X86_MCE_INTEL def_bool y prompt "Intel MCE features" - depends on X86_NEW_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC ---help--- Additional support for intel specific MCE features such as the thermal monitor. @@ -818,14 +831,14 @@ config X86_MCE_INTEL config X86_MCE_AMD def_bool y prompt "AMD MCE features" - depends on X86_NEW_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC ---help--- Additional support for AMD specific MCE features such as the DRAM Error Threshold. config X86_ANCIENT_MCE def_bool n - depends on X86_32 + depends on X86_32 && X86_MCE prompt "Support for old Pentium 5 / WinChip machine checks" ---help--- Include support for machine check handling on old Pentium 5 or WinChip @@ -838,36 +851,16 @@ config X86_MCE_THRESHOLD default y config X86_MCE_INJECT - depends on X86_NEW_MCE + depends on X86_MCE tristate "Machine check injector support" ---help--- Provide support for injecting machine checks for testing purposes. If you don't know what a machine check is and you don't do kernel QA it is safe to say n. -config X86_MCE_NONFATAL - tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" - depends on X86_OLD_MCE - ---help--- - Enabling this feature starts a timer that triggers every 5 seconds which - will look at the machine check registers to see if anything happened. - Non-fatal problems automatically get corrected (but still logged). - Disable this if you don't want to see these messages. - Seeing the messages this option prints out may be indicative of dying - or out-of-spec (ie, overclocked) hardware. - This option only does something on certain CPUs. - (AMD Athlon/Duron and Intel Pentium 4) - -config X86_MCE_P4THERMAL - bool "check for P4 thermal throttling interrupt." - depends on X86_OLD_MCE && X86_MCE && (X86_UP_APIC || SMP) - ---help--- - Enabling this feature will cause a message to be printed when the P4 - enters thermal throttling. - config X86_THERMAL_VECTOR def_bool y - depends on X86_MCE_P4THERMAL || X86_MCE_INTEL + depends on X86_MCE_INTEL config VM86 bool "Enable VM86 support" if EMBEDDED @@ -1228,6 +1221,10 @@ config ARCH_DISCONTIGMEM_DEFAULT def_bool y depends on NUMA && X86_32 +config ARCH_PROC_KCORE_TEXT + def_bool y + depends on X86_64 && PROC_KCORE + config ARCH_SPARSEMEM_DEFAULT def_bool y depends on X86_64 @@ -1413,6 +1410,10 @@ config X86_PAT If unsure, say Y. +config ARCH_USES_PG_UNCACHED + def_bool y + depends on X86_PAT + config EFI bool "EFI runtime service support" depends on ACPI @@ -1443,12 +1444,8 @@ config SECCOMP If unsure, say Y. Only embedded should say N here. -config CC_STACKPROTECTOR_ALL - bool - config CC_STACKPROTECTOR bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)" - select CC_STACKPROTECTOR_ALL ---help--- This option turns on the -fstack-protector GCC feature. This feature puts, at the beginning of functions, a canary value on @@ -1682,6 +1679,8 @@ source "kernel/power/Kconfig" source "drivers/acpi/Kconfig" +source "drivers/sfi/Kconfig" + config X86_APM_BOOT bool default y @@ -1877,7 +1876,7 @@ config PCI_DIRECT config PCI_MMCONFIG def_bool y - depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY) + depends on X86_32 && PCI && (ACPI || SFI) && (PCI_GOMMCONFIG || PCI_GOANY) config PCI_OLPC def_bool y @@ -1915,7 +1914,7 @@ config DMAR_DEFAULT_ON config DMAR_BROKEN_GFX_WA def_bool n prompt "Workaround broken graphics drivers (going away soon)" - depends on DMAR + depends on DMAR && BROKEN ---help--- Current Graphics drivers tend to use physical address for DMA and avoid using DMA APIs. Setting this config diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 979de294710..5e99762eb5c 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -400,7 +400,7 @@ config X86_TSC config X86_CMPXCHG64 def_bool y - depends on X86_PAE || X86_64 + depends on !M386 && !M486 # this should be set for all -march=.. options where the compiler # generates cmov. @@ -412,6 +412,7 @@ config X86_MINIMUM_CPU_FAMILY int default "64" if X86_64 default "6" if X86_32 && X86_P6_NOP + default "5" if X86_32 && X86_CMPXCHG64 default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK) default "3" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index d105f29bb6b..731318e5ac1 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -186,6 +186,15 @@ config X86_DS_SELFTEST config HAVE_MMIOTRACE_SUPPORT def_bool y +config X86_DECODER_SELFTEST + bool "x86 instruction decoder selftest" + depends on DEBUG_KERNEL + ---help--- + Perform x86 instruction decoder selftests at build time. + This option is useful for checking the sanity of x86 instruction + decoder code. + If unsure, say "N". + # # IO delay types: # @@ -287,4 +296,18 @@ config OPTIMIZE_INLINING If unsure, say N. +config DEBUG_STRICT_USER_COPY_CHECKS + bool "Strict copy size checks" + depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING + ---help--- + Enabling this option turns a certain set of sanity checks for user + copy operations into compile time failures. + + The copy_from_user() etc checks are there to help test if there + are sufficient security checks on the length argument of + the copy operation, by having gcc prove that the argument is + within bounds. + + If unsure, or if you run an older (pre 4.4) gcc, say N. + endmenu diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 7983c420eaf..78b32be55e9 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -76,7 +76,6 @@ ifdef CONFIG_CC_STACKPROTECTOR cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(biarch)),y) stackp-y := -fstack-protector - stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += -fstack-protector-all KBUILD_CFLAGS += $(stackp-y) else $(warning stack protector enabled but no compiler support) @@ -156,6 +155,9 @@ all: bzImage KBUILD_IMAGE := $(boot)/bzImage bzImage: vmlinux +ifeq ($(CONFIG_X86_DECODER_SELFTEST),y) + $(Q)$(MAKE) $(build)=arch/x86/tools posttest +endif $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@ @@ -179,8 +181,8 @@ archclean: define archhelp echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' echo ' install - Install kernel using' - echo ' (your) ~/bin/installkernel or' - echo ' (distribution) /sbin/installkernel or' + echo ' (your) ~/bin/$(INSTALLKERNEL) or' + echo ' (distribution) /sbin/$(INSTALLKERNEL) or' echo ' install to $$(INSTALL_PATH) and run lilo' echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 75e4f001e70..f543b70ffae 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -23,13 +23,14 @@ */ .text +#include <linux/init.h> #include <linux/linkage.h> #include <asm/segment.h> #include <asm/page_types.h> #include <asm/boot.h> #include <asm/asm-offsets.h> - .section ".text.head","ax",@progbits + __HEAD ENTRY(startup_32) cld /* diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index f62c284db9e..077e1b69198 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -24,6 +24,7 @@ .code32 .text +#include <linux/init.h> #include <linux/linkage.h> #include <asm/segment.h> #include <asm/pgtable_types.h> @@ -33,7 +34,7 @@ #include <asm/processor-flags.h> #include <asm/asm-offsets.h> - .section ".text.head" + __HEAD .code32 ENTRY(startup_32) cld diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index cc353e1b3ff..f4193bb4878 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -1,3 +1,5 @@ +#include <asm-generic/vmlinux.lds.h> + OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) #undef i386 @@ -18,9 +20,9 @@ SECTIONS * address 0. */ . = 0; - .text.head : { + .head.text : { _head = . ; - *(.text.head) + HEAD_TEXT _ehead = . ; } .rodata.compressed : { diff --git a/arch/x86/boot/install.sh b/arch/x86/boot/install.sh index 8d60ee15dfd..d13ec1c3864 100644 --- a/arch/x86/boot/install.sh +++ b/arch/x86/boot/install.sh @@ -33,8 +33,8 @@ verify "$3" # User may have a custom install script -if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi -if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then exec /sbin/${CROSS_COMPILE}installkernel "$@"; fi +if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi +if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi # Default install - same as make zlilo diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld index 0f6ec455a2b..03c0683636b 100644 --- a/arch/x86/boot/setup.ld +++ b/arch/x86/boot/setup.ld @@ -53,6 +53,9 @@ SECTIONS /DISCARD/ : { *(.note*) } + /* + * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: + */ . = ASSERT(_end <= 0x8000, "Setup too big!"); . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!"); /* Necessary for the very-old-loader check to work... */ diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 585edebe12c..49c552c060e 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -82,7 +82,7 @@ static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx, return -EINVAL; } - if (irq_fpu_usable()) + if (!irq_fpu_usable()) err = crypto_aes_expand_key(ctx, in_key, key_len); else { kernel_fpu_begin(); @@ -103,7 +103,7 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); - if (irq_fpu_usable()) + if (!irq_fpu_usable()) crypto_aes_encrypt_x86(ctx, dst, src); else { kernel_fpu_begin(); @@ -116,7 +116,7 @@ static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); - if (irq_fpu_usable()) + if (!irq_fpu_usable()) crypto_aes_decrypt_x86(ctx, dst, src); else { kernel_fpu_begin(); @@ -342,7 +342,7 @@ static int ablk_encrypt(struct ablkcipher_request *req) struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); - if (irq_fpu_usable()) { + if (!irq_fpu_usable()) { struct ablkcipher_request *cryptd_req = ablkcipher_request_ctx(req); memcpy(cryptd_req, req, sizeof(*req)); @@ -363,7 +363,7 @@ static int ablk_decrypt(struct ablkcipher_request *req) struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); - if (irq_fpu_usable()) { + if (!irq_fpu_usable()) { struct ablkcipher_request *cryptd_req = ablkcipher_request_ctx(req); memcpy(cryptd_req, req, sizeof(*req)); diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index ba331bfd111..581b0568fe1 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -21,8 +21,8 @@ #define __AUDIT_ARCH_LE 0x40000000 #ifndef CONFIG_AUDITSYSCALL -#define sysexit_audit int_ret_from_sys_call -#define sysretl_audit int_ret_from_sys_call +#define sysexit_audit ia32_ret_from_sys_call +#define sysretl_audit ia32_ret_from_sys_call #endif #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) @@ -39,12 +39,12 @@ .endm /* clobbers %eax */ - .macro CLEAR_RREGS _r9=rax + .macro CLEAR_RREGS offset=0, _r9=rax xorl %eax,%eax - movq %rax,R11(%rsp) - movq %rax,R10(%rsp) - movq %\_r9,R9(%rsp) - movq %rax,R8(%rsp) + movq %rax,\offset+R11(%rsp) + movq %rax,\offset+R10(%rsp) + movq %\_r9,\offset+R9(%rsp) + movq %rax,\offset+R8(%rsp) .endm /* @@ -172,6 +172,10 @@ sysexit_from_sys_call: movl RIP-R11(%rsp),%edx /* User %eip */ CFI_REGISTER rip,rdx RESTORE_ARGS 1,24,1,1,1,1 + xorq %r8,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 popfq CFI_ADJUST_CFA_OFFSET -8 /*CFI_RESTORE rflags*/ @@ -200,9 +204,9 @@ sysexit_from_sys_call: movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */ .endm - .macro auditsys_exit exit,ebpsave=RBP + .macro auditsys_exit exit testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) - jnz int_ret_from_sys_call + jnz ia32_ret_from_sys_call TRACE_IRQS_ON sti movl %eax,%esi /* second arg, syscall return value */ @@ -213,13 +217,13 @@ sysexit_from_sys_call: call audit_syscall_exit GET_THREAD_INFO(%r10) movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */ - movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi cli TRACE_IRQS_OFF testl %edi,TI_flags(%r10) - jnz int_with_check - jmp \exit + jz \exit + CLEAR_RREGS -ARGOFFSET + jmp int_with_check .endm sysenter_auditsys: @@ -329,6 +333,9 @@ sysretl_from_sys_call: CFI_REGISTER rip,rcx movl EFLAGS-ARGOFFSET(%rsp),%r11d /*CFI_REGISTER rflags,r11*/ + xorq %r10,%r10 + xorq %r9,%r9 + xorq %r8,%r8 TRACE_IRQS_ON movl RSP-ARGOFFSET(%rsp),%esp CFI_RESTORE rsp @@ -343,7 +350,7 @@ cstar_auditsys: jmp cstar_dispatch sysretl_audit: - auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */ + auditsys_exit sysretl_from_sys_call #endif cstar_tracesys: @@ -353,7 +360,7 @@ cstar_tracesys: #endif xchgl %r9d,%ebp SAVE_REST - CLEAR_RREGS r9 + CLEAR_RREGS 0, r9 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter @@ -425,6 +432,8 @@ ia32_do_call: call *ia32_sys_call_table(,%rax,8) # xxx: rip relative ia32_sysret: movq %rax,RAX-ARGOFFSET(%rsp) +ia32_ret_from_sys_call: + CLEAR_RREGS -ARGOFFSET jmp int_ret_from_sys_call ia32_tracesys: @@ -442,8 +451,8 @@ END(ia32_syscall) ia32_badsys: movq $0,ORIG_RAX-ARGOFFSET(%rsp) - movq $-ENOSYS,RAX-ARGOFFSET(%rsp) - jmp int_ret_from_sys_call + movq $-ENOSYS,%rax + jmp ia32_sysret quiet_ni_syscall: movq $-ENOSYS,%rax @@ -831,5 +840,5 @@ ia32_sys_call_table: .quad compat_sys_preadv .quad compat_sys_pwritev .quad compat_sys_rt_tgsigqueueinfo /* 335 */ - .quad sys_perf_counter_open + .quad sys_perf_event_open ia32_syscall_end: diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 4a8e80cdcfa..9f828f87ca3 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -10,6 +10,7 @@ header-y += ptrace-abi.h header-y += sigcontext32.h header-y += ucontext.h header-y += processor-flags.h +header-y += hw_breakpoint.h unifdef-y += e820.h unifdef-y += ist.h diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h index bb70e397aa8..7a15588e45d 100644 --- a/arch/x86/include/asm/a.out-core.h +++ b/arch/x86/include/asm/a.out-core.h @@ -17,6 +17,7 @@ #include <linux/user.h> #include <linux/elfcore.h> +#include <asm/debugreg.h> /* * fill in the user structure for an a.out core dump @@ -32,14 +33,7 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump) >> PAGE_SHIFT; dump->u_dsize -= dump->u_tsize; dump->u_ssize = 0; - dump->u_debugreg[0] = current->thread.debugreg0; - dump->u_debugreg[1] = current->thread.debugreg1; - dump->u_debugreg[2] = current->thread.debugreg2; - dump->u_debugreg[3] = current->thread.debugreg3; - dump->u_debugreg[4] = 0; - dump->u_debugreg[5] = 0; - dump->u_debugreg[6] = current->thread.debugreg6; - dump->u_debugreg[7] = current->thread.debugreg7; + aout_dump_debugregs(dump); if (dump->start_stack < TASK_SIZE) dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack)) diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 20d1465a2ab..4518dc50090 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -144,7 +144,6 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) #else /* !CONFIG_ACPI */ -#define acpi_disabled 1 #define acpi_lapic 0 #define acpi_ioapic 0 static inline void acpi_noirq_set(void) { } diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h index 9825cd64c9b..eec2a70d437 100644 --- a/arch/x86/include/asm/agp.h +++ b/arch/x86/include/asm/agp.h @@ -22,10 +22,6 @@ */ #define flush_agp_cache() wbinvd() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index e2077d343c3..b97f786a48d 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -1,17 +1,13 @@ #ifdef __ASSEMBLY__ -#ifdef CONFIG_X86_32 -# define X86_ALIGN .long -#else -# define X86_ALIGN .quad -#endif +#include <asm/asm.h> #ifdef CONFIG_SMP .macro LOCK_PREFIX 1: lock .section .smp_locks,"a" - .align 4 - X86_ALIGN 1b + _ASM_ALIGN + _ASM_PTR 1b .previous .endm #else diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index c240efc74e0..69b74a7b877 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -84,6 +84,7 @@ static inline void alternatives_smp_switch(int smp) {} " .byte " __stringify(feature) "\n" /* feature bit */ \ " .byte 662b-661b\n" /* sourcelen */ \ " .byte 664f-663f\n" /* replacementlen */ \ + " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \ ".previous\n" \ ".section .altinstr_replacement, \"ax\"\n" \ "663:\n\t" newinstr "\n664:\n" /* replacement */ \ diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index ac95995b7ba..5af2982133b 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * @@ -23,18 +23,13 @@ #include <linux/irqreturn.h> #ifdef CONFIG_AMD_IOMMU -extern int amd_iommu_init(void); -extern int amd_iommu_init_dma_ops(void); -extern int amd_iommu_init_passthrough(void); + extern void amd_iommu_detect(void); -extern irqreturn_t amd_iommu_int_handler(int irq, void *data); -extern void amd_iommu_flush_all_domains(void); -extern void amd_iommu_flush_all_devices(void); -extern void amd_iommu_shutdown(void); + #else -static inline int amd_iommu_init(void) { return -ENODEV; } + static inline void amd_iommu_detect(void) { } -static inline void amd_iommu_shutdown(void) { } + #endif #endif /* _ASM_X86_AMD_IOMMU_H */ diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h new file mode 100644 index 00000000000..84786fb9a23 --- /dev/null +++ b/arch/x86/include/asm/amd_iommu_proto.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2009 Advanced Micro Devices, Inc. + * Author: Joerg Roedel <joerg.roedel@amd.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _ASM_X86_AMD_IOMMU_PROTO_H +#define _ASM_X86_AMD_IOMMU_PROTO_H + +struct amd_iommu; + +extern int amd_iommu_init_dma_ops(void); +extern int amd_iommu_init_passthrough(void); +extern irqreturn_t amd_iommu_int_handler(int irq, void *data); +extern void amd_iommu_flush_all_domains(void); +extern void amd_iommu_flush_all_devices(void); +extern void amd_iommu_apply_erratum_63(u16 devid); +extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); + +#ifndef CONFIG_AMD_IOMMU_STATS + +static inline void amd_iommu_stats_init(void) { } + +#endif /* !CONFIG_AMD_IOMMU_STATS */ + +#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */ diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 2a2cc7a78a8..ba19ad4c47d 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * @@ -25,6 +25,11 @@ #include <linux/spinlock.h> /* + * Maximum number of IOMMUs supported + */ +#define MAX_IOMMUS 32 + +/* * some size calculation constants */ #define DEV_TABLE_ENTRY_SIZE 32 @@ -206,6 +211,9 @@ extern bool amd_iommu_dump; printk(KERN_INFO "AMD-Vi: " format, ## arg); \ } while(0); +/* global flag if IOMMUs cache non-present entries */ +extern bool amd_iommu_np_cache; + /* * Make iterating over all IOMMUs easier */ @@ -226,6 +234,8 @@ extern bool amd_iommu_dump; * independent of their use. */ struct protection_domain { + struct list_head list; /* for list of all protection domains */ + struct list_head dev_list; /* List of all devices in this domain */ spinlock_t lock; /* mostly used to lock the page table*/ u16 id; /* the domain id written to the device table */ int mode; /* paging mode (0-6 levels) */ @@ -233,7 +243,20 @@ struct protection_domain { unsigned long flags; /* flags to find out type of domain */ bool updated; /* complete domain flush required */ unsigned dev_cnt; /* devices assigned to this domain */ + unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ void *priv; /* private data */ + +}; + +/* + * This struct contains device specific data for the IOMMU + */ +struct iommu_dev_data { + struct list_head list; /* For domain->dev_list */ + struct device *dev; /* Device this data belong to */ + struct device *alias; /* The Alias Device */ + struct protection_domain *domain; /* Domain the device is bound to */ + atomic_t bind; /* Domain attach reverent count */ }; /* @@ -291,6 +314,9 @@ struct dma_ops_domain { struct amd_iommu { struct list_head list; + /* Index within the IOMMU array */ + int index; + /* locks the accesses to the hardware */ spinlock_t lock; @@ -357,6 +383,21 @@ struct amd_iommu { extern struct list_head amd_iommu_list; /* + * Array with pointers to each IOMMU struct + * The indices are referenced in the protection domains + */ +extern struct amd_iommu *amd_iommus[MAX_IOMMUS]; + +/* Number of IOMMUs present in the system */ +extern int amd_iommus_present; + +/* + * Declarations for the global list of all protection domains + */ +extern spinlock_t amd_iommu_pd_lock; +extern struct list_head amd_iommu_pd_list; + +/* * Structure defining one entry in the device table */ struct dev_table_entry { @@ -416,15 +457,9 @@ extern unsigned amd_iommu_aperture_order; /* largest PCI device id we expect translation requests for */ extern u16 amd_iommu_last_bdf; -/* data structures for protection domain handling */ -extern struct protection_domain **amd_iommu_pd_table; - /* allocation bitmap for domain ids */ extern unsigned long *amd_iommu_pd_alloc_bitmap; -/* will be 1 if device isolation is enabled */ -extern bool amd_iommu_isolate; - /* * If true, the addresses will be flushed on unmap time, not when * they are reused @@ -462,11 +497,6 @@ struct __iommu_counter { #define ADD_STATS_COUNTER(name, x) #define SUB_STATS_COUNTER(name, x) -static inline void amd_iommu_stats_init(void) { } - #endif /* CONFIG_AMD_IOMMU_STATS */ -/* some function prototypes */ -extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); - #endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 586b7adb8e5..b4ac2cdcb64 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -66,13 +66,23 @@ static inline void default_inquire_remote_apic(int apicid) } /* + * With 82489DX we can't rely on apic feature bit + * retrieved via cpuid but still have to deal with + * such an apic chip so we assume that SMP configuration + * is found from MP table (64bit case uses ACPI mostly + * which set smp presence flag as well so we are safe + * to use this helper too). + */ +static inline bool apic_from_smp_config(void) +{ + return smp_found_config && !disable_apic; +} + +/* * Basic functions accessing APICs. */ #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> -#else -#define setup_boot_clock setup_boot_APIC_clock -#define setup_secondary_clock setup_secondary_APIC_clock #endif #ifdef CONFIG_X86_64 @@ -252,6 +262,8 @@ static inline void lapic_shutdown(void) { } static inline void init_apic_mappings(void) { } static inline void disable_local_APIC(void) { } static inline void apic_disable(void) { } +# define setup_boot_APIC_clock x86_init_noop +# define setup_secondary_APIC_clock x86_init_noop #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_64 @@ -285,22 +297,22 @@ struct apic { int disable_esr; int dest_logical; - unsigned long (*check_apicid_used)(physid_mask_t bitmap, int apicid); + unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid); unsigned long (*check_apicid_present)(int apicid); void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); void (*init_apic_ldr)(void); - physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map); + void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); void (*setup_apic_routing)(void); int (*multi_timer_check)(int apic, int irq); int (*apicid_to_node)(int logical_apicid); int (*cpu_to_logical_apicid)(int cpu); int (*cpu_present_to_apicid)(int mps_cpu); - physid_mask_t (*apicid_to_cpu_present)(int phys_apicid); + void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap); void (*setup_portio_remap)(void); - int (*check_phys_apicid_present)(int boot_cpu_physical_apicid); + int (*check_phys_apicid_present)(int phys_apicid); void (*enable_apic_mode)(void); int (*phys_pkg_id)(int cpuid_apic, int index_msb); @@ -434,7 +446,7 @@ extern struct apic apic_x2apic_uv_x; DECLARE_PER_CPU(int, x2apic_extra_bits); extern int default_cpu_present_to_apicid(int mps_cpu); -extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid); +extern int default_check_phys_apicid_present(int phys_apicid); #endif static inline void default_wait_for_init_deassert(atomic_t *deassert) @@ -476,6 +488,8 @@ static inline unsigned int read_apic_id(void) extern void default_setup_apic_routing(void); +extern struct apic apic_noop; + #ifdef CONFIG_X86_32 extern struct apic apic_default; @@ -520,9 +534,9 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, return (unsigned int)(mask1 & mask2 & mask3); } -static inline unsigned long default_check_apicid_used(physid_mask_t bitmap, int apicid) +static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid) { - return physid_isset(apicid, bitmap); + return physid_isset(apicid, *map); } static inline unsigned long default_check_apicid_present(int bit) @@ -530,9 +544,9 @@ static inline unsigned long default_check_apicid_present(int bit) return physid_isset(bit, phys_cpu_present_map); } -static inline physid_mask_t default_ioapic_phys_id_map(physid_mask_t phys_map) +static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { - return phys_map; + *retmap = *phys_map; } /* Mapping from cpu number to logical apicid */ @@ -550,9 +564,9 @@ static inline int __default_cpu_present_to_apicid(int mps_cpu) } static inline int -__default_check_phys_apicid_present(int boot_cpu_physical_apicid) +__default_check_phys_apicid_present(int phys_apicid) { - return physid_isset(boot_cpu_physical_apicid, phys_cpu_present_map); + return physid_isset(phys_apicid, phys_cpu_present_map); } #ifdef CONFIG_X86_32 @@ -562,20 +576,15 @@ static inline int default_cpu_present_to_apicid(int mps_cpu) } static inline int -default_check_phys_apicid_present(int boot_cpu_physical_apicid) +default_check_phys_apicid_present(int phys_apicid) { - return __default_check_phys_apicid_present(boot_cpu_physical_apicid); + return __default_check_phys_apicid_present(phys_apicid); } #else extern int default_cpu_present_to_apicid(int mps_cpu); -extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid); +extern int default_check_phys_apicid_present(int phys_apicid); #endif -static inline physid_mask_t default_apicid_to_cpu_present(int phys_apicid) -{ - return physid_mask_of_physid(phys_apicid); -} - #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 7386bfa4f4b..7fe3b3060f0 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -11,10 +11,17 @@ #define IO_APIC_DEFAULT_PHYS_BASE 0xfec00000 #define APIC_DEFAULT_PHYS_BASE 0xfee00000 +/* + * This is the IO-APIC register space as specified + * by Intel docs: + */ +#define IO_APIC_SLOT_SIZE 1024 + #define APIC_ID 0x20 #define APIC_LVR 0x30 #define APIC_LVR_MASK 0xFF00FF +#define APIC_LVR_DIRECTED_EOI (1 << 24) #define GET_APIC_VERSION(x) ((x) & 0xFFu) #define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu) #ifdef CONFIG_X86_32 @@ -41,6 +48,7 @@ #define APIC_DFR_CLUSTER 0x0FFFFFFFul #define APIC_DFR_FLAT 0xFFFFFFFFul #define APIC_SPIV 0xF0 +#define APIC_SPIV_DIRECTED_EOI (1 << 12) #define APIC_SPIV_FOCUS_DISABLED (1 << 9) #define APIC_SPIV_APIC_ENABLED (1 << 8) #define APIC_ISR 0x100 diff --git a/arch/x86/include/asm/apicnum.h b/arch/x86/include/asm/apicnum.h deleted file mode 100644 index 82f613c607c..00000000000 --- a/arch/x86/include/asm/apicnum.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _ASM_X86_APICNUM_H -#define _ASM_X86_APICNUM_H - -/* define MAX_IO_APICS */ -#ifdef CONFIG_X86_32 -# define MAX_IO_APICS 64 -#else -# define MAX_IO_APICS 128 -# define MAX_LOCAL_APIC 32768 -#endif - -#endif /* _ASM_X86_APICNUM_H */ diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index 1724e8de317..6be33d83c71 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -85,7 +85,8 @@ struct efi_info { struct boot_params { struct screen_info screen_info; /* 0x000 */ struct apm_bios_info apm_bios_info; /* 0x040 */ - __u8 _pad2[12]; /* 0x054 */ + __u8 _pad2[4]; /* 0x054 */ + __u64 tboot_addr; /* 0x058 */ struct ist_info ist_info; /* 0x060 */ __u8 _pad3[16]; /* 0x070 */ __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ @@ -109,4 +110,14 @@ struct boot_params { __u8 _pad9[276]; /* 0xeec */ } __attribute__((packed)); +enum { + X86_SUBARCH_PC = 0, + X86_SUBARCH_LGUEST, + X86_SUBARCH_XEN, + X86_SUBARCH_MRST, + X86_NR_SUBARCHS, +}; + + + #endif /* _ASM_X86_BOOTPARAM_H */ diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index d9cf1cd156d..f654d1bb17f 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -22,14 +22,14 @@ do { \ ".popsection" \ : : "i" (__FILE__), "i" (__LINE__), \ "i" (sizeof(struct bug_entry))); \ - for (;;) ; \ + unreachable(); \ } while (0) #else #define BUG() \ do { \ asm volatile("ud2"); \ - for (;;) ; \ + unreachable(); \ } while (0) #endif diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h index 5d367caa0e3..549860d3be8 100644 --- a/arch/x86/include/asm/cache.h +++ b/arch/x86/include/asm/cache.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_CACHE_H #define _ASM_X86_CACHE_H +#include <linux/linkage.h> + /* L1 cache line size */ #define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT) #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) @@ -13,7 +15,7 @@ #ifdef CONFIG_SMP #define __cacheline_aligned_in_smp \ __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \ - __attribute__((__section__(".data.page_aligned"))) + __page_aligned_data #endif #endif diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index e55dfc1ad45..b54f6afe7ec 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -43,8 +43,58 @@ static inline void copy_from_user_page(struct vm_area_struct *vma, memcpy(dst, src, len); } -#define PG_non_WB PG_arch_1 -PAGEFLAG(NonWB, non_WB) +#define PG_WC PG_arch_1 +PAGEFLAG(WC, WC) + +#ifdef CONFIG_X86_PAT +/* + * X86 PAT uses page flags WC and Uncached together to keep track of + * memory type of pages that have backing page struct. X86 PAT supports 3 + * different memory types, _PAGE_CACHE_WB, _PAGE_CACHE_WC and + * _PAGE_CACHE_UC_MINUS and fourth state where page's memory type has not + * been changed from its default (value of -1 used to denote this). + * Note we do not support _PAGE_CACHE_UC here. + * + * Caller must hold memtype_lock for atomicity. + */ +static inline unsigned long get_page_memtype(struct page *pg) +{ + if (!PageUncached(pg) && !PageWC(pg)) + return -1; + else if (!PageUncached(pg) && PageWC(pg)) + return _PAGE_CACHE_WC; + else if (PageUncached(pg) && !PageWC(pg)) + return _PAGE_CACHE_UC_MINUS; + else + return _PAGE_CACHE_WB; +} + +static inline void set_page_memtype(struct page *pg, unsigned long memtype) +{ + switch (memtype) { + case _PAGE_CACHE_WC: + ClearPageUncached(pg); + SetPageWC(pg); + break; + case _PAGE_CACHE_UC_MINUS: + SetPageUncached(pg); + ClearPageWC(pg); + break; + case _PAGE_CACHE_WB: + SetPageUncached(pg); + SetPageWC(pg); + break; + default: + case -1: + ClearPageUncached(pg); + ClearPageWC(pg); + break; + } +} +#else +static inline unsigned long get_page_memtype(struct page *pg) { return -1; } +static inline void set_page_memtype(struct page *pg, unsigned long memtype) { } +#endif /* * The set_memory_* API can be used to change various attributes of a virtual diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h index b03bedb62aa..0918654305a 100644 --- a/arch/x86/include/asm/calgary.h +++ b/arch/x86/include/asm/calgary.h @@ -62,10 +62,8 @@ struct cal_chipset_ops { extern int use_calgary; #ifdef CONFIG_CALGARY_IOMMU -extern int calgary_iommu_init(void); extern void detect_calgary(void); #else -static inline int calgary_iommu_init(void) { return 1; } static inline void detect_calgary(void) { return; } #endif diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h index 7c5ef8b14d9..46fc474fd81 100644 --- a/arch/x86/include/asm/checksum_32.h +++ b/arch/x86/include/asm/checksum_32.h @@ -161,7 +161,8 @@ static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr, "adcl $0, %0 ;\n" : "=&r" (sum) : "r" (saddr), "r" (daddr), - "r" (htonl(len)), "r" (htonl(proto)), "0" (sum)); + "r" (htonl(len)), "r" (htonl(proto)), "0" (sum) + : "memory"); return csum_fold(sum); } diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 82ceb788a98..ffb9bb6b6c3 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -8,14 +8,50 @@ * you need to test for the feature in boot_cpu_data. */ -#define xchg(ptr, v) \ - ((__typeof__(*(ptr)))__xchg((unsigned long)(v), (ptr), sizeof(*(ptr)))) +extern void __xchg_wrong_size(void); + +/* + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway + * Note 2: xchg has side effect, so that attribute volatile is necessary, + * but generally the primitive is invalid, *ptr is output argument. --ANK + */ struct __xchg_dummy { unsigned long a[100]; }; #define __xg(x) ((struct __xchg_dummy *)(x)) +#define __xchg(x, ptr, size) \ +({ \ + __typeof(*(ptr)) __x = (x); \ + switch (size) { \ + case 1: \ + asm volatile("xchgb %b0,%1" \ + : "=q" (__x) \ + : "m" (*__xg(ptr)), "0" (__x) \ + : "memory"); \ + break; \ + case 2: \ + asm volatile("xchgw %w0,%1" \ + : "=r" (__x) \ + : "m" (*__xg(ptr)), "0" (__x) \ + : "memory"); \ + break; \ + case 4: \ + asm volatile("xchgl %0,%1" \ + : "=r" (__x) \ + : "m" (*__xg(ptr)), "0" (__x) \ + : "memory"); \ + break; \ + default: \ + __xchg_wrong_size(); \ + } \ + __x; \ +}) + +#define xchg(ptr, v) \ + __xchg((v), (ptr), sizeof(*ptr)) + /* * The semantics of XCHGCMP8B are a bit strange, this is why * there is a loop and the loading of %%eax and %%edx has to @@ -71,57 +107,63 @@ static inline void __set_64bit_var(unsigned long long *ptr, (unsigned int)((value) >> 32)) \ : __set_64bit(ptr, ll_low((value)), ll_high((value)))) -/* - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway - * Note 2: xchg has side effect, so that attribute volatile is necessary, - * but generally the primitive is invalid, *ptr is output argument. --ANK - */ -static inline unsigned long __xchg(unsigned long x, volatile void *ptr, - int size) -{ - switch (size) { - case 1: - asm volatile("xchgb %b0,%1" - : "=q" (x) - : "m" (*__xg(ptr)), "0" (x) - : "memory"); - break; - case 2: - asm volatile("xchgw %w0,%1" - : "=r" (x) - : "m" (*__xg(ptr)), "0" (x) - : "memory"); - break; - case 4: - asm volatile("xchgl %0,%1" - : "=r" (x) - : "m" (*__xg(ptr)), "0" (x) - : "memory"); - break; - } - return x; -} +extern void __cmpxchg_wrong_size(void); /* * Atomic compare and exchange. Compare OLD with MEM, if identical, * store NEW in MEM. Return the initial value in MEM. Success is * indicated by comparing RETURN with OLD. */ +#define __raw_cmpxchg(ptr, old, new, size, lock) \ +({ \ + __typeof__(*(ptr)) __ret; \ + __typeof__(*(ptr)) __old = (old); \ + __typeof__(*(ptr)) __new = (new); \ + switch (size) { \ + case 1: \ + asm volatile(lock "cmpxchgb %b1,%2" \ + : "=a"(__ret) \ + : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \ + : "memory"); \ + break; \ + case 2: \ + asm volatile(lock "cmpxchgw %w1,%2" \ + : "=a"(__ret) \ + : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + : "memory"); \ + break; \ + case 4: \ + asm volatile(lock "cmpxchgl %1,%2" \ + : "=a"(__ret) \ + : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + : "memory"); \ + break; \ + default: \ + __cmpxchg_wrong_size(); \ + } \ + __ret; \ +}) + +#define __cmpxchg(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX) + +#define __sync_cmpxchg(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), "lock; ") + +#define __cmpxchg_local(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), "") #ifdef CONFIG_X86_CMPXCHG #define __HAVE_ARCH_CMPXCHG 1 -#define cmpxchg(ptr, o, n) \ - ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \ - (unsigned long)(n), \ - sizeof(*(ptr)))) -#define sync_cmpxchg(ptr, o, n) \ - ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \ - (unsigned long)(n), \ - sizeof(*(ptr)))) -#define cmpxchg_local(ptr, o, n) \ - ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \ - (unsigned long)(n), \ - sizeof(*(ptr)))) + +#define cmpxchg(ptr, old, new) \ + __cmpxchg((ptr), (old), (new), sizeof(*ptr)) + +#define sync_cmpxchg(ptr, old, new) \ + __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr)) + +#define cmpxchg_local(ptr, old, new) \ + __cmpxchg_local((ptr), (old), (new), sizeof(*ptr)) #endif #ifdef CONFIG_X86_CMPXCHG64 @@ -133,94 +175,6 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, (unsigned long long)(n))) #endif -static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, - unsigned long new, int size) -{ - unsigned long prev; - switch (size) { - case 1: - asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 2: - asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 4: - asm volatile(LOCK_PREFIX "cmpxchgl %1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - } - return old; -} - -/* - * Always use locked operations when touching memory shared with a - * hypervisor, since the system may be SMP even if the guest kernel - * isn't. - */ -static inline unsigned long __sync_cmpxchg(volatile void *ptr, - unsigned long old, - unsigned long new, int size) -{ - unsigned long prev; - switch (size) { - case 1: - asm volatile("lock; cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 2: - asm volatile("lock; cmpxchgw %w1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 4: - asm volatile("lock; cmpxchgl %1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - } - return old; -} - -static inline unsigned long __cmpxchg_local(volatile void *ptr, - unsigned long old, - unsigned long new, int size) -{ - unsigned long prev; - switch (size) { - case 1: - asm volatile("cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 2: - asm volatile("cmpxchgw %w1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 4: - asm volatile("cmpxchgl %1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - } - return old; -} - static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old, unsigned long long new) @@ -312,19 +266,23 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old, extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64); -#define cmpxchg64(ptr, o, n) \ -({ \ - __typeof__(*(ptr)) __ret; \ - if (likely(boot_cpu_data.x86 > 4)) \ - __ret = (__typeof__(*(ptr)))__cmpxchg64((ptr), \ - (unsigned long long)(o), \ - (unsigned long long)(n)); \ - else \ - __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \ - (unsigned long long)(o), \ - (unsigned long long)(n)); \ - __ret; \ -}) +#define cmpxchg64(ptr, o, n) \ +({ \ + __typeof__(*(ptr)) __ret; \ + __typeof__(*(ptr)) __old = (o); \ + __typeof__(*(ptr)) __new = (n); \ + alternative_io("call cmpxchg8b_emu", \ + "lock; cmpxchg8b (%%esi)" , \ + X86_FEATURE_CX8, \ + "=A" (__ret), \ + "S" ((ptr)), "0" (__old), \ + "b" ((unsigned int)__new), \ + "c" ((unsigned int)(__new>>32)) \ + : "memory"); \ + __ret; }) + + + #define cmpxchg64_local(ptr, o, n) \ ({ \ __typeof__(*(ptr)) __ret; \ diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 52de72e0de8..485ae415fae 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -3,9 +3,6 @@ #include <asm/alternative.h> /* Provides LOCK_PREFIX */ -#define xchg(ptr, v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v), \ - (ptr), sizeof(*(ptr)))) - #define __xg(x) ((volatile long *)(x)) static inline void set_64bit(volatile unsigned long *ptr, unsigned long val) @@ -15,167 +12,118 @@ static inline void set_64bit(volatile unsigned long *ptr, unsigned long val) #define _set_64bit set_64bit +extern void __xchg_wrong_size(void); +extern void __cmpxchg_wrong_size(void); + /* * Note: no "lock" prefix even on SMP: xchg always implies lock anyway * Note 2: xchg has side effect, so that attribute volatile is necessary, * but generally the primitive is invalid, *ptr is output argument. --ANK */ -static inline unsigned long __xchg(unsigned long x, volatile void *ptr, - int size) -{ - switch (size) { - case 1: - asm volatile("xchgb %b0,%1" - : "=q" (x) - : "m" (*__xg(ptr)), "0" (x) - : "memory"); - break; - case 2: - asm volatile("xchgw %w0,%1" - : "=r" (x) - : "m" (*__xg(ptr)), "0" (x) - : "memory"); - break; - case 4: - asm volatile("xchgl %k0,%1" - : "=r" (x) - : "m" (*__xg(ptr)), "0" (x) - : "memory"); - break; - case 8: - asm volatile("xchgq %0,%1" - : "=r" (x) - : "m" (*__xg(ptr)), "0" (x) - : "memory"); - break; - } - return x; -} +#define __xchg(x, ptr, size) \ +({ \ + __typeof(*(ptr)) __x = (x); \ + switch (size) { \ + case 1: \ + asm volatile("xchgb %b0,%1" \ + : "=q" (__x) \ + : "m" (*__xg(ptr)), "0" (__x) \ + : "memory"); \ + break; \ + case 2: \ + asm volatile("xchgw %w0,%1" \ + : "=r" (__x) \ + : "m" (*__xg(ptr)), "0" (__x) \ + : "memory"); \ + break; \ + case 4: \ + asm volatile("xchgl %k0,%1" \ + : "=r" (__x) \ + : "m" (*__xg(ptr)), "0" (__x) \ + : "memory"); \ + break; \ + case 8: \ + asm volatile("xchgq %0,%1" \ + : "=r" (__x) \ + : "m" (*__xg(ptr)), "0" (__x) \ + : "memory"); \ + break; \ + default: \ + __xchg_wrong_size(); \ + } \ + __x; \ +}) + +#define xchg(ptr, v) \ + __xchg((v), (ptr), sizeof(*ptr)) + +#define __HAVE_ARCH_CMPXCHG 1 /* * Atomic compare and exchange. Compare OLD with MEM, if identical, * store NEW in MEM. Return the initial value in MEM. Success is * indicated by comparing RETURN with OLD. */ +#define __raw_cmpxchg(ptr, old, new, size, lock) \ +({ \ + __typeof__(*(ptr)) __ret; \ + __typeof__(*(ptr)) __old = (old); \ + __typeof__(*(ptr)) __new = (new); \ + switch (size) { \ + case 1: \ + asm volatile(lock "cmpxchgb %b1,%2" \ + : "=a"(__ret) \ + : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \ + : "memory"); \ + break; \ + case 2: \ + asm volatile(lock "cmpxchgw %w1,%2" \ + : "=a"(__ret) \ + : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + : "memory"); \ + break; \ + case 4: \ + asm volatile(lock "cmpxchgl %k1,%2" \ + : "=a"(__ret) \ + : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + : "memory"); \ + break; \ + case 8: \ + asm volatile(lock "cmpxchgq %1,%2" \ + : "=a"(__ret) \ + : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + : "memory"); \ + break; \ + default: \ + __cmpxchg_wrong_size(); \ + } \ + __ret; \ +}) -#define __HAVE_ARCH_CMPXCHG 1 +#define __cmpxchg(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX) -static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, - unsigned long new, int size) -{ - unsigned long prev; - switch (size) { - case 1: - asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 2: - asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 4: - asm volatile(LOCK_PREFIX "cmpxchgl %k1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 8: - asm volatile(LOCK_PREFIX "cmpxchgq %1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - } - return old; -} +#define __sync_cmpxchg(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), "lock; ") -/* - * Always use locked operations when touching memory shared with a - * hypervisor, since the system may be SMP even if the guest kernel - * isn't. - */ -static inline unsigned long __sync_cmpxchg(volatile void *ptr, - unsigned long old, - unsigned long new, int size) -{ - unsigned long prev; - switch (size) { - case 1: - asm volatile("lock; cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 2: - asm volatile("lock; cmpxchgw %w1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 4: - asm volatile("lock; cmpxchgl %1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - } - return old; -} +#define __cmpxchg_local(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), "") -static inline unsigned long __cmpxchg_local(volatile void *ptr, - unsigned long old, - unsigned long new, int size) -{ - unsigned long prev; - switch (size) { - case 1: - asm volatile("cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 2: - asm volatile("cmpxchgw %w1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 4: - asm volatile("cmpxchgl %k1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 8: - asm volatile("cmpxchgq %1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - } - return old; -} +#define cmpxchg(ptr, old, new) \ + __cmpxchg((ptr), (old), (new), sizeof(*ptr)) + +#define sync_cmpxchg(ptr, old, new) \ + __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr)) + +#define cmpxchg_local(ptr, old, new) \ + __cmpxchg_local((ptr), (old), (new), sizeof(*ptr)) -#define cmpxchg(ptr, o, n) \ - ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \ - (unsigned long)(n), sizeof(*(ptr)))) #define cmpxchg64(ptr, o, n) \ ({ \ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ cmpxchg((ptr), (o), (n)); \ }) -#define cmpxchg_local(ptr, o, n) \ - ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \ - (unsigned long)(n), \ - sizeof(*(ptr)))) -#define sync_cmpxchg(ptr, o, n) \ - ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \ - (unsigned long)(n), \ - sizeof(*(ptr)))) + #define cmpxchg64_local(ptr, o, n) \ ({ \ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 847fee6493a..9cfc88b9774 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -96,6 +96,7 @@ #define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */ #define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */ #define X86_FEATURE_AMD_DCM (3*32+27) /* multi-node processor */ +#define X86_FEATURE_APERFMPERF (3*32+28) /* APERFMPERF */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 3ea6f37be9e..8240f76b531 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -18,6 +18,7 @@ #define DR_TRAP1 (0x2) /* db1 */ #define DR_TRAP2 (0x4) /* db2 */ #define DR_TRAP3 (0x8) /* db3 */ +#define DR_TRAP_BITS (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3) #define DR_STEP (0x4000) /* single-step */ #define DR_SWITCH (0x8000) /* task switch */ @@ -49,6 +50,8 @@ #define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */ #define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */ +#define DR_LOCAL_ENABLE (0x1) /* Local enable for reg 0 */ +#define DR_GLOBAL_ENABLE (0x2) /* Global enable for reg 0 */ #define DR_ENABLE_SIZE 2 /* 2 enable bits per register */ #define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */ @@ -67,4 +70,34 @@ #define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */ #define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */ +/* + * HW breakpoint additions + */ +#ifdef __KERNEL__ + +DECLARE_PER_CPU(unsigned long, cpu_dr7); + +static inline void hw_breakpoint_disable(void) +{ + /* Zero the control register for HW Breakpoint */ + set_debugreg(0UL, 7); + + /* Zero-out the individual HW breakpoint address registers */ + set_debugreg(0UL, 0); + set_debugreg(0UL, 1); + set_debugreg(0UL, 2); + set_debugreg(0UL, 3); +} + +static inline int hw_breakpoint_active(void) +{ + return __get_cpu_var(cpu_dr7) & DR_GLOBAL_ENABLE_MASK; +} + +extern void aout_dump_debugregs(struct user *dump); + +extern void hw_breakpoint_restore(void); + +#endif /* __KERNEL__ */ + #endif /* _ASM_X86_DEBUGREG_H */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index e8de2f6f5ca..617bd56b307 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -288,7 +288,7 @@ static inline void load_LDT(mm_context_t *pc) static inline unsigned long get_desc_base(const struct desc_struct *desc) { - return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24); + return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24)); } static inline void set_desc_base(struct desc_struct *desc, unsigned long base) diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index 4994a20acbc..029f230ab63 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -8,9 +8,12 @@ struct dev_archdata { #ifdef CONFIG_X86_64 struct dma_map_ops *dma_ops; #endif -#ifdef CONFIG_DMAR +#if defined(CONFIG_DMAR) || defined(CONFIG_AMD_IOMMU) void *iommu; /* hook for IOMMU specific extension */ #endif }; +struct pdev_archdata { +}; + #endif /* _ASM_X86_DEVICE_H */ diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 0ee770d23d0..0f6c02f3b7d 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -14,7 +14,14 @@ #include <asm/swiotlb.h> #include <asm-generic/dma-coherent.h> -extern dma_addr_t bad_dma_address; +#ifdef CONFIG_ISA +# define ISA_DMA_BIT_MASK DMA_BIT_MASK(24) +#else +# define ISA_DMA_BIT_MASK DMA_BIT_MASK(32) +#endif + +#define DMA_ERROR_CODE 0 + extern int iommu_merge; extern struct device x86_dma_fallback_dev; extern int panic_on_overflow; @@ -42,7 +49,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) if (ops->mapping_error) return ops->mapping_error(dev, dma_addr); - return (dma_addr == bad_dma_address); + return (dma_addr == DMA_ERROR_CODE); } #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) @@ -124,10 +131,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) return memory; - if (!dev) { + if (!dev) dev = &x86_dma_fallback_dev; - gfp |= GFP_DMA; - } if (!is_device_dma_capable(dev)) return NULL; diff --git a/arch/x86/include/asm/do_timer.h b/arch/x86/include/asm/do_timer.h deleted file mode 100644 index 23ecda0b28a..00000000000 --- a/arch/x86/include/asm/do_timer.h +++ /dev/null @@ -1,16 +0,0 @@ -/* defines for inline arch setup functions */ -#include <linux/clockchips.h> - -#include <asm/i8259.h> -#include <asm/i8253.h> - -/** - * do_timer_interrupt_hook - hook into timer tick - * - * Call the pit clock event handler. see asm/i8253.h - **/ - -static inline void do_timer_interrupt_hook(void) -{ - global_clock_event->event_handler(global_clock_event); -} diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 7ecba4d8508..40b4e614fe7 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -126,8 +126,6 @@ extern void e820_reserve_resources(void); extern void e820_reserve_resources_late(void); extern void setup_memory_map(void); extern char *default_machine_specific_memory_setup(void); -extern char *machine_specific_memory_setup(void); -extern char *memory_setup(void); #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 83c1bc8d2e8..456a304b817 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -299,6 +299,8 @@ do { \ #ifdef CONFIG_X86_32 +#define STACK_RND_MASK (0x7ff) + #define VDSO_HIGH_BASE (__fix_to_virt(FIX_VDSO)) #define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled) diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index ff8cbfa0785..f5693c81a1d 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -49,7 +49,7 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) #endif @@ -61,7 +61,7 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR) #endif diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 7b2d71df39a..14f9890eb49 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -132,6 +132,9 @@ enum fixed_addresses { #ifdef CONFIG_X86_32 FIX_WP_TEST, #endif +#ifdef CONFIG_INTEL_TXT + FIX_TBOOT_BASE, +#endif __end_of_fixed_addresses }; diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 6cfdafa409d..4ac5b0f33fc 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -35,8 +35,7 @@ extern int gart_iommu_aperture_allowed; extern int gart_iommu_aperture_disabled; extern void early_gart_iommu_check(void); -extern void gart_iommu_init(void); -extern void gart_iommu_shutdown(void); +extern int gart_iommu_init(void); extern void __init gart_parse_options(char *); extern void gart_iommu_hole_init(void); @@ -48,12 +47,6 @@ extern void gart_iommu_hole_init(void); static inline void early_gart_iommu_check(void) { } -static inline void gart_iommu_init(void) -{ -} -static inline void gart_iommu_shutdown(void) -{ -} static inline void gart_parse_options(char *options) { } diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 82e3e8f0104..108eb6fd1ae 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -20,11 +20,11 @@ typedef struct { unsigned int irq_call_count; unsigned int irq_tlb_count; #endif -#ifdef CONFIG_X86_MCE +#ifdef CONFIG_X86_THERMAL_VECTOR unsigned int irq_thermal_count; -# ifdef CONFIG_X86_MCE_THRESHOLD +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD unsigned int irq_threshold_count; -# endif #endif } ____cacheline_aligned irq_cpustat_t; diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h new file mode 100644 index 00000000000..0675a7c4c20 --- /dev/null +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -0,0 +1,73 @@ +#ifndef _I386_HW_BREAKPOINT_H +#define _I386_HW_BREAKPOINT_H + +#ifdef __KERNEL__ +#define __ARCH_HW_BREAKPOINT_H + +/* + * The name should probably be something dealt in + * a higher level. While dealing with the user + * (display/resolving) + */ +struct arch_hw_breakpoint { + char *name; /* Contains name of the symbol to set bkpt */ + unsigned long address; + u8 len; + u8 type; +}; + +#include <linux/kdebug.h> +#include <linux/percpu.h> +#include <linux/list.h> + +/* Available HW breakpoint length encodings */ +#define X86_BREAKPOINT_LEN_1 0x40 +#define X86_BREAKPOINT_LEN_2 0x44 +#define X86_BREAKPOINT_LEN_4 0x4c +#define X86_BREAKPOINT_LEN_EXECUTE 0x40 + +#ifdef CONFIG_X86_64 +#define X86_BREAKPOINT_LEN_8 0x48 +#endif + +/* Available HW breakpoint type encodings */ + +/* trigger on instruction execute */ +#define X86_BREAKPOINT_EXECUTE 0x80 +/* trigger on memory write */ +#define X86_BREAKPOINT_WRITE 0x81 +/* trigger on memory read or write */ +#define X86_BREAKPOINT_RW 0x83 + +/* Total number of available HW breakpoint registers */ +#define HBP_NUM 4 + +struct perf_event; +struct pmu; + +extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len); +extern int arch_validate_hwbkpt_settings(struct perf_event *bp, + struct task_struct *tsk); +extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, + unsigned long val, void *data); + + +int arch_install_hw_breakpoint(struct perf_event *bp); +void arch_uninstall_hw_breakpoint(struct perf_event *bp); +void hw_breakpoint_pmu_read(struct perf_event *bp); +void hw_breakpoint_pmu_unthrottle(struct perf_event *bp); + +extern void +arch_fill_perf_breakpoint(struct perf_event *bp); + +unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type); +int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type); + +extern int arch_bp_generic_fields(int x86_len, int x86_type, + int *gen_len, int *gen_type); + +extern struct pmu perf_ops_bp; + +#endif /* __KERNEL__ */ +#endif /* _I386_HW_BREAKPOINT_H */ + diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index ba180d93b08..6e124269fd4 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -79,14 +79,32 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, int ioapic, int ioapic_pin, int trigger, int polarity) { - irq_attr->ioapic = ioapic; - irq_attr->ioapic_pin = ioapic_pin; - irq_attr->trigger = trigger; - irq_attr->polarity = polarity; + irq_attr->ioapic = ioapic; + irq_attr->ioapic_pin = ioapic_pin; + irq_attr->trigger = trigger; + irq_attr->polarity = polarity; } -extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, - struct io_apic_irq_attr *irq_attr); +/* + * This is performance-critical, we want to do it O(1) + * + * Most irqs are mapped 1:1 with pins. + */ +struct irq_cfg { + struct irq_pin_list *irq_2_pin; + cpumask_var_t domain; + cpumask_var_t old_domain; + u8 vector; + u8 move_in_progress : 1; +}; + +extern struct irq_cfg *irq_cfg(unsigned int); +extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *); +extern void send_cleanup_vector(struct irq_cfg *); + +struct irq_desc; +extern unsigned int set_desc_affinity(struct irq_desc *, const struct cpumask *); +extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr); extern void setup_ioapic_dest(void); extern void enable_IO_APIC(void); diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 369f5c5d09a..b78c0941e42 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -20,7 +20,7 @@ #ifndef ASM_X86__HYPERVISOR_H #define ASM_X86__HYPERVISOR_H -extern unsigned long get_hypervisor_tsc_freq(void); extern void init_hypervisor(struct cpuinfo_x86 *c); +extern void init_hypervisor_platform(void); #endif diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h new file mode 100644 index 00000000000..205b063e3e3 --- /dev/null +++ b/arch/x86/include/asm/inat.h @@ -0,0 +1,220 @@ +#ifndef _ASM_X86_INAT_H +#define _ASM_X86_INAT_H +/* + * x86 instruction attributes + * + * Written by Masami Hiramatsu <mhiramat@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ +#include <asm/inat_types.h> + +/* + * Internal bits. Don't use bitmasks directly, because these bits are + * unstable. You should use checking functions. + */ + +#define INAT_OPCODE_TABLE_SIZE 256 +#define INAT_GROUP_TABLE_SIZE 8 + +/* Legacy last prefixes */ +#define INAT_PFX_OPNDSZ 1 /* 0x66 */ /* LPFX1 */ +#define INAT_PFX_REPE 2 /* 0xF3 */ /* LPFX2 */ +#define INAT_PFX_REPNE 3 /* 0xF2 */ /* LPFX3 */ +/* Other Legacy prefixes */ +#define INAT_PFX_LOCK 4 /* 0xF0 */ +#define INAT_PFX_CS 5 /* 0x2E */ +#define INAT_PFX_DS 6 /* 0x3E */ +#define INAT_PFX_ES 7 /* 0x26 */ +#define INAT_PFX_FS 8 /* 0x64 */ +#define INAT_PFX_GS 9 /* 0x65 */ +#define INAT_PFX_SS 10 /* 0x36 */ +#define INAT_PFX_ADDRSZ 11 /* 0x67 */ +/* x86-64 REX prefix */ +#define INAT_PFX_REX 12 /* 0x4X */ +/* AVX VEX prefixes */ +#define INAT_PFX_VEX2 13 /* 2-bytes VEX prefix */ +#define INAT_PFX_VEX3 14 /* 3-bytes VEX prefix */ + +#define INAT_LSTPFX_MAX 3 +#define INAT_LGCPFX_MAX 11 + +/* Immediate size */ +#define INAT_IMM_BYTE 1 +#define INAT_IMM_WORD 2 +#define INAT_IMM_DWORD 3 +#define INAT_IMM_QWORD 4 +#define INAT_IMM_PTR 5 +#define INAT_IMM_VWORD32 6 +#define INAT_IMM_VWORD 7 + +/* Legacy prefix */ +#define INAT_PFX_OFFS 0 +#define INAT_PFX_BITS 4 +#define INAT_PFX_MAX ((1 << INAT_PFX_BITS) - 1) +#define INAT_PFX_MASK (INAT_PFX_MAX << INAT_PFX_OFFS) +/* Escape opcodes */ +#define INAT_ESC_OFFS (INAT_PFX_OFFS + INAT_PFX_BITS) +#define INAT_ESC_BITS 2 +#define INAT_ESC_MAX ((1 << INAT_ESC_BITS) - 1) +#define INAT_ESC_MASK (INAT_ESC_MAX << INAT_ESC_OFFS) +/* Group opcodes (1-16) */ +#define INAT_GRP_OFFS (INAT_ESC_OFFS + INAT_ESC_BITS) +#define INAT_GRP_BITS 5 +#define INAT_GRP_MAX ((1 << INAT_GRP_BITS) - 1) +#define INAT_GRP_MASK (INAT_GRP_MAX << INAT_GRP_OFFS) +/* Immediates */ +#define INAT_IMM_OFFS (INAT_GRP_OFFS + INAT_GRP_BITS) +#define INAT_IMM_BITS 3 +#define INAT_IMM_MASK (((1 << INAT_IMM_BITS) - 1) << INAT_IMM_OFFS) +/* Flags */ +#define INAT_FLAG_OFFS (INAT_IMM_OFFS + INAT_IMM_BITS) +#define INAT_MODRM (1 << (INAT_FLAG_OFFS)) +#define INAT_FORCE64 (1 << (INAT_FLAG_OFFS + 1)) +#define INAT_SCNDIMM (1 << (INAT_FLAG_OFFS + 2)) +#define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3)) +#define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4)) +#define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5)) +#define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6)) +/* Attribute making macros for attribute tables */ +#define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS) +#define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS) +#define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM) +#define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS) + +/* Attribute search APIs */ +extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode); +extern insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, + insn_byte_t last_pfx, + insn_attr_t esc_attr); +extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm, + insn_byte_t last_pfx, + insn_attr_t esc_attr); +extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, + insn_byte_t vex_m, + insn_byte_t vex_pp); + +/* Attribute checking functions */ +static inline int inat_is_legacy_prefix(insn_attr_t attr) +{ + attr &= INAT_PFX_MASK; + return attr && attr <= INAT_LGCPFX_MAX; +} + +static inline int inat_is_address_size_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_ADDRSZ; +} + +static inline int inat_is_operand_size_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_OPNDSZ; +} + +static inline int inat_is_rex_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_REX; +} + +static inline int inat_last_prefix_id(insn_attr_t attr) +{ + if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX) + return 0; + else + return attr & INAT_PFX_MASK; +} + +static inline int inat_is_vex_prefix(insn_attr_t attr) +{ + attr &= INAT_PFX_MASK; + return attr == INAT_PFX_VEX2 || attr == INAT_PFX_VEX3; +} + +static inline int inat_is_vex3_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3; +} + +static inline int inat_is_escape(insn_attr_t attr) +{ + return attr & INAT_ESC_MASK; +} + +static inline int inat_escape_id(insn_attr_t attr) +{ + return (attr & INAT_ESC_MASK) >> INAT_ESC_OFFS; +} + +static inline int inat_is_group(insn_attr_t attr) +{ + return attr & INAT_GRP_MASK; +} + +static inline int inat_group_id(insn_attr_t attr) +{ + return (attr & INAT_GRP_MASK) >> INAT_GRP_OFFS; +} + +static inline int inat_group_common_attribute(insn_attr_t attr) +{ + return attr & ~INAT_GRP_MASK; +} + +static inline int inat_has_immediate(insn_attr_t attr) +{ + return attr & INAT_IMM_MASK; +} + +static inline int inat_immediate_size(insn_attr_t attr) +{ + return (attr & INAT_IMM_MASK) >> INAT_IMM_OFFS; +} + +static inline int inat_has_modrm(insn_attr_t attr) +{ + return attr & INAT_MODRM; +} + +static inline int inat_is_force64(insn_attr_t attr) +{ + return attr & INAT_FORCE64; +} + +static inline int inat_has_second_immediate(insn_attr_t attr) +{ + return attr & INAT_SCNDIMM; +} + +static inline int inat_has_moffset(insn_attr_t attr) +{ + return attr & INAT_MOFFSET; +} + +static inline int inat_has_variant(insn_attr_t attr) +{ + return attr & INAT_VARIANT; +} + +static inline int inat_accept_vex(insn_attr_t attr) +{ + return attr & INAT_VEXOK; +} + +static inline int inat_must_vex(insn_attr_t attr) +{ + return attr & INAT_VEXONLY; +} +#endif diff --git a/arch/x86/include/asm/inat_types.h b/arch/x86/include/asm/inat_types.h new file mode 100644 index 00000000000..cb3c20ce39c --- /dev/null +++ b/arch/x86/include/asm/inat_types.h @@ -0,0 +1,29 @@ +#ifndef _ASM_X86_INAT_TYPES_H +#define _ASM_X86_INAT_TYPES_H +/* + * x86 instruction attributes + * + * Written by Masami Hiramatsu <mhiramat@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ + +/* Instruction attributes */ +typedef unsigned int insn_attr_t; +typedef unsigned char insn_byte_t; +typedef signed int insn_value_t; + +#endif diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h new file mode 100644 index 00000000000..96c2e0ad04c --- /dev/null +++ b/arch/x86/include/asm/insn.h @@ -0,0 +1,184 @@ +#ifndef _ASM_X86_INSN_H +#define _ASM_X86_INSN_H +/* + * x86 instruction analysis + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2009 + */ + +/* insn_attr_t is defined in inat.h */ +#include <asm/inat.h> + +struct insn_field { + union { + insn_value_t value; + insn_byte_t bytes[4]; + }; + /* !0 if we've run insn_get_xxx() for this field */ + unsigned char got; + unsigned char nbytes; +}; + +struct insn { + struct insn_field prefixes; /* + * Prefixes + * prefixes.bytes[3]: last prefix + */ + struct insn_field rex_prefix; /* REX prefix */ + struct insn_field vex_prefix; /* VEX prefix */ + struct insn_field opcode; /* + * opcode.bytes[0]: opcode1 + * opcode.bytes[1]: opcode2 + * opcode.bytes[2]: opcode3 + */ + struct insn_field modrm; + struct insn_field sib; + struct insn_field displacement; + union { + struct insn_field immediate; + struct insn_field moffset1; /* for 64bit MOV */ + struct insn_field immediate1; /* for 64bit imm or off16/32 */ + }; + union { + struct insn_field moffset2; /* for 64bit MOV */ + struct insn_field immediate2; /* for 64bit imm or seg16 */ + }; + + insn_attr_t attr; + unsigned char opnd_bytes; + unsigned char addr_bytes; + unsigned char length; + unsigned char x86_64; + + const insn_byte_t *kaddr; /* kernel address of insn to analyze */ + const insn_byte_t *next_byte; +}; + +#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) +#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) +#define X86_MODRM_RM(modrm) ((modrm) & 0x07) + +#define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6) +#define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3) +#define X86_SIB_BASE(sib) ((sib) & 0x07) + +#define X86_REX_W(rex) ((rex) & 8) +#define X86_REX_R(rex) ((rex) & 4) +#define X86_REX_X(rex) ((rex) & 2) +#define X86_REX_B(rex) ((rex) & 1) + +/* VEX bit flags */ +#define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */ +#define X86_VEX_R(vex) ((vex) & 0x80) /* VEX2/3 Byte1 */ +#define X86_VEX_X(vex) ((vex) & 0x40) /* VEX3 Byte1 */ +#define X86_VEX_B(vex) ((vex) & 0x20) /* VEX3 Byte1 */ +#define X86_VEX_L(vex) ((vex) & 0x04) /* VEX3 Byte2, VEX2 Byte1 */ +/* VEX bit fields */ +#define X86_VEX3_M(vex) ((vex) & 0x1f) /* VEX3 Byte1 */ +#define X86_VEX2_M 1 /* VEX2.M always 1 */ +#define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */ +#define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ +#define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ + +/* The last prefix is needed for two-byte and three-byte opcodes */ +static inline insn_byte_t insn_last_prefix(struct insn *insn) +{ + return insn->prefixes.bytes[3]; +} + +extern void insn_init(struct insn *insn, const void *kaddr, int x86_64); +extern void insn_get_prefixes(struct insn *insn); +extern void insn_get_opcode(struct insn *insn); +extern void insn_get_modrm(struct insn *insn); +extern void insn_get_sib(struct insn *insn); +extern void insn_get_displacement(struct insn *insn); +extern void insn_get_immediate(struct insn *insn); +extern void insn_get_length(struct insn *insn); + +/* Attribute will be determined after getting ModRM (for opcode groups) */ +static inline void insn_get_attribute(struct insn *insn) +{ + insn_get_modrm(insn); +} + +/* Instruction uses RIP-relative addressing */ +extern int insn_rip_relative(struct insn *insn); + +/* Init insn for kernel text */ +static inline void kernel_insn_init(struct insn *insn, const void *kaddr) +{ +#ifdef CONFIG_X86_64 + insn_init(insn, kaddr, 1); +#else /* CONFIG_X86_32 */ + insn_init(insn, kaddr, 0); +#endif +} + +static inline int insn_is_avx(struct insn *insn) +{ + if (!insn->prefixes.got) + insn_get_prefixes(insn); + return (insn->vex_prefix.value != 0); +} + +static inline insn_byte_t insn_vex_m_bits(struct insn *insn) +{ + if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ + return X86_VEX2_M; + else + return X86_VEX3_M(insn->vex_prefix.bytes[1]); +} + +static inline insn_byte_t insn_vex_p_bits(struct insn *insn) +{ + if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ + return X86_VEX_P(insn->vex_prefix.bytes[1]); + else + return X86_VEX_P(insn->vex_prefix.bytes[2]); +} + +/* Offset of each field from kaddr */ +static inline int insn_offset_rex_prefix(struct insn *insn) +{ + return insn->prefixes.nbytes; +} +static inline int insn_offset_vex_prefix(struct insn *insn) +{ + return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes; +} +static inline int insn_offset_opcode(struct insn *insn) +{ + return insn_offset_vex_prefix(insn) + insn->vex_prefix.nbytes; +} +static inline int insn_offset_modrm(struct insn *insn) +{ + return insn_offset_opcode(insn) + insn->opcode.nbytes; +} +static inline int insn_offset_sib(struct insn *insn) +{ + return insn_offset_modrm(insn) + insn->modrm.nbytes; +} +static inline int insn_offset_displacement(struct insn *insn) +{ + return insn_offset_sib(insn) + insn->sib.nbytes; +} +static inline int insn_offset_immediate(struct insn *insn) +{ + return insn_offset_displacement(insn) + insn->displacement.nbytes; +} + +#endif /* _ASM_X86_INSN_H */ diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 85232d32fcb..7c7c16cde1f 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -143,6 +143,8 @@ extern int noioapicreroute; /* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ extern int timer_through_8259; +extern void io_apic_disable_legacy(void); + /* * If we use the IO-APIC for IRQ routing, disable automatic * assignment of PCI IRQ's. @@ -176,6 +178,7 @@ extern int setup_ioapic_entry(int apic, int irq, int polarity, int vector, int pin); extern void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e); +extern void setup_ioapic_ids_from_mpc(void); struct mp_ioapic_gsi{ int gsi_base; @@ -187,12 +190,14 @@ int mp_find_ioapic_pin(int ioapic, int gsi); void __init mp_register_ioapic(int id, u32 address, u32 gsi_base); #else /* !CONFIG_X86_IO_APIC */ + #define io_apic_assign_pci_irqs 0 +#define setup_ioapic_ids_from_mpc x86_init_noop static const int timer_through_8259 = 0; static inline void ioapic_init_mappings(void) { } static inline void ioapic_insert_resources(void) { } - static inline void probe_nr_irqs_gsi(void) { } + #endif #endif /* _ASM_X86_IO_APIC_H */ diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index 0e9fe1d9d97..f35eb45d657 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -26,13 +26,16 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> -int -is_io_mapping_possible(resource_size_t base, unsigned long size); - void * iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); void iounmap_atomic(void *kvaddr, enum km_type type); +int +iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); + +void +iomap_free(resource_size_t base, unsigned long size); + #endif /* _ASM_X86_IOMAP_H */ diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index fd6d21bbee6..345c99cef15 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -1,8 +1,6 @@ #ifndef _ASM_X86_IOMMU_H #define _ASM_X86_IOMMU_H -extern void pci_iommu_shutdown(void); -extern void no_iommu_init(void); extern struct dma_map_ops nommu_dma_ops; extern int force_iommu, no_iommu; extern int iommu_detected; diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index f38481bcd45..ffd700ff5dc 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -34,10 +34,10 @@ static inline int irq_canonicalize(int irq) #ifdef CONFIG_HOTPLUG_CPU #include <linux/cpumask.h> extern void fixup_irqs(void); +extern void irq_force_complete_move(int); #endif extern void (*generic_interrupt_extension)(void); -extern void init_IRQ(void); extern void native_init_IRQ(void); extern bool handle_irq(unsigned irq, struct pt_regs *regs); @@ -47,4 +47,6 @@ extern unsigned int do_IRQ(struct pt_regs *regs); extern DECLARE_BITMAP(used_vectors, NR_VECTORS); extern int vector_used_by_percpu_irq(unsigned int vector); +extern void init_ISA_irqs(void); + #endif /* _ASM_X86_IRQ_H */ diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 125be8b1956..4a5fe914dc5 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -17,6 +17,8 @@ #define __KVM_HAVE_USER_NMI #define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_MSIX +#define __KVM_HAVE_MCE +#define __KVM_HAVE_PIT_STATE2 /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 @@ -236,6 +238,14 @@ struct kvm_pit_state { struct kvm_pit_channel_state channels[3]; }; +#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 + +struct kvm_pit_state2 { + struct kvm_pit_channel_state channels[3]; + __u32 flags; + __u32 reserved[9]; +}; + struct kvm_reinject_control { __u8 pit_reinject; __u8 reserved[31]; diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_emulate.h index b7ed2c42311..b7ed2c42311 100644 --- a/arch/x86/include/asm/kvm_x86_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index eabdc1cfab5..d83892226f7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -14,6 +14,7 @@ #include <linux/types.h> #include <linux/mm.h> #include <linux/mmu_notifier.h> +#include <linux/tracepoint.h> #include <linux/kvm.h> #include <linux/kvm_para.h> @@ -37,12 +38,14 @@ #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 0xFFFFFF0000000000ULL) -#define KVM_GUEST_CR0_MASK \ - (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \ - | X86_CR0_NW | X86_CR0_CD) +#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) +#define KVM_GUEST_CR0_MASK \ + (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) #define KVM_VM_CR0_ALWAYS_ON \ - (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \ - | X86_CR0_MP) + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_GUEST_CR4_MASK \ (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) @@ -51,12 +54,12 @@ #define INVALID_PAGE (~(hpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0) -/* shadow tables are PAE even on non-PAE hosts */ -#define KVM_HPAGE_SHIFT 21 -#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT) -#define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1)) - -#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE) +/* KVM Hugepage definitions for x86 */ +#define KVM_NR_PAGE_SIZES 3 +#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) +#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) +#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) +#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) #define DE_VECTOR 0 #define DB_VECTOR 1 @@ -120,6 +123,10 @@ enum kvm_reg { NR_VCPU_REGS }; +enum kvm_reg_ex { + VCPU_EXREG_PDPTR = NR_VCPU_REGS, +}; + enum { VCPU_SREG_ES, VCPU_SREG_CS, @@ -131,7 +138,7 @@ enum { VCPU_SREG_LDTR, }; -#include <asm/kvm_x86_emulate.h> +#include <asm/kvm_emulate.h> #define KVM_NR_MEM_OBJS 40 @@ -308,7 +315,6 @@ struct kvm_vcpu_arch { struct { gfn_t gfn; /* presumed gfn during guest pte update */ pfn_t pfn; /* pfn corresponding to that gfn */ - int largepage; unsigned long mmu_seq; } update_pte; @@ -334,16 +340,6 @@ struct kvm_vcpu_arch { u8 nr; } interrupt; - struct { - int vm86_active; - u8 save_iopl; - struct kvm_save_segment { - u16 selector; - unsigned long base; - u32 limit; - u32 ar; - } tr, es, ds, fs, gs; - } rmode; int halt_request; /* real mode on Intel only */ int cpuid_nent; @@ -366,13 +362,15 @@ struct kvm_vcpu_arch { u32 pat; int switch_db_regs; - unsigned long host_db[KVM_NR_DB_REGS]; - unsigned long host_dr6; - unsigned long host_dr7; unsigned long db[KVM_NR_DB_REGS]; unsigned long dr6; unsigned long dr7; unsigned long eff_db[KVM_NR_DB_REGS]; + + u64 mcg_cap; + u64 mcg_status; + u64 mcg_ctl; + u64 *mce_banks; }; struct kvm_mem_alias { @@ -409,6 +407,7 @@ struct kvm_arch{ struct page *ept_identity_pagetable; bool ept_identity_pagetable_done; + gpa_t ept_identity_map_addr; unsigned long irq_sources_bitmap; unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; @@ -526,6 +525,9 @@ struct kvm_x86_ops { int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); + bool (*gb_page_enable)(void); + + const struct trace_print_flags *exit_reasons_str; }; extern struct kvm_x86_ops *kvm_x86_ops; @@ -618,6 +620,7 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, u32 error_code); +bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); int kvm_pic_set_irq(void *opaque, int irq, int level); @@ -752,8 +755,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); } -#define MSR_IA32_TIME_STAMP_COUNTER 0x010 - #define TSS_IOPB_BASE_OFFSET 0x66 #define TSS_BASE_SIZE 0x68 #define TSS_IOPB_SIZE (65536 / 8) @@ -795,6 +796,10 @@ asmlinkage void kvm_handle_fault_on_reboot(void); #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_age_hva(struct kvm *kvm, unsigned long hva); +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); +int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); +int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); +int kvm_cpu_get_interrupt(struct kvm_vcpu *v); #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index b8a3305ae09..c584076a47f 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_KVM_PARA_H #define _ASM_X86_KVM_PARA_H +#include <linux/types.h> + /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It * should be used to determine that a VM is running under KVM. */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 5cdd8d100ec..858baa061cf 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -9,7 +9,7 @@ */ #define MCG_BANKCNT_MASK 0xff /* Number of Banks */ -#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */ +#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */ #define MCG_EXT_P (1ULL<<9) /* Extended registers available */ #define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ #define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ @@ -38,6 +38,14 @@ #define MCM_ADDR_MEM 3 /* memory address */ #define MCM_ADDR_GENERIC 7 /* generic */ +#define MCJ_CTX_MASK 3 +#define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK) +#define MCJ_CTX_RANDOM 0 /* inject context: random */ +#define MCJ_CTX_PROCESS 1 /* inject context: process */ +#define MCJ_CTX_IRQ 2 /* inject context: IRQ */ +#define MCJ_NMI_BROADCAST 4 /* do NMI broadcasting */ +#define MCJ_EXCEPTION 8 /* raise as exception */ + /* Fields are zero when not available */ struct mce { __u64 status; @@ -48,8 +56,8 @@ struct mce { __u64 tsc; /* cpu time stamp counter */ __u64 time; /* wall time_t when error was detected */ __u8 cpuvendor; /* cpu vendor as encoded in system.h */ - __u8 pad1; - __u16 pad2; + __u8 inject_flags; /* software inject flags */ + __u16 pad; __u32 cpuid; /* CPUID 1 EAX */ __u8 cs; /* code segment */ __u8 bank; /* machine check bank */ @@ -100,6 +108,8 @@ struct mce_log { #define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9) #define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0) +extern struct atomic_notifier_head x86_mce_decoder_chain; + #ifdef __KERNEL__ #include <linux/percpu.h> @@ -110,16 +120,11 @@ extern int mce_disabled; extern int mce_p5_enabled; #ifdef CONFIG_X86_MCE -void mcheck_init(struct cpuinfo_x86 *c); +int mcheck_init(void); +void mcheck_cpu_init(struct cpuinfo_x86 *c); #else -static inline void mcheck_init(struct cpuinfo_x86 *c) {} -#endif - -#ifdef CONFIG_X86_OLD_MCE -extern int nr_mce_banks; -void amd_mcheck_init(struct cpuinfo_x86 *c); -void intel_p4_mcheck_init(struct cpuinfo_x86 *c); -void intel_p6_mcheck_init(struct cpuinfo_x86 *c); +static inline int mcheck_init(void) { return 0; } +static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} #endif #ifdef CONFIG_X86_ANCIENT_MCE @@ -132,15 +137,18 @@ static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} static inline void enable_p5_mce(void) {} #endif +extern void (*x86_mce_decode_callback)(struct mce *m); + void mce_setup(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, mce_dev); /* - * To support more than 128 would need to escape the predefined - * Linux defined extended banks first. + * Maximum banks number. + * This is the limit of the current register layout on + * Intel CPUs. */ -#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) +#define MAX_NR_BANKS 32 #ifdef CONFIG_X86_MCE_INTEL extern int mce_cmci_disabled; @@ -208,10 +216,12 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); void intel_init_thermal(struct cpuinfo_x86 *c); -#ifdef CONFIG_X86_NEW_MCE void mce_log_therm_throt_event(__u64 status); + +#ifdef CONFIG_X86_THERMAL_VECTOR +extern void mcheck_intel_therm_init(void); #else -static inline void mce_log_therm_throt_event(__u64 status) {} +static inline void mcheck_intel_therm_init(void) { } #endif #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index f923203dc39..4a2d4e0c18d 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -37,12 +37,12 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, if (likely(prev != next)) { /* stop flush ipis for the previous mm */ - cpu_clear(cpu, prev->cpu_vm_mask); + cpumask_clear_cpu(cpu, mm_cpumask(prev)); #ifdef CONFIG_SMP percpu_write(cpu_tlbstate.state, TLBSTATE_OK); percpu_write(cpu_tlbstate.active_mm, next); #endif - cpu_set(cpu, next->cpu_vm_mask); + cpumask_set_cpu(cpu, mm_cpumask(next)); /* Re-load page tables */ load_cr3(next->pgd); @@ -58,7 +58,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, percpu_write(cpu_tlbstate.state, TLBSTATE_OK); BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next); - if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { + if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) { /* We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. We must reload CR3 * to make sure to use no freed page tables. diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index e2a1bb6d71e..61d90b1331c 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -4,6 +4,7 @@ #include <linux/init.h> #include <asm/mpspec_def.h> +#include <asm/x86_init.h> extern int apic_version[MAX_APICS]; extern int pic_mode; @@ -41,9 +42,6 @@ extern int quad_local_to_mp_bus_id [NR_CPUS/4][4]; #endif /* CONFIG_X86_64 */ -extern void early_find_smp_config(void); -extern void early_get_smp_config(void); - #if defined(CONFIG_MCA) || defined(CONFIG_EISA) extern int mp_bus_id_to_type[MAX_MP_BUSSES]; #endif @@ -52,20 +50,55 @@ extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); extern unsigned int boot_cpu_physical_apicid; extern unsigned int max_physical_apicid; -extern int smp_found_config; extern int mpc_default_type; extern unsigned long mp_lapic_addr; -extern void get_smp_config(void); +#ifdef CONFIG_X86_LOCAL_APIC +extern int smp_found_config; +#else +# define smp_found_config 0 +#endif + +static inline void get_smp_config(void) +{ + x86_init.mpparse.get_smp_config(0); +} + +static inline void early_get_smp_config(void) +{ + x86_init.mpparse.get_smp_config(1); +} + +static inline void find_smp_config(void) +{ + x86_init.mpparse.find_smp_config(1); +} + +static inline void early_find_smp_config(void) +{ + x86_init.mpparse.find_smp_config(0); +} #ifdef CONFIG_X86_MPPARSE -extern void find_smp_config(void); extern void early_reserve_e820_mpc_new(void); extern int enable_update_mptable; +extern int default_mpc_apic_id(struct mpc_cpu *m); +extern void default_smp_read_mpc_oem(struct mpc_table *mpc); +# ifdef CONFIG_X86_IO_APIC +extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str); +# else +# define default_mpc_oem_bus_info NULL +# endif +extern void default_find_smp_config(unsigned int reserve); +extern void default_get_smp_config(unsigned int early); #else -static inline void find_smp_config(void) { } static inline void early_reserve_e820_mpc_new(void) { } #define enable_update_mptable 0 +#define default_mpc_apic_id NULL +#define default_smp_read_mpc_oem NULL +#define default_mpc_oem_bus_info NULL +#define default_find_smp_config x86_init_uint_noop +#define default_get_smp_config x86_init_uint_noop #endif void __cpuinit generic_processor_info(int apicid, int version); @@ -130,14 +163,16 @@ typedef struct physid_mask physid_mask_t; #define physids_shift_left(d, s, n) \ bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS) -#define physids_coerce(map) ((map).mask[0]) +static inline unsigned long physids_coerce(physid_mask_t *map) +{ + return map->mask[0]; +} -#define physids_promote(physids) \ - ({ \ - physid_mask_t __physid_mask = PHYSID_MASK_NONE; \ - __physid_mask.mask[0] = physids; \ - __physid_mask; \ - }) +static inline void physids_promote(unsigned long physids, physid_mask_t *map) +{ + physids_clear(*map); + map->mask[0] = physids; +} /* Note: will create very large stack frames if physid_mask_t is big */ #define physid_mask_of_physid(physid) \ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6be7fc254b5..4ffe09b2ad7 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -81,8 +81,15 @@ #define MSR_IA32_MC0_ADDR 0x00000402 #define MSR_IA32_MC0_MISC 0x00000403 +#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) +#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) +#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) +#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) + /* These are consecutive and not in the normal 4er MCE bank block */ #define MSR_IA32_MC0_CTL2 0x00000280 +#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) + #define CMCI_EN (1ULL << 30) #define CMCI_THRESHOLD_MASK 0xffffULL @@ -215,6 +222,10 @@ #define THERM_STATUS_PROCHOT (1 << 0) +#define MSR_THERM2_CTL 0x0000019d + +#define MSR_THERM2_CTL_TM_SELECT (1ULL << 16) + #define MSR_IA32_MISC_ENABLE 0x000001a0 /* MISC_ENABLE bits: architectural */ @@ -374,6 +385,7 @@ /* AMD-V MSRs */ #define MSR_VM_CR 0xc0010114 +#define MSR_VM_IGNNE 0xc0010115 #define MSR_VM_HSAVE_PA 0xc0010117 #endif /* _ASM_X86_MSR_INDEX_H */ diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index a51ada8467d..4365ffdb461 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -121,6 +121,9 @@ extern int mtrr_del_page(int reg, unsigned long base, unsigned long size); extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); extern void mtrr_ap_init(void); extern void mtrr_bp_init(void); +extern void set_mtrr_aps_delayed_init(void); +extern void mtrr_aps_init(void); +extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); # else @@ -161,6 +164,9 @@ static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) #define mtrr_ap_init() do {} while (0) #define mtrr_bp_init() do {} while (0) +#define set_mtrr_aps_delayed_init() do {} while (0) +#define mtrr_aps_init() do {} while (0) +#define mtrr_bp_restore() do {} while (0) # endif #ifdef CONFIG_COMPAT diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index e63cf7d441e..139d4c1a33a 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -40,8 +40,7 @@ extern unsigned int nmi_watchdog; #define NMI_INVALID 3 struct ctl_table; -struct file; -extern int proc_nmi_enabled(struct ctl_table *, int , struct file *, +extern int proc_nmi_enabled(struct ctl_table *, int , void __user *, size_t *, loff_t *); extern int unknown_nmi_panic; diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h index ad2668ee1aa..6d8723a766c 100644 --- a/arch/x86/include/asm/nops.h +++ b/arch/x86/include/asm/nops.h @@ -65,6 +65,8 @@ 6: osp nopl 0x00(%eax,%eax,1) 7: nopl 0x00000000(%eax) 8: nopl 0x00000000(%eax,%eax,1) + Note: All the above are assumed to be a single instruction. + There is kernel code that depends on this. */ #define P6_NOP1 GENERIC_NOP1 #define P6_NOP2 ".byte 0x66,0x90\n" diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 40d6586af25..efb38994859 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -24,22 +24,6 @@ static inline void load_sp0(struct tss_struct *tss, PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread); } -#define ARCH_SETUP pv_init_ops.arch_setup(); -static inline unsigned long get_wallclock(void) -{ - return PVOP_CALL0(unsigned long, pv_time_ops.get_wallclock); -} - -static inline int set_wallclock(unsigned long nowtime) -{ - return PVOP_CALL1(int, pv_time_ops.set_wallclock, nowtime); -} - -static inline void (*choose_time_init(void))(void) -{ - return pv_time_ops.time_init; -} - /* The paravirtualized CPUID instruction. */ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) @@ -245,7 +229,6 @@ static inline unsigned long long paravirt_sched_clock(void) { return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); } -#define calibrate_tsc() (pv_time_ops.get_tsc_khz()) static inline unsigned long long paravirt_read_pmc(int counter) { @@ -363,34 +346,6 @@ static inline void slow_down_io(void) #endif } -#ifdef CONFIG_X86_LOCAL_APIC -static inline void setup_boot_clock(void) -{ - PVOP_VCALL0(pv_apic_ops.setup_boot_clock); -} - -static inline void setup_secondary_clock(void) -{ - PVOP_VCALL0(pv_apic_ops.setup_secondary_clock); -} -#endif - -static inline void paravirt_post_allocator_init(void) -{ - if (pv_init_ops.post_allocator_init) - (*pv_init_ops.post_allocator_init)(); -} - -static inline void paravirt_pagetable_setup_start(pgd_t *base) -{ - (*pv_mmu_ops.pagetable_setup_start)(base); -} - -static inline void paravirt_pagetable_setup_done(pgd_t *base) -{ - (*pv_mmu_ops.pagetable_setup_done)(base); -} - #ifdef CONFIG_SMP static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, unsigned long start_esp) @@ -885,42 +840,22 @@ static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock) static inline unsigned long __raw_local_save_flags(void) { - unsigned long f; - - asm volatile(paravirt_alt(PARAVIRT_CALL) - : "=a"(f) - : paravirt_type(pv_irq_ops.save_fl), - paravirt_clobber(CLBR_EAX) - : "memory", "cc"); - return f; + return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl); } static inline void raw_local_irq_restore(unsigned long f) { - asm volatile(paravirt_alt(PARAVIRT_CALL) - : "=a"(f) - : PV_FLAGS_ARG(f), - paravirt_type(pv_irq_ops.restore_fl), - paravirt_clobber(CLBR_EAX) - : "memory", "cc"); + PVOP_VCALLEE1(pv_irq_ops.restore_fl, f); } static inline void raw_local_irq_disable(void) { - asm volatile(paravirt_alt(PARAVIRT_CALL) - : - : paravirt_type(pv_irq_ops.irq_disable), - paravirt_clobber(CLBR_EAX) - : "memory", "eax", "cc"); + PVOP_VCALLEE0(pv_irq_ops.irq_disable); } static inline void raw_local_irq_enable(void) { - asm volatile(paravirt_alt(PARAVIRT_CALL) - : - : paravirt_type(pv_irq_ops.irq_enable), - paravirt_clobber(CLBR_EAX) - : "memory", "eax", "cc"); + PVOP_VCALLEE0(pv_irq_ops.irq_enable); } static inline unsigned long __raw_local_irq_save(void) @@ -948,6 +883,8 @@ static inline unsigned long __raw_local_irq_save(void) #undef PVOP_VCALL4 #undef PVOP_CALL4 +extern void default_banner(void); + #else /* __ASSEMBLY__ */ #define _PVSITE(ptype, clobbers, ops, word, algn) \ @@ -1088,5 +1025,7 @@ static inline unsigned long __raw_local_irq_save(void) #endif /* CONFIG_X86_32 */ #endif /* __ASSEMBLY__ */ -#endif /* CONFIG_PARAVIRT */ +#else /* CONFIG_PARAVIRT */ +# define default_banner x86_init_noop +#endif /* !CONFIG_PARAVIRT */ #endif /* _ASM_X86_PARAVIRT_H */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 25402d0006e..9357473c8da 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -78,14 +78,6 @@ struct pv_init_ops { */ unsigned (*patch)(u8 type, u16 clobber, void *insnbuf, unsigned long addr, unsigned len); - - /* Basic arch-specific setup */ - void (*arch_setup)(void); - char *(*memory_setup)(void); - void (*post_allocator_init)(void); - - /* Print a banner to identify the environment */ - void (*banner)(void); }; @@ -96,12 +88,6 @@ struct pv_lazy_ops { }; struct pv_time_ops { - void (*time_init)(void); - - /* Set and set time of day */ - unsigned long (*get_wallclock)(void); - int (*set_wallclock)(unsigned long); - unsigned long long (*sched_clock)(void); unsigned long (*get_tsc_khz)(void); }; @@ -203,8 +189,6 @@ struct pv_cpu_ops { }; struct pv_irq_ops { - void (*init_IRQ)(void); - /* * Get/set interrupt state. save_fl and restore_fl are only * expected to use X86_EFLAGS_IF; all other bits @@ -229,9 +213,6 @@ struct pv_irq_ops { struct pv_apic_ops { #ifdef CONFIG_X86_LOCAL_APIC - void (*setup_boot_clock)(void); - void (*setup_secondary_clock)(void); - void (*startup_ipi_hook)(int phys_apicid, unsigned long start_eip, unsigned long start_esp); @@ -239,15 +220,6 @@ struct pv_apic_ops { }; struct pv_mmu_ops { - /* - * Called before/after init_mm pagetable setup. setup_start - * may reset %cr3, and may pre-install parts of the pagetable; - * pagetable setup is expected to preserve any existing - * mapping. - */ - void (*pagetable_setup_start)(pgd_t *pgd_base); - void (*pagetable_setup_done)(pgd_t *pgd_base); - unsigned long (*read_cr2)(void); void (*write_cr2)(unsigned long); @@ -522,10 +494,11 @@ int paravirt_disable_iospace(void); #define EXTRA_CLOBBERS #define VEXTRA_CLOBBERS #else /* CONFIG_X86_64 */ +/* [re]ax isn't an arg, but the return val */ #define PVOP_VCALL_ARGS \ unsigned long __edi = __edi, __esi = __esi, \ - __edx = __edx, __ecx = __ecx -#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax + __edx = __edx, __ecx = __ecx, __eax = __eax +#define PVOP_CALL_ARGS PVOP_VCALL_ARGS #define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) #define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) @@ -537,6 +510,7 @@ int paravirt_disable_iospace(void); "=c" (__ecx) #define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) +/* void functions are still allowed [re]ax for scratch */ #define PVOP_VCALLEE_CLOBBERS "=a" (__eax) #define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS @@ -611,8 +585,8 @@ int paravirt_disable_iospace(void); VEXTRA_CLOBBERS, \ pre, post, ##__VA_ARGS__) -#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ +#define __PVOP_VCALLEESAVE(op, pre, post, ...) \ + ____PVOP_VCALL(op.func, CLBR_RET_REG, \ PVOP_VCALLEE_CLOBBERS, , \ pre, post, ##__VA_ARGS__) diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 7af14e512f9..e2c1668dde7 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -19,4 +19,9 @@ extern int free_memtype(u64 start, u64 end); extern int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flag); +int io_reserve_memtype(resource_size_t start, resource_size_t end, + unsigned long *type); + +void io_free_memtype(resource_size_t start, resource_size_t end); + #endif /* _ASM_X86_PAT_H */ diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 1ff685ca221..ada8c201d51 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -48,7 +48,6 @@ extern unsigned int pcibios_assign_all_busses(void); #else #define pcibios_assign_all_busses() 0 #endif -#define pcibios_scan_all_fns(a, b) 0 extern unsigned long pci_mem_start; #define PCIBIOS_MIN_IO 0x1000 @@ -144,7 +143,11 @@ static inline int __pcibus_to_node(const struct pci_bus *bus) static inline const struct cpumask * cpumask_of_pcibus(const struct pci_bus *bus) { - return cpumask_of_node(__pcibus_to_node(bus)); + int node; + + node = __pcibus_to_node(bus); + return (node == -1) ? cpu_online_mask : + cpumask_of_node(node); } #endif diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 04eacefcfd2..b65a36defeb 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -168,15 +168,6 @@ do { \ /* We can use this directly for local CPU (faster). */ DECLARE_PER_CPU(unsigned long, this_cpu_off); -#ifdef CONFIG_NEED_MULTIPLE_NODES -void *pcpu_lpage_remapped(void *kaddr); -#else -static inline void *pcpu_lpage_remapped(void *kaddr) -{ - return NULL; -} -#endif - #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_event.h index e7b7c938ae2..8d9f8548a87 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_event.h @@ -1,8 +1,8 @@ -#ifndef _ASM_X86_PERF_COUNTER_H -#define _ASM_X86_PERF_COUNTER_H +#ifndef _ASM_X86_PERF_EVENT_H +#define _ASM_X86_PERF_EVENT_H /* - * Performance counter hw details: + * Performance event hw details: */ #define X86_PMC_MAX_GENERIC 8 @@ -28,9 +28,20 @@ */ #define ARCH_PERFMON_EVENT_MASK 0xffff +/* + * filter mask to validate fixed counter events. + * the following filters disqualify for fixed counters: + * - inv + * - edge + * - cnt-mask + * The other filters are supported by fixed counters. + * The any-thread option is supported starting with v3. + */ +#define ARCH_PERFMON_EVENT_FILTER_MASK 0xff840000 + #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) @@ -43,7 +54,7 @@ union cpuid10_eax { struct { unsigned int version_id:8; - unsigned int num_counters:8; + unsigned int num_events:8; unsigned int bit_width:8; unsigned int mask_length:8; } split; @@ -52,7 +63,7 @@ union cpuid10_eax { union cpuid10_edx { struct { - unsigned int num_counters_fixed:4; + unsigned int num_events_fixed:4; unsigned int reserved:28; } split; unsigned int full; @@ -60,7 +71,7 @@ union cpuid10_edx { /* - * Fixed-purpose performance counters: + * Fixed-purpose performance events: */ /* @@ -87,22 +98,22 @@ union cpuid10_edx { /* * We model BTS tracing as another fixed-mode PMC. * - * We choose a value in the middle of the fixed counter range, since lower - * values are used by actual fixed counters and higher values are used + * We choose a value in the middle of the fixed event range, since lower + * values are used by actual fixed events and higher values are used * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. */ #define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) -#ifdef CONFIG_PERF_COUNTERS -extern void init_hw_perf_counters(void); -extern void perf_counters_lapic_init(void); +#ifdef CONFIG_PERF_EVENTS +extern void init_hw_perf_events(void); +extern void perf_events_lapic_init(void); -#define PERF_COUNTER_INDEX_OFFSET 0 +#define PERF_EVENT_INDEX_OFFSET 0 #else -static inline void init_hw_perf_counters(void) { } -static inline void perf_counters_lapic_init(void) { } +static inline void init_hw_perf_events(void) { } +static inline void perf_events_lapic_init(void) { } #endif -#endif /* _ASM_X86_PERF_COUNTER_H */ +#endif /* _ASM_X86_PERF_EVENT_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 4c5b51fdc78..af6fd360ab3 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -56,16 +56,6 @@ extern struct list_head pgd_list; #define pte_update(mm, addr, ptep) do { } while (0) #define pte_update_defer(mm, addr, ptep) do { } while (0) -static inline void __init paravirt_pagetable_setup_start(pgd_t *base) -{ - native_pagetable_setup_start(base); -} - -static inline void __init paravirt_pagetable_setup_done(pgd_t *base) -{ - native_pagetable_setup_done(base); -} - #define pgd_val(x) native_pgd_val(x) #define __pgd(x) native_make_pgd(x) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 54cb697f490..d1f4a760be2 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -277,6 +277,7 @@ static inline pteval_t pte_flags(pte_t pte) typedef struct page *pgtable_t; extern pteval_t __supported_pte_mask; +extern void set_nx(void); extern int nx_enabled; #define pgprot_writecombine pgprot_writecombine @@ -299,8 +300,8 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pte); extern void native_pagetable_setup_start(pgd_t *base); extern void native_pagetable_setup_done(pgd_t *base); #else -static inline void native_pagetable_setup_start(pgd_t *base) {} -static inline void native_pagetable_setup_done(pgd_t *base) {} +#define native_pagetable_setup_start x86_init_pgd_noop +#define native_pagetable_setup_done x86_init_pgd_noop #endif struct seq_file; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index e08ea043e08..6f8ec1c37e0 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -27,8 +27,10 @@ struct mm_struct; #include <linux/cpumask.h> #include <linux/cache.h> #include <linux/threads.h> +#include <linux/math64.h> #include <linux/init.h> +#define HBP_NUM 4 /* * Default implementation of macro that returns current * instruction pointer ("program counter"). @@ -421,6 +423,8 @@ extern unsigned int xstate_size; extern void free_thread_xstate(struct task_struct *); extern struct kmem_cache *task_xstate_cachep; +struct perf_event; + struct thread_struct { /* Cached TLS descriptors: */ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; @@ -442,13 +446,10 @@ struct thread_struct { unsigned long fs; #endif unsigned long gs; - /* Hardware debugging registers: */ - unsigned long debugreg0; - unsigned long debugreg1; - unsigned long debugreg2; - unsigned long debugreg3; - unsigned long debugreg6; - unsigned long debugreg7; + /* Save middle states of ptrace breakpoints */ + struct perf_event *ptrace_bps[HBP_NUM]; + /* Debug status used for traps, single steps, etc... */ + unsigned long debugreg6; /* Fault info: */ unsigned long cr2; unsigned long trap_no; @@ -999,7 +1000,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8)) #define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) -#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */ +extern unsigned long KSTK_ESP(struct task_struct *task); #endif /* CONFIG_X86_64 */ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, @@ -1020,4 +1021,35 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, extern int get_tsc_mode(unsigned long adr); extern int set_tsc_mode(unsigned int val); +extern int amd_get_nb_id(int cpu); + +struct aperfmperf { + u64 aperf, mperf; +}; + +static inline void get_aperfmperf(struct aperfmperf *am) +{ + WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF)); + + rdmsrl(MSR_IA32_APERF, am->aperf); + rdmsrl(MSR_IA32_MPERF, am->mperf); +} + +#define APERFMPERF_SHIFT 10 + +static inline +unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, + struct aperfmperf *new) +{ + u64 aperf = new->aperf - old->aperf; + u64 mperf = new->mperf - old->mperf; + unsigned long ratio = aperf; + + mperf >>= APERFMPERF_SHIFT; + if (mperf) + ratio = div64_u64(aperf, mperf); + + return ratio; +} + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 0f0d908349a..3d11fd0f44c 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -7,6 +7,7 @@ #ifdef __KERNEL__ #include <asm/segment.h> +#include <asm/page_types.h> #endif #ifndef __ASSEMBLY__ @@ -216,6 +217,67 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs) return regs->sp; } +/* Query offset/name of register from its name/offset */ +extern int regs_query_register_offset(const char *name); +extern const char *regs_query_register_name(unsigned int offset); +#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss)) + +/** + * regs_get_register() - get register value from its offset + * @regs: pt_regs from which register value is gotten. + * @offset: offset number of the register. + * + * regs_get_register returns the value of a register. The @offset is the + * offset of the register in struct pt_regs address which specified by @regs. + * If @offset is bigger than MAX_REG_OFFSET, this returns 0. + */ +static inline unsigned long regs_get_register(struct pt_regs *regs, + unsigned int offset) +{ + if (unlikely(offset > MAX_REG_OFFSET)) + return 0; + return *(unsigned long *)((unsigned long)regs + offset); +} + +/** + * regs_within_kernel_stack() - check the address in the stack + * @regs: pt_regs which contains kernel stack pointer. + * @addr: address which is checked. + * + * regs_within_kernel_stack() checks @addr is within the kernel stack page(s). + * If @addr is within the kernel stack, it returns true. If not, returns false. + */ +static inline int regs_within_kernel_stack(struct pt_regs *regs, + unsigned long addr) +{ + return ((addr & ~(THREAD_SIZE - 1)) == + (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1))); +} + +/** + * regs_get_kernel_stack_nth() - get Nth entry of the stack + * @regs: pt_regs which contains kernel stack pointer. + * @n: stack entry number. + * + * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which + * is specified by @regs. If the @n th entry is NOT in the kernel stack, + * this returns 0. + */ +static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, + unsigned int n) +{ + unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs); + addr += n; + if (regs_within_kernel_stack(regs, (unsigned long)addr)) + return *addr; + else + return 0; +} + +/* Get Nth argument at function call */ +extern unsigned long regs_get_argument_nth(struct pt_regs *regs, + unsigned int n); + /* * These are defined as per linux/ptrace.h, which see. */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 4093d1ed6db..18e496c98ff 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -5,43 +5,6 @@ #define COMMAND_LINE_SIZE 2048 -#ifndef __ASSEMBLY__ - -/* - * Any setup quirks to be performed? - */ -struct mpc_cpu; -struct mpc_bus; -struct mpc_oemtable; - -struct x86_quirks { - int (*arch_pre_time_init)(void); - int (*arch_time_init)(void); - int (*arch_pre_intr_init)(void); - int (*arch_intr_init)(void); - int (*arch_trap_init)(void); - char * (*arch_memory_setup)(void); - int (*mach_get_smp_config)(unsigned int early); - int (*mach_find_smp_config)(unsigned int reserve); - - int *mpc_record; - int (*mpc_apic_id)(struct mpc_cpu *m); - void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); - void (*mpc_oem_pci_bus)(struct mpc_bus *m); - void (*smp_read_mpc_oem)(struct mpc_oemtable *oemtable, - unsigned short oemsize); - int (*setup_ioapic_ids)(void); -}; - -extern void x86_quirk_intr_init(void); - -extern void x86_quirk_trap_init(void); - -extern void x86_quirk_pre_time_init(void); -extern void x86_quirk_time_init(void); - -#endif /* __ASSEMBLY__ */ - #ifdef __i386__ #include <linux/pfn.h> @@ -61,6 +24,7 @@ extern void x86_quirk_time_init(void); #ifndef __ASSEMBLY__ #include <asm/bootparam.h> +#include <asm/x86_init.h> /* Interrupt control for vSMPowered x86_64 systems */ #ifdef CONFIG_X86_64 @@ -79,11 +43,16 @@ static inline void visws_early_detect(void) { } static inline int is_visws_box(void) { return 0; } #endif -extern struct x86_quirks *x86_quirks; extern unsigned long saved_video_mode; -#ifndef CONFIG_PARAVIRT -#define paravirt_post_allocator_init() do {} while (0) +extern void reserve_standard_io_resources(void); +extern void i386_reserve_resources(void); +extern void setup_default_timer_irq(void); + +#ifdef CONFIG_X86_MRST +extern void x86_mrst_early_setup(void); +#else +static inline void x86_mrst_early_setup(void) { } #endif #ifndef _SETUP diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 6a84ed166ae..1e796782cd7 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -121,7 +121,6 @@ static inline void arch_send_call_function_single_ipi(int cpu) smp_ops.send_call_func_single_ipi(cpu); } -#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) { smp_ops.send_call_func_ipi(mask); diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index c86f452256d..3d3e8353ee5 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h @@ -65,7 +65,6 @@ static __always_inline void *__constant_memcpy(void *to, const void *from, case 4: *(int *)to = *(int *)from; return to; - case 3: *(short *)to = *(short *)from; *((char *)to + 2) = *((char *)from + 2); @@ -178,10 +177,15 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len) */ #ifndef CONFIG_KMEMCHECK + +#if (__GNUC__ >= 4) +#define memcpy(t, f, n) __builtin_memcpy(t, f, n) +#else #define memcpy(t, f, n) \ (__builtin_constant_p((n)) \ ? __constant_memcpy((t), (f), (n)) \ : __memcpy((t), (f), (n))) +#endif #else /* * kmemcheck becomes very happy if we use the REP instructions unconditionally, @@ -317,11 +321,15 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern, : __memset_generic((s), (c), (count))) #define __HAVE_ARCH_MEMSET +#if (__GNUC__ >= 4) +#define memset(s, c, count) __builtin_memset(s, c, count) +#else #define memset(s, c, count) \ (__builtin_constant_p(c) \ ? __constant_c_x_memset((s), (0x01010101UL * (unsigned char)(c)), \ (count)) \ : __memset((s), (c), (count))) +#endif /* * find the first occurrence of byte 'c', or 1 past the area if none diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h index b9e4e20174f..87ffcb12a1b 100644 --- a/arch/x86/include/asm/swiotlb.h +++ b/arch/x86/include/asm/swiotlb.h @@ -3,17 +3,14 @@ #include <linux/swiotlb.h> -/* SWIOTLB interface */ - -extern int swiotlb_force; - #ifdef CONFIG_SWIOTLB extern int swiotlb; -extern void pci_swiotlb_init(void); +extern int pci_swiotlb_init(void); #else #define swiotlb 0 -static inline void pci_swiotlb_init(void) +static inline int pci_swiotlb_init(void) { + return 0; } #endif diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index d82f39bb790..8d33bc5462d 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -1,7 +1,7 @@ /* * Access to user system call parameters and results * - * Copyright (C) 2008 Red Hat, Inc. All rights reserved. + * Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -16,13 +16,13 @@ #include <linux/sched.h> #include <linux/err.h> -static inline long syscall_get_nr(struct task_struct *task, - struct pt_regs *regs) +/* + * Only the low 32 bits of orig_ax are meaningful, so we return int. + * This importantly ignores the high bits on 64-bit, so comparisons + * sign-extend the low 32 bits. + */ +static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) { - /* - * We always sign-extend a -1 value being set here, - * so this is always either -1L or a syscall number. - */ return regs->orig_ax; } diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index f08f9737489..022a84386de 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -128,8 +128,6 @@ do { \ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ "call __switch_to\n\t" \ - ".globl thread_return\n" \ - "thread_return:\n\t" \ "movq "__percpu_arg([current_task])",%%rsi\n\t" \ __switch_canary \ "movq %P[thread_info](%%rsi),%%r8\n\t" \ @@ -157,19 +155,22 @@ extern void native_load_gs_index(unsigned); * Load a segment. Fall back on loading the zero * segment if something goes wrong.. */ -#define loadsegment(seg, value) \ - asm volatile("\n" \ - "1:\t" \ - "movl %k0,%%" #seg "\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3:\t" \ - "movl %k1, %%" #seg "\n\t" \ - "jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b,3b) \ - : :"r" (value), "r" (0) : "memory") - +#define loadsegment(seg, value) \ +do { \ + unsigned short __val = (value); \ + \ + asm volatile(" \n" \ + "1: movl %k0,%%" #seg " \n" \ + \ + ".section .fixup,\"ax\" \n" \ + "2: xorl %k0,%k0 \n" \ + " jmp 1b \n" \ + ".previous \n" \ + \ + _ASM_EXTABLE(1b, 2b) \ + \ + : "+r" (__val) : : "memory"); \ +} while (0) /* * Save a segment register away diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h index 50c733aac42..7bdec4e9b73 100644 --- a/arch/x86/include/asm/time.h +++ b/arch/x86/include/asm/time.h @@ -4,60 +4,7 @@ extern void hpet_time_init(void); #include <asm/mc146818rtc.h> -#ifdef CONFIG_X86_32 -#include <linux/efi.h> - -static inline unsigned long native_get_wallclock(void) -{ - unsigned long retval; - - if (efi_enabled) - retval = efi_get_time(); - else - retval = mach_get_cmos_time(); - - return retval; -} - -static inline int native_set_wallclock(unsigned long nowtime) -{ - int retval; - - if (efi_enabled) - retval = efi_set_rtc_mmss(nowtime); - else - retval = mach_set_rtc_mmss(nowtime); - - return retval; -} - -#else -extern void native_time_init_hook(void); - -static inline unsigned long native_get_wallclock(void) -{ - return mach_get_cmos_time(); -} - -static inline int native_set_wallclock(unsigned long nowtime) -{ - return mach_set_rtc_mmss(nowtime); -} - -#endif extern void time_init(void); -#ifdef CONFIG_PARAVIRT -#include <asm/paravirt.h> -#else /* !CONFIG_PARAVIRT */ - -#define get_wallclock() native_get_wallclock() -#define set_wallclock(x) native_set_wallclock(x) -#define choose_time_init() hpet_time_init - -#endif /* CONFIG_PARAVIRT */ - -extern unsigned long __init calibrate_cpu(void); - #endif /* _ASM_X86_TIME_H */ diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 20ca9c4d468..5469630b27f 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -8,20 +8,16 @@ #define TICK_SIZE (tick_nsec / 1000) unsigned long long native_sched_clock(void); -unsigned long native_calibrate_tsc(void); +extern int recalibrate_cpu_khz(void); -#ifdef CONFIG_X86_32 +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) extern int timer_ack; -extern irqreturn_t timer_interrupt(int irq, void *dev_id); -#endif /* CONFIG_X86_32 */ -extern int recalibrate_cpu_khz(void); +#else +# define timer_ack (0) +#endif extern int no_timer_check; -#ifndef CONFIG_PARAVIRT -#define calibrate_tsc() native_calibrate_tsc() -#endif - /* Accelerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) * basic equation: diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 26d06e052a1..40e37b10c6c 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -116,15 +116,11 @@ extern unsigned long node_remap_size[]; # define SD_CACHE_NICE_TRIES 1 # define SD_IDLE_IDX 1 -# define SD_NEWIDLE_IDX 2 -# define SD_FORKEXEC_IDX 0 #else # define SD_CACHE_NICE_TRIES 2 # define SD_IDLE_IDX 2 -# define SD_NEWIDLE_IDX 2 -# define SD_FORKEXEC_IDX 1 #endif @@ -137,22 +133,21 @@ extern unsigned long node_remap_size[]; .cache_nice_tries = SD_CACHE_NICE_TRIES, \ .busy_idx = 3, \ .idle_idx = SD_IDLE_IDX, \ - .newidle_idx = SD_NEWIDLE_IDX, \ - .wake_idx = 1, \ - .forkexec_idx = SD_FORKEXEC_IDX, \ + .newidle_idx = 0, \ + .wake_idx = 0, \ + .forkexec_idx = 0, \ \ .flags = 1*SD_LOAD_BALANCE \ | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ - | 0*SD_WAKE_IDLE \ + | 0*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ - | 1*SD_WAKE_BALANCE \ + | 0*SD_PREFER_LOCAL \ | 0*SD_SHARE_CPUPOWER \ | 0*SD_POWERSAVINGS_BALANCE \ | 0*SD_SHARE_PKG_RESOURCES \ | 1*SD_SERIALIZE \ - | 1*SD_WAKE_IDLE_FAR \ | 0*SD_PREFER_SIBLING \ , \ .last_balance = jiffies, \ @@ -171,21 +166,11 @@ static inline int numa_node_id(void) return 0; } -static inline int cpu_to_node(int cpu) -{ - return 0; -} - static inline int early_cpu_to_node(int cpu) { return 0; } -static inline const struct cpumask *cpumask_of_node(int node) -{ - return cpu_online_mask; -} - static inline void setup_node_to_cpumask_map(void) { } #endif diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 38ae163cc91..c0427295e8f 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -48,7 +48,8 @@ static __always_inline cycles_t vget_cycles(void) extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); -int check_tsc_unstable(void); +extern int check_tsc_unstable(void); +extern unsigned long native_calibrate_tsc(void); /* * Boot-time check whether the TSCs are synchronized across diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index d2c6c930b49..abd3e0ea762 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -570,7 +570,6 @@ extern struct movsl_mask { #ifdef CONFIG_X86_32 # include "uaccess_32.h" #else -# define ARCH_HAS_SEARCH_EXTABLE # include "uaccess_64.h" #endif diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 5e06259e90e..0c9825e97f3 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -33,7 +33,7 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero * Copy data from kernel space to user space. Caller must check * the specified block with access_ok() before calling this function. * The caller should also make sure he pins the user space address - * so that the we don't result in page fault and sleep. + * so that we don't result in page fault and sleep. * * Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault * we return the initial request size (1, 2 or 4), as copy_*_user should do. @@ -187,9 +187,34 @@ __copy_from_user_inatomic_nocache(void *to, const void __user *from, unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n); -unsigned long __must_check copy_from_user(void *to, +unsigned long __must_check _copy_from_user(void *to, const void __user *from, unsigned long n); + + +extern void copy_from_user_overflow(void) +#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS + __compiletime_error("copy_from_user() buffer size is not provably correct") +#else + __compiletime_warning("copy_from_user() buffer size is not provably correct") +#endif +; + +static inline unsigned long __must_check copy_from_user(void *to, + const void __user *from, + unsigned long n) +{ + int sz = __compiletime_object_size(to); + int ret = -EFAULT; + + if (likely(sz == -1 || sz >= n)) + ret = _copy_from_user(to, from, n); + else + copy_from_user_overflow(); + + return ret; +} + long __must_check strncpy_from_user(char *dst, const char __user *src, long count); long __must_check __strncpy_from_user(char *dst, diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index db24b215fc5..46324c6a4f6 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -19,12 +19,37 @@ __must_check unsigned long copy_user_generic(void *to, const void *from, unsigned len); __must_check unsigned long -copy_to_user(void __user *to, const void *from, unsigned len); +_copy_to_user(void __user *to, const void *from, unsigned len); __must_check unsigned long -copy_from_user(void *to, const void __user *from, unsigned len); +_copy_from_user(void *to, const void __user *from, unsigned len); __must_check unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len); +static inline unsigned long __must_check copy_from_user(void *to, + const void __user *from, + unsigned long n) +{ + int sz = __compiletime_object_size(to); + int ret = -EFAULT; + + might_fault(); + if (likely(sz == -1 || sz >= n)) + ret = _copy_from_user(to, from, n); +#ifdef CONFIG_DEBUG_VM + else + WARN(1, "Buffer overflow detected!\n"); +#endif + return ret; +} + +static __always_inline __must_check +int copy_to_user(void __user *dst, const void *src, unsigned size) +{ + might_fault(); + + return _copy_to_user(dst, src, size); +} + static __always_inline __must_check int __copy_from_user(void *dst, const void __user *src, unsigned size) { @@ -176,8 +201,11 @@ __must_check long strlen_user(const char __user *str); __must_check unsigned long clear_user(void __user *mem, unsigned long len); __must_check unsigned long __clear_user(void __user *mem, unsigned long len); -__must_check long __copy_from_user_inatomic(void *dst, const void __user *src, - unsigned size); +static __must_check __always_inline int +__copy_from_user_inatomic(void *dst, const void __user *src, unsigned size) +{ + return copy_user_generic(dst, (__force const void *)src, size); +} static __must_check __always_inline int __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size) diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 8deaada61bc..6fb3c209a7e 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -341,7 +341,7 @@ #define __NR_preadv 333 #define __NR_pwritev 334 #define __NR_rt_tgsigqueueinfo 335 -#define __NR_perf_counter_open 336 +#define __NR_perf_event_open 336 #ifdef __KERNEL__ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index b9f3c60de5f..8d3ad0adbc6 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -659,8 +659,8 @@ __SYSCALL(__NR_preadv, sys_preadv) __SYSCALL(__NR_pwritev, sys_pwritev) #define __NR_rt_tgsigqueueinfo 297 __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) -#define __NR_perf_counter_open 298 -__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open) +#define __NR_perf_event_open 298 +__SYSCALL(__NR_perf_event_open, sys_perf_event_open) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 77a68505419..d1414af9855 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -15,9 +15,12 @@ #include <linux/numa.h> #include <linux/percpu.h> #include <linux/timer.h> +#include <linux/io.h> #include <asm/types.h> #include <asm/percpu.h> #include <asm/uv/uv_mmrs.h> +#include <asm/irq_vectors.h> +#include <asm/io_apic.h> /* @@ -113,7 +116,7 @@ /* * The largest possible NASID of a C or M brick (+ 2) */ -#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_NODES * 2) +#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_BLADES * 2) struct uv_scir_s { struct timer_list timer; @@ -229,6 +232,20 @@ static inline unsigned long uv_gpa(void *v) return uv_soc_phys_ram_to_gpa(__pa(v)); } +/* gnode -> pnode */ +static inline unsigned long uv_gpa_to_gnode(unsigned long gpa) +{ + return gpa >> uv_hub_info->m_val; +} + +/* gpa -> pnode */ +static inline int uv_gpa_to_pnode(unsigned long gpa) +{ + unsigned long n_mask = (1UL << uv_hub_info->n_val) - 1; + + return uv_gpa_to_gnode(gpa) & n_mask; +} + /* pnode, offset --> socket virtual */ static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset) { @@ -258,13 +275,13 @@ static inline unsigned long *uv_global_mmr32_address(int pnode, static inline void uv_write_global_mmr32(int pnode, unsigned long offset, unsigned long val) { - *uv_global_mmr32_address(pnode, offset) = val; + writeq(val, uv_global_mmr32_address(pnode, offset)); } static inline unsigned long uv_read_global_mmr32(int pnode, unsigned long offset) { - return *uv_global_mmr32_address(pnode, offset); + return readq(uv_global_mmr32_address(pnode, offset)); } /* @@ -281,13 +298,13 @@ static inline unsigned long *uv_global_mmr64_address(int pnode, static inline void uv_write_global_mmr64(int pnode, unsigned long offset, unsigned long val) { - *uv_global_mmr64_address(pnode, offset) = val; + writeq(val, uv_global_mmr64_address(pnode, offset)); } static inline unsigned long uv_read_global_mmr64(int pnode, unsigned long offset) { - return *uv_global_mmr64_address(pnode, offset); + return readq(uv_global_mmr64_address(pnode, offset)); } /* @@ -301,22 +318,22 @@ static inline unsigned long *uv_local_mmr_address(unsigned long offset) static inline unsigned long uv_read_local_mmr(unsigned long offset) { - return *uv_local_mmr_address(offset); + return readq(uv_local_mmr_address(offset)); } static inline void uv_write_local_mmr(unsigned long offset, unsigned long val) { - *uv_local_mmr_address(offset) = val; + writeq(val, uv_local_mmr_address(offset)); } static inline unsigned char uv_read_local_mmr8(unsigned long offset) { - return *((unsigned char *)uv_local_mmr_address(offset)); + return readb(uv_local_mmr_address(offset)); } static inline void uv_write_local_mmr8(unsigned long offset, unsigned char val) { - *((unsigned char *)uv_local_mmr_address(offset)) = val; + writeb(val, uv_local_mmr_address(offset)); } /* @@ -420,9 +437,14 @@ static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) static inline void uv_hub_send_ipi(int pnode, int apicid, int vector) { unsigned long val; + unsigned long dmode = dest_Fixed; + + if (vector == NMI_VECTOR) + dmode = dest_NMI; val = (1UL << UVH_IPI_INT_SEND_SHFT) | - ((apicid & 0x3f) << UVH_IPI_INT_APIC_ID_SHFT) | + ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) | + (dmode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | (vector << UVH_IPI_INT_VECTOR_SHFT); uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } diff --git a/arch/x86/include/asm/uv/uv_irq.h b/arch/x86/include/asm/uv/uv_irq.h index 9613c8c0b64..d6b17c76062 100644 --- a/arch/x86/include/asm/uv/uv_irq.h +++ b/arch/x86/include/asm/uv/uv_irq.h @@ -25,12 +25,14 @@ struct uv_IO_APIC_route_entry { dest : 32; }; -extern struct irq_chip uv_irq_chip; - -extern int arch_enable_uv_irq(char *, unsigned int, int, int, unsigned long); -extern void arch_disable_uv_irq(int, unsigned long); +enum { + UV_AFFINITY_ALL, + UV_AFFINITY_NODE, + UV_AFFINITY_CPU +}; -extern int uv_setup_irq(char *, int, int, unsigned long); -extern void uv_teardown_irq(unsigned int, int, unsigned long); +extern int uv_irq_2_mmr_info(int, unsigned long *, int *); +extern int uv_setup_irq(char *, int, int, unsigned long, int); +extern void uv_teardown_irq(unsigned int); #endif /* _ASM_X86_UV_UV_IRQ_H */ diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index dc27a69e5d2..3d61e204826 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -21,6 +21,7 @@ struct vsyscall_gtod_data { u32 shift; } clock; struct timespec wall_to_monotonic; + struct timespec wall_time_coarse; }; extern struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data; diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h index c11b7e100d8..e49ed6d2fd4 100644 --- a/arch/x86/include/asm/vmware.h +++ b/arch/x86/include/asm/vmware.h @@ -20,7 +20,7 @@ #ifndef ASM_X86__VMWARE_H #define ASM_X86__VMWARE_H -extern unsigned long vmware_get_tsc_khz(void); +extern void vmware_platform_setup(void); extern int vmware_platform(void); extern void vmware_set_feature_bits(struct cpuinfo_x86 *c); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 11be5ad2e0e..272514c2d45 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -55,6 +55,7 @@ #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 +#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 #define PIN_BASED_EXT_INTR_MASK 0x00000001 @@ -351,9 +352,16 @@ enum vmcs_field { #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 #define VMX_EPT_EXTENT_CONTEXT 1 #define VMX_EPT_EXTENT_GLOBAL 2 + +#define VMX_EPT_EXECUTE_ONLY_BIT (1ull) +#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6) +#define VMX_EPTP_UC_BIT (1ull << 8) +#define VMX_EPTP_WB_BIT (1ull << 14) +#define VMX_EPT_2MB_PAGE_BIT (1ull << 16) #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) + #define VMX_EPT_DEFAULT_GAW 3 #define VMX_EPT_MAX_GAW 0x4 #define VMX_EPT_MT_EPTE_SHIFT 3 diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h new file mode 100644 index 00000000000..d8e71459f02 --- /dev/null +++ b/arch/x86/include/asm/x86_init.h @@ -0,0 +1,143 @@ +#ifndef _ASM_X86_PLATFORM_H +#define _ASM_X86_PLATFORM_H + +#include <asm/pgtable_types.h> +#include <asm/bootparam.h> + +struct mpc_bus; +struct mpc_cpu; +struct mpc_table; + +/** + * struct x86_init_mpparse - platform specific mpparse ops + * @mpc_record: platform specific mpc record accounting + * @setup_ioapic_ids: platform specific ioapic id override + * @mpc_apic_id: platform specific mpc apic id assignment + * @smp_read_mpc_oem: platform specific oem mpc table setup + * @mpc_oem_pci_bus: platform specific pci bus setup (default NULL) + * @mpc_oem_bus_info: platform specific mpc bus info + * @find_smp_config: find the smp configuration + * @get_smp_config: get the smp configuration + */ +struct x86_init_mpparse { + void (*mpc_record)(unsigned int mode); + void (*setup_ioapic_ids)(void); + int (*mpc_apic_id)(struct mpc_cpu *m); + void (*smp_read_mpc_oem)(struct mpc_table *mpc); + void (*mpc_oem_pci_bus)(struct mpc_bus *m); + void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); + void (*find_smp_config)(unsigned int reserve); + void (*get_smp_config)(unsigned int early); +}; + +/** + * struct x86_init_resources - platform specific resource related ops + * @probe_roms: probe BIOS roms + * @reserve_resources: reserve the standard resources for the + * platform + * @memory_setup: platform specific memory setup + * + */ +struct x86_init_resources { + void (*probe_roms)(void); + void (*reserve_resources)(void); + char *(*memory_setup)(void); +}; + +/** + * struct x86_init_irqs - platform specific interrupt setup + * @pre_vector_init: init code to run before interrupt vectors + * are set up. + * @intr_init: interrupt init code + * @trap_init: platform specific trap setup + */ +struct x86_init_irqs { + void (*pre_vector_init)(void); + void (*intr_init)(void); + void (*trap_init)(void); +}; + +/** + * struct x86_init_oem - oem platform specific customizing functions + * @arch_setup: platform specific architecure setup + * @banner: print a platform specific banner + */ +struct x86_init_oem { + void (*arch_setup)(void); + void (*banner)(void); +}; + +/** + * struct x86_init_paging - platform specific paging functions + * @pagetable_setup_start: platform specific pre paging_init() call + * @pagetable_setup_done: platform specific post paging_init() call + */ +struct x86_init_paging { + void (*pagetable_setup_start)(pgd_t *base); + void (*pagetable_setup_done)(pgd_t *base); +}; + +/** + * struct x86_init_timers - platform specific timer setup + * @setup_perpcu_clockev: set up the per cpu clock event device for the + * boot cpu + * @tsc_pre_init: platform function called before TSC init + * @timer_init: initialize the platform timer (default PIT/HPET) + */ +struct x86_init_timers { + void (*setup_percpu_clockev)(void); + void (*tsc_pre_init)(void); + void (*timer_init)(void); +}; + +/** + * struct x86_init_iommu - platform specific iommu setup + * @iommu_init: platform specific iommu setup + */ +struct x86_init_iommu { + int (*iommu_init)(void); +}; + +/** + * struct x86_init_ops - functions for platform specific setup + * + */ +struct x86_init_ops { + struct x86_init_resources resources; + struct x86_init_mpparse mpparse; + struct x86_init_irqs irqs; + struct x86_init_oem oem; + struct x86_init_paging paging; + struct x86_init_timers timers; + struct x86_init_iommu iommu; +}; + +/** + * struct x86_cpuinit_ops - platform specific cpu hotplug setups + * @setup_percpu_clockev: set up the per cpu clock event device + */ +struct x86_cpuinit_ops { + void (*setup_percpu_clockev)(void); +}; + +/** + * struct x86_platform_ops - platform specific runtime functions + * @calibrate_tsc: calibrate TSC + * @get_wallclock: get time from HW clock like RTC etc. + * @set_wallclock: set time back to HW clock + */ +struct x86_platform_ops { + unsigned long (*calibrate_tsc)(void); + unsigned long (*get_wallclock)(void); + int (*set_wallclock)(unsigned long nowtime); + void (*iommu_shutdown)(void); +}; + +extern struct x86_init_ops x86_init; +extern struct x86_cpuinit_ops x86_cpuinit; +extern struct x86_platform_ops x86_platform; + +extern void x86_init_noop(void); +extern void x86_init_uint_noop(unsigned int unused); + +#endif diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 430d5b24af7..4f2e66e29ec 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -31,8 +31,8 @@ GCOV_PROFILE_paravirt.o := n obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o -obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o -obj-y += setup.o i8259.o irqinit.o +obj-y += time.o ioport.o ldt.o dumpstack.o +obj-y += setup.o x86_init.o i8259.o irqinit.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o @@ -40,7 +40,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o -obj-y += alternative.o i8253.o pci-nommu.o +obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o @@ -52,9 +52,11 @@ obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o +obj-$(CONFIG_INTEL_TXT) += tboot.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ +obj-$(CONFIG_SFI) += sfi.o obj-y += reboot.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o @@ -104,6 +106,7 @@ obj-$(CONFIG_SCx200) += scx200.o scx200-y += scx200_32.o obj-$(CONFIG_OLPC) += olpc.o +obj-$(CONFIG_X86_MRST) += mrst.o microcode-y := microcode_core.o microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 8c44c232efc..59cdfa4686b 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -48,7 +48,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, * P4, Core and beyond CPUs */ if (c->x86_vendor == X86_VENDOR_INTEL && - (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14))) + (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 14))) flags->bm_control = 0; } EXPORT_SYMBOL(acpi_processor_power_init_bm_check); diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index d296f4a195c..d85d1b2432b 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -79,7 +79,8 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr) struct cpuinfo_x86 *c = &cpu_data(pr->id); pr->pdc = NULL; - if (c->x86_vendor == X86_VENDOR_INTEL) + if (c->x86_vendor == X86_VENDOR_INTEL || + c->x86_vendor == X86_VENDOR_CENTAUR) init_intel_pdc(pr, c); return; diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S index 7da00b799cd..060fff8f5c5 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S @@ -57,5 +57,8 @@ SECTIONS *(.note*) } + /* + * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: + */ . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!"); } diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 98f230f6a28..32fb09102a1 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * @@ -28,6 +28,7 @@ #include <asm/proto.h> #include <asm/iommu.h> #include <asm/gart.h> +#include <asm/amd_iommu_proto.h> #include <asm/amd_iommu_types.h> #include <asm/amd_iommu.h> @@ -56,20 +57,115 @@ struct iommu_cmd { u32 data[4]; }; -static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, - struct unity_map_entry *e); -static struct dma_ops_domain *find_protection_domain(u16 devid); -static u64 *alloc_pte(struct protection_domain *domain, - unsigned long address, int end_lvl, - u64 **pte_page, gfp_t gfp); -static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, - unsigned long start_page, - unsigned int pages); static void reset_iommu_command_buffer(struct amd_iommu *iommu); -static u64 *fetch_pte(struct protection_domain *domain, - unsigned long address, int map_size); static void update_domain(struct protection_domain *domain); +/**************************************************************************** + * + * Helper functions + * + ****************************************************************************/ + +static inline u16 get_device_id(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return calc_devid(pdev->bus->number, pdev->devfn); +} + +static struct iommu_dev_data *get_dev_data(struct device *dev) +{ + return dev->archdata.iommu; +} + +/* + * In this function the list of preallocated protection domains is traversed to + * find the domain for a specific device + */ +static struct dma_ops_domain *find_protection_domain(u16 devid) +{ + struct dma_ops_domain *entry, *ret = NULL; + unsigned long flags; + u16 alias = amd_iommu_alias_table[devid]; + + if (list_empty(&iommu_pd_list)) + return NULL; + + spin_lock_irqsave(&iommu_pd_list_lock, flags); + + list_for_each_entry(entry, &iommu_pd_list, list) { + if (entry->target_dev == devid || + entry->target_dev == alias) { + ret = entry; + break; + } + } + + spin_unlock_irqrestore(&iommu_pd_list_lock, flags); + + return ret; +} + +/* + * This function checks if the driver got a valid device from the caller to + * avoid dereferencing invalid pointers. + */ +static bool check_device(struct device *dev) +{ + u16 devid; + + if (!dev || !dev->dma_mask) + return false; + + /* No device or no PCI device */ + if (!dev || dev->bus != &pci_bus_type) + return false; + + devid = get_device_id(dev); + + /* Out of our scope? */ + if (devid > amd_iommu_last_bdf) + return false; + + if (amd_iommu_rlookup_table[devid] == NULL) + return false; + + return true; +} + +static int iommu_init_device(struct device *dev) +{ + struct iommu_dev_data *dev_data; + struct pci_dev *pdev; + u16 devid, alias; + + if (dev->archdata.iommu) + return 0; + + dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL); + if (!dev_data) + return -ENOMEM; + + dev_data->dev = dev; + + devid = get_device_id(dev); + alias = amd_iommu_alias_table[devid]; + pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff); + if (pdev) + dev_data->alias = &pdev->dev; + + atomic_set(&dev_data->bind, 0); + + dev->archdata.iommu = dev_data; + + + return 0; +} + +static void iommu_uninit_device(struct device *dev) +{ + kfree(dev->archdata.iommu); +} #ifdef CONFIG_AMD_IOMMU_STATS /* @@ -90,7 +186,6 @@ DECLARE_STATS_COUNTER(alloced_io_mem); DECLARE_STATS_COUNTER(total_map_requests); static struct dentry *stats_dir; -static struct dentry *de_isolate; static struct dentry *de_fflush; static void amd_iommu_stats_add(struct __iommu_counter *cnt) @@ -108,9 +203,6 @@ static void amd_iommu_stats_init(void) if (stats_dir == NULL) return; - de_isolate = debugfs_create_bool("isolation", 0444, stats_dir, - (u32 *)&amd_iommu_isolate); - de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir, (u32 *)&amd_iommu_unmap_flush); @@ -130,12 +222,6 @@ static void amd_iommu_stats_init(void) #endif -/* returns !0 if the IOMMU is caching non-present entries in its TLB */ -static int iommu_has_npcache(struct amd_iommu *iommu) -{ - return iommu->cap & (1UL << IOMMU_CAP_NPCACHE); -} - /**************************************************************************** * * Interrupt handling functions @@ -199,6 +285,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt) break; case EVENT_TYPE_ILL_CMD: printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); + iommu->reset_in_progress = true; reset_iommu_command_buffer(iommu); dump_command(address); break; @@ -321,11 +408,8 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu) status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); - if (unlikely(i == EXIT_LOOP_COUNT)) { - spin_unlock(&iommu->lock); - reset_iommu_command_buffer(iommu); - spin_lock(&iommu->lock); - } + if (unlikely(i == EXIT_LOOP_COUNT)) + iommu->reset_in_progress = true; } /* @@ -372,26 +456,46 @@ static int iommu_completion_wait(struct amd_iommu *iommu) out: spin_unlock_irqrestore(&iommu->lock, flags); + if (iommu->reset_in_progress) + reset_iommu_command_buffer(iommu); + return 0; } +static void iommu_flush_complete(struct protection_domain *domain) +{ + int i; + + for (i = 0; i < amd_iommus_present; ++i) { + if (!domain->dev_iommu[i]) + continue; + + /* + * Devices of this domain are behind this IOMMU + * We need to wait for completion of all commands. + */ + iommu_completion_wait(amd_iommus[i]); + } +} + /* * Command send function for invalidating a device table entry */ -static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) +static int iommu_flush_device(struct device *dev) { + struct amd_iommu *iommu; struct iommu_cmd cmd; - int ret; + u16 devid; - BUG_ON(iommu == NULL); + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; + /* Build command */ memset(&cmd, 0, sizeof(cmd)); CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); cmd.data[0] = devid; - ret = iommu_queue_command(iommu, &cmd); - - return ret; + return iommu_queue_command(iommu, &cmd); } static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, @@ -430,11 +534,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, * It invalidates a single PTE if the range to flush is within a single * page. Otherwise it flushes the whole TLB of the IOMMU. */ -static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, - u64 address, size_t size) +static void __iommu_flush_pages(struct protection_domain *domain, + u64 address, size_t size, int pde) { - int s = 0; - unsigned pages = iommu_num_pages(address, size, PAGE_SIZE); + int s = 0, i; + unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE); address &= PAGE_MASK; @@ -447,142 +551,212 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, s = 1; } - iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s); - return 0; + for (i = 0; i < amd_iommus_present; ++i) { + if (!domain->dev_iommu[i]) + continue; + + /* + * Devices of this domain are behind this IOMMU + * We need a TLB flush + */ + iommu_queue_inv_iommu_pages(amd_iommus[i], address, + domain->id, pde, s); + } + + return; } -/* Flush the whole IO/TLB for a given protection domain */ -static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) +static void iommu_flush_pages(struct protection_domain *domain, + u64 address, size_t size) { - u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; - - INC_STATS_COUNTER(domain_flush_single); + __iommu_flush_pages(domain, address, size, 0); +} - iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); +/* Flush the whole IO/TLB for a given protection domain */ +static void iommu_flush_tlb(struct protection_domain *domain) +{ + __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0); } /* Flush the whole IO/TLB for a given protection domain - including PDE */ -static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) +static void iommu_flush_tlb_pde(struct protection_domain *domain) { - u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; - - INC_STATS_COUNTER(domain_flush_single); - - iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1); + __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); } + /* - * This function flushes one domain on one IOMMU + * This function flushes the DTEs for all devices in domain */ -static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid) +static void iommu_flush_domain_devices(struct protection_domain *domain) { - struct iommu_cmd cmd; + struct iommu_dev_data *dev_data; unsigned long flags; - __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, - domid, 1, 1); + spin_lock_irqsave(&domain->lock, flags); - spin_lock_irqsave(&iommu->lock, flags); - __iommu_queue_command(iommu, &cmd); - __iommu_completion_wait(iommu); - __iommu_wait_for_completion(iommu); - spin_unlock_irqrestore(&iommu->lock, flags); + list_for_each_entry(dev_data, &domain->dev_list, list) + iommu_flush_device(dev_data->dev); + + spin_unlock_irqrestore(&domain->lock, flags); } -static void flush_all_domains_on_iommu(struct amd_iommu *iommu) +static void iommu_flush_all_domain_devices(void) { - int i; + struct protection_domain *domain; + unsigned long flags; - for (i = 1; i < MAX_DOMAIN_ID; ++i) { - if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) - continue; - flush_domain_on_iommu(iommu, i); + spin_lock_irqsave(&amd_iommu_pd_lock, flags); + + list_for_each_entry(domain, &amd_iommu_pd_list, list) { + iommu_flush_domain_devices(domain); + iommu_flush_complete(domain); } + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); +} + +void amd_iommu_flush_all_devices(void) +{ + iommu_flush_all_domain_devices(); } /* - * This function is used to flush the IO/TLB for a given protection domain - * on every IOMMU in the system + * This function uses heavy locking and may disable irqs for some time. But + * this is no issue because it is only called during resume. */ -static void iommu_flush_domain(u16 domid) +void amd_iommu_flush_all_domains(void) { - struct amd_iommu *iommu; + struct protection_domain *domain; + unsigned long flags; - INC_STATS_COUNTER(domain_flush_all); + spin_lock_irqsave(&amd_iommu_pd_lock, flags); - for_each_iommu(iommu) - flush_domain_on_iommu(iommu, domid); + list_for_each_entry(domain, &amd_iommu_pd_list, list) { + spin_lock(&domain->lock); + iommu_flush_tlb_pde(domain); + iommu_flush_complete(domain); + spin_unlock(&domain->lock); + } + + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); } -void amd_iommu_flush_all_domains(void) +static void reset_iommu_command_buffer(struct amd_iommu *iommu) { - struct amd_iommu *iommu; + pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); - for_each_iommu(iommu) - flush_all_domains_on_iommu(iommu); + if (iommu->reset_in_progress) + panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); + + amd_iommu_reset_cmd_buffer(iommu); + amd_iommu_flush_all_devices(); + amd_iommu_flush_all_domains(); + + iommu->reset_in_progress = false; } -static void flush_all_devices_for_iommu(struct amd_iommu *iommu) +/**************************************************************************** + * + * The functions below are used the create the page table mappings for + * unity mapped regions. + * + ****************************************************************************/ + +/* + * This function is used to add another level to an IO page table. Adding + * another level increases the size of the address space by 9 bits to a size up + * to 64 bits. + */ +static bool increase_address_space(struct protection_domain *domain, + gfp_t gfp) { - int i; + u64 *pte; - for (i = 0; i <= amd_iommu_last_bdf; ++i) { - if (iommu != amd_iommu_rlookup_table[i]) - continue; + if (domain->mode == PAGE_MODE_6_LEVEL) + /* address space already 64 bit large */ + return false; - iommu_queue_inv_dev_entry(iommu, i); - iommu_completion_wait(iommu); - } + pte = (void *)get_zeroed_page(gfp); + if (!pte) + return false; + + *pte = PM_LEVEL_PDE(domain->mode, + virt_to_phys(domain->pt_root)); + domain->pt_root = pte; + domain->mode += 1; + domain->updated = true; + + return true; } -static void flush_devices_by_domain(struct protection_domain *domain) +static u64 *alloc_pte(struct protection_domain *domain, + unsigned long address, + int end_lvl, + u64 **pte_page, + gfp_t gfp) { - struct amd_iommu *iommu; - int i; + u64 *pte, *page; + int level; - for (i = 0; i <= amd_iommu_last_bdf; ++i) { - if ((domain == NULL && amd_iommu_pd_table[i] == NULL) || - (amd_iommu_pd_table[i] != domain)) - continue; + while (address > PM_LEVEL_SIZE(domain->mode)) + increase_address_space(domain, gfp); - iommu = amd_iommu_rlookup_table[i]; - if (!iommu) - continue; + level = domain->mode - 1; + pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; + + while (level > end_lvl) { + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(gfp); + if (!page) + return NULL; + *pte = PM_LEVEL_PDE(level, virt_to_phys(page)); + } - iommu_queue_inv_dev_entry(iommu, i); - iommu_completion_wait(iommu); + level -= 1; + + pte = IOMMU_PTE_PAGE(*pte); + + if (pte_page && level == end_lvl) + *pte_page = pte; + + pte = &pte[PM_LEVEL_INDEX(level, address)]; } + + return pte; } -static void reset_iommu_command_buffer(struct amd_iommu *iommu) +/* + * This function checks if there is a PTE for a given dma address. If + * there is one, it returns the pointer to it. + */ +static u64 *fetch_pte(struct protection_domain *domain, + unsigned long address, int map_size) { - pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); + int level; + u64 *pte; - if (iommu->reset_in_progress) - panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); + level = domain->mode - 1; + pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - iommu->reset_in_progress = true; + while (level > map_size) { + if (!IOMMU_PTE_PRESENT(*pte)) + return NULL; - amd_iommu_reset_cmd_buffer(iommu); - flush_all_devices_for_iommu(iommu); - flush_all_domains_on_iommu(iommu); + level -= 1; - iommu->reset_in_progress = false; -} + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[PM_LEVEL_INDEX(level, address)]; -void amd_iommu_flush_all_devices(void) -{ - flush_devices_by_domain(NULL); -} + if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { + pte = NULL; + break; + } + } -/**************************************************************************** - * - * The functions below are used the create the page table mappings for - * unity mapped regions. - * - ****************************************************************************/ + return pte; +} /* * Generic mapping functions. It maps a physical address into a DMA @@ -654,28 +828,6 @@ static int iommu_for_unity_map(struct amd_iommu *iommu, } /* - * Init the unity mappings for a specific IOMMU in the system - * - * Basically iterates over all unity mapping entries and applies them to - * the default domain DMA of that IOMMU if necessary. - */ -static int iommu_init_unity_mappings(struct amd_iommu *iommu) -{ - struct unity_map_entry *entry; - int ret; - - list_for_each_entry(entry, &amd_iommu_unity_map, list) { - if (!iommu_for_unity_map(iommu, entry)) - continue; - ret = dma_ops_unity_map(iommu->default_dom, entry); - if (ret) - return ret; - } - - return 0; -} - -/* * This function actually applies the mapping to the page table of the * dma_ops domain. */ @@ -704,6 +856,28 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, } /* + * Init the unity mappings for a specific IOMMU in the system + * + * Basically iterates over all unity mapping entries and applies them to + * the default domain DMA of that IOMMU if necessary. + */ +static int iommu_init_unity_mappings(struct amd_iommu *iommu) +{ + struct unity_map_entry *entry; + int ret; + + list_for_each_entry(entry, &amd_iommu_unity_map, list) { + if (!iommu_for_unity_map(iommu, entry)) + continue; + ret = dma_ops_unity_map(iommu->default_dom, entry); + if (ret) + return ret; + } + + return 0; +} + +/* * Inits the unity mappings required for a specific device */ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, @@ -740,34 +914,23 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, */ /* - * This function checks if there is a PTE for a given dma address. If - * there is one, it returns the pointer to it. + * Used to reserve address ranges in the aperture (e.g. for exclusion + * ranges. */ -static u64 *fetch_pte(struct protection_domain *domain, - unsigned long address, int map_size) +static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, + unsigned long start_page, + unsigned int pages) { - int level; - u64 *pte; - - level = domain->mode - 1; - pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - - while (level > map_size) { - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; - - level -= 1; + unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT; - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[PM_LEVEL_INDEX(level, address)]; + if (start_page + pages > last_page) + pages = last_page - start_page; - if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { - pte = NULL; - break; - } + for (i = start_page; i < start_page + pages; ++i) { + int index = i / APERTURE_RANGE_PAGES; + int page = i % APERTURE_RANGE_PAGES; + __set_bit(page, dom->aperture[index]->bitmap); } - - return pte; } /* @@ -775,11 +938,11 @@ static u64 *fetch_pte(struct protection_domain *domain, * aperture in case of dma_ops domain allocation or address allocation * failure. */ -static int alloc_new_range(struct amd_iommu *iommu, - struct dma_ops_domain *dma_dom, +static int alloc_new_range(struct dma_ops_domain *dma_dom, bool populate, gfp_t gfp) { int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; + struct amd_iommu *iommu; int i; #ifdef CONFIG_IOMMU_STRESS @@ -819,14 +982,17 @@ static int alloc_new_range(struct amd_iommu *iommu, dma_dom->aperture_size += APERTURE_RANGE_SIZE; /* Intialize the exclusion range if necessary */ - if (iommu->exclusion_start && - iommu->exclusion_start >= dma_dom->aperture[index]->offset && - iommu->exclusion_start < dma_dom->aperture_size) { - unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; - int pages = iommu_num_pages(iommu->exclusion_start, - iommu->exclusion_length, - PAGE_SIZE); - dma_ops_reserve_addresses(dma_dom, startpage, pages); + for_each_iommu(iommu) { + if (iommu->exclusion_start && + iommu->exclusion_start >= dma_dom->aperture[index]->offset + && iommu->exclusion_start < dma_dom->aperture_size) { + unsigned long startpage; + int pages = iommu_num_pages(iommu->exclusion_start, + iommu->exclusion_length, + PAGE_SIZE); + startpage = iommu->exclusion_start >> PAGE_SHIFT; + dma_ops_reserve_addresses(dma_dom, startpage, pages); + } } /* @@ -928,7 +1094,7 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, } if (unlikely(address == -1)) - address = bad_dma_address; + address = DMA_ERROR_CODE; WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); @@ -973,6 +1139,31 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, * ****************************************************************************/ +/* + * This function adds a protection domain to the global protection domain list + */ +static void add_domain_to_list(struct protection_domain *domain) +{ + unsigned long flags; + + spin_lock_irqsave(&amd_iommu_pd_lock, flags); + list_add(&domain->list, &amd_iommu_pd_list); + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); +} + +/* + * This function removes a protection domain to the global + * protection domain list + */ +static void del_domain_from_list(struct protection_domain *domain) +{ + unsigned long flags; + + spin_lock_irqsave(&amd_iommu_pd_lock, flags); + list_del(&domain->list); + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); +} + static u16 domain_id_alloc(void) { unsigned long flags; @@ -1000,26 +1191,6 @@ static void domain_id_free(int id) write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } -/* - * Used to reserve address ranges in the aperture (e.g. for exclusion - * ranges. - */ -static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, - unsigned long start_page, - unsigned int pages) -{ - unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT; - - if (start_page + pages > last_page) - pages = last_page - start_page; - - for (i = start_page; i < start_page + pages; ++i) { - int index = i / APERTURE_RANGE_PAGES; - int page = i % APERTURE_RANGE_PAGES; - __set_bit(page, dom->aperture[index]->bitmap); - } -} - static void free_pagetable(struct protection_domain *domain) { int i, j; @@ -1061,6 +1232,8 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) if (!dom) return; + del_domain_from_list(&dom->domain); + free_pagetable(&dom->domain); for (i = 0; i < APERTURE_MAX_RANGES; ++i) { @@ -1078,7 +1251,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) * It also intializes the page table and the address allocator data * structures required for the dma_ops interface */ -static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) +static struct dma_ops_domain *dma_ops_domain_alloc(void) { struct dma_ops_domain *dma_dom; @@ -1091,6 +1264,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) dma_dom->domain.id = domain_id_alloc(); if (dma_dom->domain.id == 0) goto free_dma_dom; + INIT_LIST_HEAD(&dma_dom->domain.dev_list); dma_dom->domain.mode = PAGE_MODE_2_LEVEL; dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); dma_dom->domain.flags = PD_DMA_OPS_MASK; @@ -1101,7 +1275,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) dma_dom->need_flush = false; dma_dom->target_dev = 0xffff; - if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) + add_domain_to_list(&dma_dom->domain); + + if (alloc_new_range(dma_dom, true, GFP_KERNEL)) goto free_dma_dom; /* @@ -1129,22 +1305,6 @@ static bool dma_ops_domain(struct protection_domain *domain) return domain->flags & PD_DMA_OPS_MASK; } -/* - * Find out the protection domain structure for a given PCI device. This - * will give us the pointer to the page table root for example. - */ -static struct protection_domain *domain_for_device(u16 devid) -{ - struct protection_domain *dom; - unsigned long flags; - - read_lock_irqsave(&amd_iommu_devtable_lock, flags); - dom = amd_iommu_pd_table[devid]; - read_unlock_irqrestore(&amd_iommu_devtable_lock, flags); - - return dom; -} - static void set_dte_entry(u16 devid, struct protection_domain *domain) { u64 pte_root = virt_to_phys(domain->pt_root); @@ -1156,42 +1316,123 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain) amd_iommu_dev_table[devid].data[2] = domain->id; amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); +} + +static void clear_dte_entry(u16 devid) +{ + /* remove entry from the device table seen by the hardware */ + amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; + amd_iommu_dev_table[devid].data[1] = 0; + amd_iommu_dev_table[devid].data[2] = 0; - amd_iommu_pd_table[devid] = domain; + amd_iommu_apply_erratum_63(devid); +} + +static void do_attach(struct device *dev, struct protection_domain *domain) +{ + struct iommu_dev_data *dev_data; + struct amd_iommu *iommu; + u16 devid; + + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; + dev_data = get_dev_data(dev); + + /* Update data structures */ + dev_data->domain = domain; + list_add(&dev_data->list, &domain->dev_list); + set_dte_entry(devid, domain); + + /* Do reference counting */ + domain->dev_iommu[iommu->index] += 1; + domain->dev_cnt += 1; + + /* Flush the DTE entry */ + iommu_flush_device(dev); +} + +static void do_detach(struct device *dev) +{ + struct iommu_dev_data *dev_data; + struct amd_iommu *iommu; + u16 devid; + + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; + dev_data = get_dev_data(dev); + + /* decrease reference counters */ + dev_data->domain->dev_iommu[iommu->index] -= 1; + dev_data->domain->dev_cnt -= 1; + + /* Update data structures */ + dev_data->domain = NULL; + list_del(&dev_data->list); + clear_dte_entry(devid); + + /* Flush the DTE entry */ + iommu_flush_device(dev); } /* * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware */ -static void __attach_device(struct amd_iommu *iommu, - struct protection_domain *domain, - u16 devid) +static int __attach_device(struct device *dev, + struct protection_domain *domain) { + struct iommu_dev_data *dev_data, *alias_data; + + dev_data = get_dev_data(dev); + alias_data = get_dev_data(dev_data->alias); + + if (!alias_data) + return -EINVAL; + /* lock domain */ spin_lock(&domain->lock); - /* update DTE entry */ - set_dte_entry(devid, domain); + /* Some sanity checks */ + if (alias_data->domain != NULL && + alias_data->domain != domain) + return -EBUSY; + + if (dev_data->domain != NULL && + dev_data->domain != domain) + return -EBUSY; - domain->dev_cnt += 1; + /* Do real assignment */ + if (dev_data->alias != dev) { + alias_data = get_dev_data(dev_data->alias); + if (alias_data->domain == NULL) + do_attach(dev_data->alias, domain); + + atomic_inc(&alias_data->bind); + } + + if (dev_data->domain == NULL) + do_attach(dev, domain); + + atomic_inc(&dev_data->bind); /* ready */ spin_unlock(&domain->lock); + + return 0; } /* * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware */ -static void attach_device(struct amd_iommu *iommu, - struct protection_domain *domain, - u16 devid) +static int attach_device(struct device *dev, + struct protection_domain *domain) { unsigned long flags; + int ret; write_lock_irqsave(&amd_iommu_devtable_lock, flags); - __attach_device(iommu, domain, devid); + ret = __attach_device(dev, domain); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); /* @@ -1199,96 +1440,125 @@ static void attach_device(struct amd_iommu *iommu, * left the caches in the IOMMU dirty. So we have to flush * here to evict all dirty stuff. */ - iommu_queue_inv_dev_entry(iommu, devid); - iommu_flush_tlb_pde(iommu, domain->id); + iommu_flush_tlb_pde(domain); + + return ret; } /* * Removes a device from a protection domain (unlocked) */ -static void __detach_device(struct protection_domain *domain, u16 devid) +static void __detach_device(struct device *dev) { + struct iommu_dev_data *dev_data = get_dev_data(dev); + struct iommu_dev_data *alias_data; + unsigned long flags; - /* lock domain */ - spin_lock(&domain->lock); + BUG_ON(!dev_data->domain); - /* remove domain from the lookup table */ - amd_iommu_pd_table[devid] = NULL; + spin_lock_irqsave(&dev_data->domain->lock, flags); - /* remove entry from the device table seen by the hardware */ - amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; - amd_iommu_dev_table[devid].data[1] = 0; - amd_iommu_dev_table[devid].data[2] = 0; + if (dev_data->alias != dev) { + alias_data = get_dev_data(dev_data->alias); + if (atomic_dec_and_test(&alias_data->bind)) + do_detach(dev_data->alias); + } - /* decrease reference counter */ - domain->dev_cnt -= 1; + if (atomic_dec_and_test(&dev_data->bind)) + do_detach(dev); - /* ready */ - spin_unlock(&domain->lock); + spin_unlock_irqrestore(&dev_data->domain->lock, flags); /* * If we run in passthrough mode the device must be assigned to the * passthrough domain if it is detached from any other domain */ - if (iommu_pass_through) { - struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; - __attach_device(iommu, pt_domain, devid); - } + if (iommu_pass_through && dev_data->domain == NULL) + __attach_device(dev, pt_domain); } /* * Removes a device from a protection domain (with devtable_lock held) */ -static void detach_device(struct protection_domain *domain, u16 devid) +static void detach_device(struct device *dev) { unsigned long flags; /* lock device table */ write_lock_irqsave(&amd_iommu_devtable_lock, flags); - __detach_device(domain, devid); + __detach_device(dev); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } +/* + * Find out the protection domain structure for a given PCI device. This + * will give us the pointer to the page table root for example. + */ +static struct protection_domain *domain_for_device(struct device *dev) +{ + struct protection_domain *dom; + struct iommu_dev_data *dev_data, *alias_data; + unsigned long flags; + u16 devid, alias; + + devid = get_device_id(dev); + alias = amd_iommu_alias_table[devid]; + dev_data = get_dev_data(dev); + alias_data = get_dev_data(dev_data->alias); + if (!alias_data) + return NULL; + + read_lock_irqsave(&amd_iommu_devtable_lock, flags); + dom = dev_data->domain; + if (dom == NULL && + alias_data->domain != NULL) { + __attach_device(dev, alias_data->domain); + dom = alias_data->domain; + } + + read_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + + return dom; +} + static int device_change_notifier(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; - struct pci_dev *pdev = to_pci_dev(dev); - u16 devid = calc_devid(pdev->bus->number, pdev->devfn); + u16 devid; struct protection_domain *domain; struct dma_ops_domain *dma_domain; struct amd_iommu *iommu; unsigned long flags; - if (devid > amd_iommu_last_bdf) - goto out; - - devid = amd_iommu_alias_table[devid]; - - iommu = amd_iommu_rlookup_table[devid]; - if (iommu == NULL) - goto out; - - domain = domain_for_device(devid); + if (!check_device(dev)) + return 0; - if (domain && !dma_ops_domain(domain)) - WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound " - "to a non-dma-ops domain\n", dev_name(dev)); + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; switch (action) { case BUS_NOTIFY_UNBOUND_DRIVER: + + domain = domain_for_device(dev); + if (!domain) goto out; if (iommu_pass_through) break; - detach_device(domain, devid); + detach_device(dev); break; case BUS_NOTIFY_ADD_DEVICE: + + iommu_init_device(dev); + + domain = domain_for_device(dev); + /* allocate a protection domain if a device is added */ dma_domain = find_protection_domain(devid); if (dma_domain) goto out; - dma_domain = dma_ops_domain_alloc(iommu); + dma_domain = dma_ops_domain_alloc(); if (!dma_domain) goto out; dma_domain->target_dev = devid; @@ -1298,11 +1568,15 @@ static int device_change_notifier(struct notifier_block *nb, spin_unlock_irqrestore(&iommu_pd_list_lock, flags); break; + case BUS_NOTIFY_DEL_DEVICE: + + iommu_uninit_device(dev); + default: goto out; } - iommu_queue_inv_dev_entry(iommu, devid); + iommu_flush_device(dev); iommu_completion_wait(iommu); out: @@ -1320,106 +1594,46 @@ static struct notifier_block device_nb = { *****************************************************************************/ /* - * This function checks if the driver got a valid device from the caller to - * avoid dereferencing invalid pointers. - */ -static bool check_device(struct device *dev) -{ - if (!dev || !dev->dma_mask) - return false; - - return true; -} - -/* - * In this function the list of preallocated protection domains is traversed to - * find the domain for a specific device - */ -static struct dma_ops_domain *find_protection_domain(u16 devid) -{ - struct dma_ops_domain *entry, *ret = NULL; - unsigned long flags; - - if (list_empty(&iommu_pd_list)) - return NULL; - - spin_lock_irqsave(&iommu_pd_list_lock, flags); - - list_for_each_entry(entry, &iommu_pd_list, list) { - if (entry->target_dev == devid) { - ret = entry; - break; - } - } - - spin_unlock_irqrestore(&iommu_pd_list_lock, flags); - - return ret; -} - -/* * In the dma_ops path we only have the struct device. This function * finds the corresponding IOMMU, the protection domain and the * requestor id for a given device. * If the device is not yet associated with a domain this is also done * in this function. */ -static int get_device_resources(struct device *dev, - struct amd_iommu **iommu, - struct protection_domain **domain, - u16 *bdf) +static struct protection_domain *get_domain(struct device *dev) { + struct protection_domain *domain; struct dma_ops_domain *dma_dom; - struct pci_dev *pcidev; - u16 _bdf; - - *iommu = NULL; - *domain = NULL; - *bdf = 0xffff; - - if (dev->bus != &pci_bus_type) - return 0; + u16 devid = get_device_id(dev); - pcidev = to_pci_dev(dev); - _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); - - /* device not translated by any IOMMU in the system? */ - if (_bdf > amd_iommu_last_bdf) - return 0; + if (!check_device(dev)) + return ERR_PTR(-EINVAL); - *bdf = amd_iommu_alias_table[_bdf]; + domain = domain_for_device(dev); + if (domain != NULL && !dma_ops_domain(domain)) + return ERR_PTR(-EBUSY); - *iommu = amd_iommu_rlookup_table[*bdf]; - if (*iommu == NULL) - return 0; - *domain = domain_for_device(*bdf); - if (*domain == NULL) { - dma_dom = find_protection_domain(*bdf); - if (!dma_dom) - dma_dom = (*iommu)->default_dom; - *domain = &dma_dom->domain; - attach_device(*iommu, *domain, *bdf); - DUMP_printk("Using protection domain %d for device %s\n", - (*domain)->id, dev_name(dev)); - } + if (domain != NULL) + return domain; - if (domain_for_device(_bdf) == NULL) - attach_device(*iommu, *domain, _bdf); + /* Device not bount yet - bind it */ + dma_dom = find_protection_domain(devid); + if (!dma_dom) + dma_dom = amd_iommu_rlookup_table[devid]->default_dom; + attach_device(dev, &dma_dom->domain); + DUMP_printk("Using protection domain %d for device %s\n", + dma_dom->domain.id, dev_name(dev)); - return 1; + return &dma_dom->domain; } static void update_device_table(struct protection_domain *domain) { - unsigned long flags; - int i; + struct iommu_dev_data *dev_data; - for (i = 0; i <= amd_iommu_last_bdf; ++i) { - if (amd_iommu_pd_table[i] != domain) - continue; - write_lock_irqsave(&amd_iommu_devtable_lock, flags); - set_dte_entry(i, domain); - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + list_for_each_entry(dev_data, &domain->dev_list, list) { + u16 devid = get_device_id(dev_data->dev); + set_dte_entry(devid, domain); } } @@ -1429,76 +1643,13 @@ static void update_domain(struct protection_domain *domain) return; update_device_table(domain); - flush_devices_by_domain(domain); - iommu_flush_domain(domain->id); + iommu_flush_domain_devices(domain); + iommu_flush_tlb_pde(domain); domain->updated = false; } /* - * This function is used to add another level to an IO page table. Adding - * another level increases the size of the address space by 9 bits to a size up - * to 64 bits. - */ -static bool increase_address_space(struct protection_domain *domain, - gfp_t gfp) -{ - u64 *pte; - - if (domain->mode == PAGE_MODE_6_LEVEL) - /* address space already 64 bit large */ - return false; - - pte = (void *)get_zeroed_page(gfp); - if (!pte) - return false; - - *pte = PM_LEVEL_PDE(domain->mode, - virt_to_phys(domain->pt_root)); - domain->pt_root = pte; - domain->mode += 1; - domain->updated = true; - - return true; -} - -static u64 *alloc_pte(struct protection_domain *domain, - unsigned long address, - int end_lvl, - u64 **pte_page, - gfp_t gfp) -{ - u64 *pte, *page; - int level; - - while (address > PM_LEVEL_SIZE(domain->mode)) - increase_address_space(domain, gfp); - - level = domain->mode - 1; - pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - - while (level > end_lvl) { - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(gfp); - if (!page) - return NULL; - *pte = PM_LEVEL_PDE(level, virt_to_phys(page)); - } - - level -= 1; - - pte = IOMMU_PTE_PAGE(*pte); - - if (pte_page && level == end_lvl) - *pte_page = pte; - - pte = &pte[PM_LEVEL_INDEX(level, address)]; - } - - return pte; -} - -/* * This function fetches the PTE for a given address in the aperture */ static u64* dma_ops_get_pte(struct dma_ops_domain *dom, @@ -1528,8 +1679,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom, * This is the generic map function. It maps one 4kb page at paddr to * the given address in the DMA address space for the domain. */ -static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, - struct dma_ops_domain *dom, +static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom, unsigned long address, phys_addr_t paddr, int direction) @@ -1542,7 +1692,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, pte = dma_ops_get_pte(dom, address); if (!pte) - return bad_dma_address; + return DMA_ERROR_CODE; __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; @@ -1563,8 +1713,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, /* * The generic unmapping function for on page in the DMA address space. */ -static void dma_ops_domain_unmap(struct amd_iommu *iommu, - struct dma_ops_domain *dom, +static void dma_ops_domain_unmap(struct dma_ops_domain *dom, unsigned long address) { struct aperture_range *aperture; @@ -1595,7 +1744,6 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, * Must be called with the domain lock held. */ static dma_addr_t __map_single(struct device *dev, - struct amd_iommu *iommu, struct dma_ops_domain *dma_dom, phys_addr_t paddr, size_t size, @@ -1623,7 +1771,7 @@ static dma_addr_t __map_single(struct device *dev, retry: address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, dma_mask); - if (unlikely(address == bad_dma_address)) { + if (unlikely(address == DMA_ERROR_CODE)) { /* * setting next_address here will let the address * allocator only scan the new allocated range in the @@ -1631,7 +1779,7 @@ retry: */ dma_dom->next_address = dma_dom->aperture_size; - if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC)) + if (alloc_new_range(dma_dom, false, GFP_ATOMIC)) goto out; /* @@ -1643,8 +1791,8 @@ retry: start = address; for (i = 0; i < pages; ++i) { - ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); - if (ret == bad_dma_address) + ret = dma_ops_domain_map(dma_dom, start, paddr, dir); + if (ret == DMA_ERROR_CODE) goto out_unmap; paddr += PAGE_SIZE; @@ -1655,10 +1803,10 @@ retry: ADD_STATS_COUNTER(alloced_io_mem, size); if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { - iommu_flush_tlb(iommu, dma_dom->domain.id); + iommu_flush_tlb(&dma_dom->domain); dma_dom->need_flush = false; - } else if (unlikely(iommu_has_npcache(iommu))) - iommu_flush_pages(iommu, dma_dom->domain.id, address, size); + } else if (unlikely(amd_iommu_np_cache)) + iommu_flush_pages(&dma_dom->domain, address, size); out: return address; @@ -1667,20 +1815,19 @@ out_unmap: for (--i; i >= 0; --i) { start -= PAGE_SIZE; - dma_ops_domain_unmap(iommu, dma_dom, start); + dma_ops_domain_unmap(dma_dom, start); } dma_ops_free_addresses(dma_dom, address, pages); - return bad_dma_address; + return DMA_ERROR_CODE; } /* * Does the reverse of the __map_single function. Must be called with * the domain lock held too */ -static void __unmap_single(struct amd_iommu *iommu, - struct dma_ops_domain *dma_dom, +static void __unmap_single(struct dma_ops_domain *dma_dom, dma_addr_t dma_addr, size_t size, int dir) @@ -1688,7 +1835,7 @@ static void __unmap_single(struct amd_iommu *iommu, dma_addr_t i, start; unsigned int pages; - if ((dma_addr == bad_dma_address) || + if ((dma_addr == DMA_ERROR_CODE) || (dma_addr + size > dma_dom->aperture_size)) return; @@ -1697,7 +1844,7 @@ static void __unmap_single(struct amd_iommu *iommu, start = dma_addr; for (i = 0; i < pages; ++i) { - dma_ops_domain_unmap(iommu, dma_dom, start); + dma_ops_domain_unmap(dma_dom, start); start += PAGE_SIZE; } @@ -1706,7 +1853,7 @@ static void __unmap_single(struct amd_iommu *iommu, dma_ops_free_addresses(dma_dom, dma_addr, pages); if (amd_iommu_unmap_flush || dma_dom->need_flush) { - iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size); + iommu_flush_pages(&dma_dom->domain, dma_addr, size); dma_dom->need_flush = false; } } @@ -1720,36 +1867,29 @@ static dma_addr_t map_page(struct device *dev, struct page *page, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; - u16 devid; dma_addr_t addr; u64 dma_mask; phys_addr_t paddr = page_to_phys(page) + offset; INC_STATS_COUNTER(cnt_map_single); - if (!check_device(dev)) - return bad_dma_address; - - dma_mask = *dev->dma_mask; - - get_device_resources(dev, &iommu, &domain, &devid); - - if (iommu == NULL || domain == NULL) - /* device not handled by any AMD IOMMU */ + domain = get_domain(dev); + if (PTR_ERR(domain) == -EINVAL) return (dma_addr_t)paddr; + else if (IS_ERR(domain)) + return DMA_ERROR_CODE; - if (!dma_ops_domain(domain)) - return bad_dma_address; + dma_mask = *dev->dma_mask; spin_lock_irqsave(&domain->lock, flags); - addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, + + addr = __map_single(dev, domain->priv, paddr, size, dir, false, dma_mask); - if (addr == bad_dma_address) + if (addr == DMA_ERROR_CODE) goto out; - iommu_completion_wait(iommu); + iommu_flush_complete(domain); out: spin_unlock_irqrestore(&domain->lock, flags); @@ -1764,25 +1904,19 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; - u16 devid; INC_STATS_COUNTER(cnt_unmap_single); - if (!check_device(dev) || - !get_device_resources(dev, &iommu, &domain, &devid)) - /* device not handled by any AMD IOMMU */ - return; - - if (!dma_ops_domain(domain)) + domain = get_domain(dev); + if (IS_ERR(domain)) return; spin_lock_irqsave(&domain->lock, flags); - __unmap_single(iommu, domain->priv, dma_addr, size, dir); + __unmap_single(domain->priv, dma_addr, size, dir); - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } @@ -1814,9 +1948,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; - u16 devid; int i; struct scatterlist *s; phys_addr_t paddr; @@ -1825,25 +1957,20 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, INC_STATS_COUNTER(cnt_map_sg); - if (!check_device(dev)) + domain = get_domain(dev); + if (PTR_ERR(domain) == -EINVAL) + return map_sg_no_iommu(dev, sglist, nelems, dir); + else if (IS_ERR(domain)) return 0; dma_mask = *dev->dma_mask; - get_device_resources(dev, &iommu, &domain, &devid); - - if (!iommu || !domain) - return map_sg_no_iommu(dev, sglist, nelems, dir); - - if (!dma_ops_domain(domain)) - return 0; - spin_lock_irqsave(&domain->lock, flags); for_each_sg(sglist, s, nelems, i) { paddr = sg_phys(s); - s->dma_address = __map_single(dev, iommu, domain->priv, + s->dma_address = __map_single(dev, domain->priv, paddr, s->length, dir, false, dma_mask); @@ -1854,7 +1981,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, goto unmap; } - iommu_completion_wait(iommu); + iommu_flush_complete(domain); out: spin_unlock_irqrestore(&domain->lock, flags); @@ -1863,7 +1990,7 @@ out: unmap: for_each_sg(sglist, s, mapped_elems, i) { if (s->dma_address) - __unmap_single(iommu, domain->priv, s->dma_address, + __unmap_single(domain->priv, s->dma_address, s->dma_length, dir); s->dma_address = s->dma_length = 0; } @@ -1882,30 +2009,25 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; struct scatterlist *s; - u16 devid; int i; INC_STATS_COUNTER(cnt_unmap_sg); - if (!check_device(dev) || - !get_device_resources(dev, &iommu, &domain, &devid)) - return; - - if (!dma_ops_domain(domain)) + domain = get_domain(dev); + if (IS_ERR(domain)) return; spin_lock_irqsave(&domain->lock, flags); for_each_sg(sglist, s, nelems, i) { - __unmap_single(iommu, domain->priv, s->dma_address, + __unmap_single(domain->priv, s->dma_address, s->dma_length, dir); s->dma_address = s->dma_length = 0; } - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } @@ -1918,49 +2040,44 @@ static void *alloc_coherent(struct device *dev, size_t size, { unsigned long flags; void *virt_addr; - struct amd_iommu *iommu; struct protection_domain *domain; - u16 devid; phys_addr_t paddr; u64 dma_mask = dev->coherent_dma_mask; INC_STATS_COUNTER(cnt_alloc_coherent); - if (!check_device(dev)) + domain = get_domain(dev); + if (PTR_ERR(domain) == -EINVAL) { + virt_addr = (void *)__get_free_pages(flag, get_order(size)); + *dma_addr = __pa(virt_addr); + return virt_addr; + } else if (IS_ERR(domain)) return NULL; - if (!get_device_resources(dev, &iommu, &domain, &devid)) - flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + dma_mask = dev->coherent_dma_mask; + flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + flag |= __GFP_ZERO; - flag |= __GFP_ZERO; virt_addr = (void *)__get_free_pages(flag, get_order(size)); if (!virt_addr) return NULL; paddr = virt_to_phys(virt_addr); - if (!iommu || !domain) { - *dma_addr = (dma_addr_t)paddr; - return virt_addr; - } - - if (!dma_ops_domain(domain)) - goto out_free; - if (!dma_mask) dma_mask = *dev->dma_mask; spin_lock_irqsave(&domain->lock, flags); - *dma_addr = __map_single(dev, iommu, domain->priv, paddr, + *dma_addr = __map_single(dev, domain->priv, paddr, size, DMA_BIDIRECTIONAL, true, dma_mask); - if (*dma_addr == bad_dma_address) { + if (*dma_addr == DMA_ERROR_CODE) { spin_unlock_irqrestore(&domain->lock, flags); goto out_free; } - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); @@ -1980,28 +2097,19 @@ static void free_coherent(struct device *dev, size_t size, void *virt_addr, dma_addr_t dma_addr) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; - u16 devid; INC_STATS_COUNTER(cnt_free_coherent); - if (!check_device(dev)) - return; - - get_device_resources(dev, &iommu, &domain, &devid); - - if (!iommu || !domain) - goto free_mem; - - if (!dma_ops_domain(domain)) + domain = get_domain(dev); + if (IS_ERR(domain)) goto free_mem; spin_lock_irqsave(&domain->lock, flags); - __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); + __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); @@ -2015,22 +2123,7 @@ free_mem: */ static int amd_iommu_dma_supported(struct device *dev, u64 mask) { - u16 bdf; - struct pci_dev *pcidev; - - /* No device or no PCI device */ - if (!dev || dev->bus != &pci_bus_type) - return 0; - - pcidev = to_pci_dev(dev); - - bdf = calc_devid(pcidev->bus->number, pcidev->devfn); - - /* Out of our scope? */ - if (bdf > amd_iommu_last_bdf) - return 0; - - return 1; + return check_device(dev); } /* @@ -2044,25 +2137,30 @@ static void prealloc_protection_domains(void) { struct pci_dev *dev = NULL; struct dma_ops_domain *dma_dom; - struct amd_iommu *iommu; u16 devid; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - devid = calc_devid(dev->bus->number, dev->devfn); - if (devid > amd_iommu_last_bdf) - continue; - devid = amd_iommu_alias_table[devid]; - if (domain_for_device(devid)) + + /* Do we handle this device? */ + if (!check_device(&dev->dev)) continue; - iommu = amd_iommu_rlookup_table[devid]; - if (!iommu) + + iommu_init_device(&dev->dev); + + /* Is there already any domain for it? */ + if (domain_for_device(&dev->dev)) continue; - dma_dom = dma_ops_domain_alloc(iommu); + + devid = get_device_id(&dev->dev); + + dma_dom = dma_ops_domain_alloc(); if (!dma_dom) continue; init_unity_mappings_for_device(dma_dom, devid); dma_dom->target_dev = devid; + attach_device(&dev->dev, &dma_dom->domain); + list_add_tail(&dma_dom->list, &iommu_pd_list); } } @@ -2091,7 +2189,7 @@ int __init amd_iommu_init_dma_ops(void) * protection domain will be assigned to the default one. */ for_each_iommu(iommu) { - iommu->default_dom = dma_ops_domain_alloc(iommu); + iommu->default_dom = dma_ops_domain_alloc(); if (iommu->default_dom == NULL) return -ENOMEM; iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; @@ -2101,15 +2199,12 @@ int __init amd_iommu_init_dma_ops(void) } /* - * If device isolation is enabled, pre-allocate the protection - * domains for each device. + * Pre-allocate the protection domains for each device. */ - if (amd_iommu_isolate) - prealloc_protection_domains(); + prealloc_protection_domains(); iommu_detected = 1; - force_iommu = 1; - bad_dma_address = 0; + swiotlb = 0; #ifdef CONFIG_GART_IOMMU gart_iommu_aperture_disabled = 1; gart_iommu_aperture = 0; @@ -2148,14 +2243,17 @@ free_domains: static void cleanup_domain(struct protection_domain *domain) { + struct iommu_dev_data *dev_data, *next; unsigned long flags; - u16 devid; write_lock_irqsave(&amd_iommu_devtable_lock, flags); - for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) - if (amd_iommu_pd_table[devid] == domain) - __detach_device(domain, devid); + list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) { + struct device *dev = dev_data->dev; + + do_detach(dev); + atomic_set(&dev_data->bind, 0); + } write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } @@ -2165,6 +2263,8 @@ static void protection_domain_free(struct protection_domain *domain) if (!domain) return; + del_domain_from_list(domain); + if (domain->id) domain_id_free(domain->id); @@ -2183,6 +2283,9 @@ static struct protection_domain *protection_domain_alloc(void) domain->id = domain_id_alloc(); if (!domain->id) goto out_err; + INIT_LIST_HEAD(&domain->dev_list); + + add_domain_to_list(domain); return domain; @@ -2239,26 +2342,23 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom) static void amd_iommu_detach_device(struct iommu_domain *dom, struct device *dev) { - struct protection_domain *domain = dom->priv; + struct iommu_dev_data *dev_data = dev->archdata.iommu; struct amd_iommu *iommu; - struct pci_dev *pdev; u16 devid; - if (dev->bus != &pci_bus_type) + if (!check_device(dev)) return; - pdev = to_pci_dev(dev); - - devid = calc_devid(pdev->bus->number, pdev->devfn); + devid = get_device_id(dev); - if (devid > 0) - detach_device(domain, devid); + if (dev_data->domain != NULL) + detach_device(dev); iommu = amd_iommu_rlookup_table[devid]; if (!iommu) return; - iommu_queue_inv_dev_entry(iommu, devid); + iommu_flush_device(dev); iommu_completion_wait(iommu); } @@ -2266,35 +2366,30 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev) { struct protection_domain *domain = dom->priv; - struct protection_domain *old_domain; + struct iommu_dev_data *dev_data; struct amd_iommu *iommu; - struct pci_dev *pdev; + int ret; u16 devid; - if (dev->bus != &pci_bus_type) + if (!check_device(dev)) return -EINVAL; - pdev = to_pci_dev(dev); + dev_data = dev->archdata.iommu; - devid = calc_devid(pdev->bus->number, pdev->devfn); - - if (devid >= amd_iommu_last_bdf || - devid != amd_iommu_alias_table[devid]) - return -EINVAL; + devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; if (!iommu) return -EINVAL; - old_domain = domain_for_device(devid); - if (old_domain) - detach_device(old_domain, devid); + if (dev_data->domain) + detach_device(dev); - attach_device(iommu, domain, devid); + ret = attach_device(dev, domain); iommu_completion_wait(iommu); - return 0; + return ret; } static int amd_iommu_map_range(struct iommu_domain *dom, @@ -2340,7 +2435,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom, iova += PAGE_SIZE; } - iommu_flush_domain(domain->id); + iommu_flush_tlb_pde(domain); } static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, @@ -2391,8 +2486,9 @@ static struct iommu_ops amd_iommu_ops = { int __init amd_iommu_init_passthrough(void) { + struct amd_iommu *iommu; struct pci_dev *dev = NULL; - u16 devid, devid2; + u16 devid; /* allocate passthroug domain */ pt_domain = protection_domain_alloc(); @@ -2402,20 +2498,17 @@ int __init amd_iommu_init_passthrough(void) pt_domain->mode |= PAGE_MODE_NONE; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - struct amd_iommu *iommu; - devid = calc_devid(dev->bus->number, dev->devfn); - if (devid > amd_iommu_last_bdf) + if (!check_device(&dev->dev)) continue; - devid2 = amd_iommu_alias_table[devid]; + devid = get_device_id(&dev->dev); - iommu = amd_iommu_rlookup_table[devid2]; + iommu = amd_iommu_rlookup_table[devid]; if (!iommu) continue; - __attach_device(iommu, pt_domain, devid); - __attach_device(iommu, pt_domain, devid2); + attach_device(&dev->dev, pt_domain); } pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index b4b61d462dc..7ffc3996523 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * @@ -25,10 +25,12 @@ #include <linux/interrupt.h> #include <linux/msi.h> #include <asm/pci-direct.h> +#include <asm/amd_iommu_proto.h> #include <asm/amd_iommu_types.h> #include <asm/amd_iommu.h> #include <asm/iommu.h> #include <asm/gart.h> +#include <asm/x86_init.h> /* * definitions for the ACPI scanning code @@ -123,18 +125,24 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have to handle */ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ -#ifdef CONFIG_IOMMU_STRESS -bool amd_iommu_isolate = false; -#else -bool amd_iommu_isolate = true; /* if true, device isolation is - enabled */ -#endif - bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the system */ +/* Array to assign indices to IOMMUs*/ +struct amd_iommu *amd_iommus[MAX_IOMMUS]; +int amd_iommus_present; + +/* IOMMUs have a non-present cache? */ +bool amd_iommu_np_cache __read_mostly; + +/* + * List of protection domains - used during resume + */ +LIST_HEAD(amd_iommu_pd_list); +spinlock_t amd_iommu_pd_lock; + /* * Pointer to the device table which is shared by all AMD IOMMUs * it is indexed by the PCI device id or the HT unit id and contains @@ -157,12 +165,6 @@ u16 *amd_iommu_alias_table; struct amd_iommu **amd_iommu_rlookup_table; /* - * The pd table (protection domain table) is used to find the protection domain - * data structure a device belongs to. Indexed with the PCI device id too. - */ -struct protection_domain **amd_iommu_pd_table; - -/* * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap * to know which ones are already in use. */ @@ -240,7 +242,7 @@ static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit) writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); } -static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) +static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit) { u32 ctrl; @@ -519,6 +521,26 @@ static void set_dev_entry_bit(u16 devid, u8 bit) amd_iommu_dev_table[devid].data[i] |= (1 << _bit); } +static int get_dev_entry_bit(u16 devid, u8 bit) +{ + int i = (bit >> 5) & 0x07; + int _bit = bit & 0x1f; + + return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit; +} + + +void amd_iommu_apply_erratum_63(u16 devid) +{ + int sysmgt; + + sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) | + (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1); + + if (sysmgt == 0x01) + set_dev_entry_bit(devid, DEV_ENTRY_IW); +} + /* Writes the specific IOMMU for a device into the rlookup table */ static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) { @@ -547,6 +569,8 @@ static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu, if (flags & ACPI_DEVFLAG_LINT1) set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); + amd_iommu_apply_erratum_63(devid); + set_iommu_for_device(iommu, devid); } @@ -816,7 +840,18 @@ static void __init free_iommu_all(void) static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) { spin_lock_init(&iommu->lock); + + /* Add IOMMU to internal data structures */ list_add_tail(&iommu->list, &amd_iommu_list); + iommu->index = amd_iommus_present++; + + if (unlikely(iommu->index >= MAX_IOMMUS)) { + WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n"); + return -ENOSYS; + } + + /* Index is fine - add IOMMU to the array */ + amd_iommus[iommu->index] = iommu; /* * Copy data from ACPI table entry to the iommu struct @@ -846,6 +881,9 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) init_iommu_from_acpi(iommu, h); init_iommu_devices(iommu); + if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE)) + amd_iommu_np_cache = true; + return pci_enable_device(iommu->dev); } @@ -903,7 +941,7 @@ static int __init init_iommu_all(struct acpi_table_header *table) * ****************************************************************************/ -static int __init iommu_setup_msi(struct amd_iommu *iommu) +static int iommu_setup_msi(struct amd_iommu *iommu) { int r; @@ -1154,19 +1192,10 @@ static struct sys_device device_amd_iommu = { * functions. Finally it prints some information about AMD IOMMUs and * the driver state and enables the hardware. */ -int __init amd_iommu_init(void) +static int __init amd_iommu_init(void) { int i, ret = 0; - - if (no_iommu) { - printk(KERN_INFO "AMD-Vi disabled by kernel command line\n"); - return 0; - } - - if (!amd_iommu_detected) - return -ENODEV; - /* * First parse ACPI tables to find the largest Bus/Dev/Func * we need to handle. Upon this information the shared data @@ -1203,15 +1232,6 @@ int __init amd_iommu_init(void) if (amd_iommu_rlookup_table == NULL) goto free; - /* - * Protection Domain table - maps devices to protection domains - * This table has the same size as the rlookup_table - */ - amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - get_order(rlookup_table_size)); - if (amd_iommu_pd_table == NULL) - goto free; - amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( GFP_KERNEL | __GFP_ZERO, get_order(MAX_DOMAIN_ID/8)); @@ -1233,6 +1253,8 @@ int __init amd_iommu_init(void) */ amd_iommu_pd_alloc_bitmap[0] = 1; + spin_lock_init(&amd_iommu_pd_lock); + /* * now the data structures are allocated and basically initialized * start the real acpi table scan @@ -1264,17 +1286,12 @@ int __init amd_iommu_init(void) if (iommu_pass_through) goto out; - printk(KERN_INFO "AMD-Vi: device isolation "); - if (amd_iommu_isolate) - printk("enabled\n"); - else - printk("disabled\n"); - if (amd_iommu_unmap_flush) printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n"); else printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); + x86_platform.iommu_shutdown = disable_iommus; out: return ret; @@ -1282,9 +1299,6 @@ free: free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, get_order(MAX_DOMAIN_ID/8)); - free_pages((unsigned long)amd_iommu_pd_table, - get_order(rlookup_table_size)); - free_pages((unsigned long)amd_iommu_rlookup_table, get_order(rlookup_table_size)); @@ -1301,11 +1315,6 @@ free: goto out; } -void amd_iommu_shutdown(void) -{ - disable_iommus(); -} - /**************************************************************************** * * Early detect code. This code runs at IOMMU detection time in the DMA @@ -1320,16 +1329,13 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table) void __init amd_iommu_detect(void) { - if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) + if (no_iommu || (iommu_detected && !gart_iommu_aperture)) return; if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { iommu_detected = 1; amd_iommu_detected = 1; -#ifdef CONFIG_GART_IOMMU - gart_iommu_aperture_disabled = 1; - gart_iommu_aperture = 0; -#endif + x86_init.iommu.iommu_init = amd_iommu_init; } } @@ -1350,10 +1356,6 @@ static int __init parse_amd_iommu_dump(char *str) static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { - if (strncmp(str, "isolate", 7) == 0) - amd_iommu_isolate = true; - if (strncmp(str, "share", 5) == 0) - amd_iommu_isolate = false; if (strncmp(str, "fullflush", 9) == 0) amd_iommu_unmap_flush = true; } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 128111d8ffe..e0dfb6856aa 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -28,6 +28,7 @@ #include <asm/pci-direct.h> #include <asm/dma.h> #include <asm/k8.h> +#include <asm/x86_init.h> int gart_iommu_aperture; int gart_iommu_aperture_disabled __initdata; @@ -400,6 +401,7 @@ void __init gart_iommu_hole_init(void) iommu_detected = 1; gart_iommu_aperture = 1; + x86_init.iommu.iommu_init = gart_iommu_init; aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; aper_size = (32 * 1024 * 1024) << aper_order; @@ -456,7 +458,7 @@ out: if (aper_alloc) { /* Got the aperture from the AGP bridge */ - } else if (swiotlb && !valid_agp) { + } else if (!valid_agp) { /* Do nothing */ } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || force_iommu || diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index da7b7b9f8bd..565c1bfc507 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -2,7 +2,7 @@ # Makefile for local APIC drivers and for the IO-APIC code # -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o probe_$(BITS).o ipi.o nmi.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 159740decc4..ad8c75b9e45 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -14,7 +14,7 @@ * Mikael Pettersson : PM converted to driver model. */ -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/kernel_stat.h> #include <linux/mc146818rtc.h> #include <linux/acpi_pmtmr.h> @@ -35,7 +35,8 @@ #include <linux/smp.h> #include <linux/mm.h> -#include <asm/perf_counter.h> +#include <asm/perf_event.h> +#include <asm/x86_init.h> #include <asm/pgalloc.h> #include <asm/atomic.h> #include <asm/mpspec.h> @@ -61,7 +62,7 @@ unsigned int boot_cpu_physical_apicid = -1U; /* * The highest APIC ID seen during enumeration. * - * This determines the messaging protocol we can use: if all APIC IDs + * On AMD, this determines the messaging protocol we can use: if all APIC IDs * are in the 0 ... 7 range, then we can use logical addressing which * has some performance advantages (better broadcasting). * @@ -240,28 +241,13 @@ static int modern_apic(void) } /* - * bare function to substitute write operation - * and it's _that_ fast :) - */ -static void native_apic_write_dummy(u32 reg, u32 v) -{ - WARN_ON_ONCE((cpu_has_apic || !disable_apic)); -} - -static u32 native_apic_read_dummy(u32 reg) -{ - WARN_ON_ONCE((cpu_has_apic && !disable_apic)); - return 0; -} - -/* - * right after this call apic->write/read doesn't do anything - * note that there is no restore operation it works one way + * right after this call apic become NOOP driven + * so apic->write/read doesn't do anything */ void apic_disable(void) { - apic->read = native_apic_read_dummy; - apic->write = native_apic_write_dummy; + pr_info("APIC: switched to apic NOOP\n"); + apic = &apic_noop; } void native_apic_wait_icr_idle(void) @@ -458,7 +444,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, v = apic_read(APIC_LVTT); v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); apic_write(APIC_LVTT, v); - apic_write(APIC_TMICT, 0xffffffff); + apic_write(APIC_TMICT, 0); break; case CLOCK_EVT_MODE_RESUME: /* Nothing to do here */ @@ -978,7 +964,7 @@ void lapic_shutdown(void) { unsigned long flags; - if (!cpu_has_apic) + if (!cpu_has_apic && !apic_from_smp_config()) return; local_irq_save(flags); @@ -1188,7 +1174,7 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif - perf_counters_lapic_init(); + perf_events_lapic_init(); preempt_disable(); @@ -1196,8 +1182,7 @@ void __cpuinit setup_local_APIC(void) * Double-check whether this APIC is really registered. * This is meaningless in clustered apic mode, so we skip it. */ - if (!apic->apic_id_registered()) - BUG(); + BUG_ON(!apic->apic_id_registered()); /* * Intel recommends to set DFR, LDR and TPR before enabling @@ -1392,14 +1377,11 @@ void __init enable_IR_x2apic(void) unsigned long flags; struct IO_APIC_route_entry **ioapic_entries = NULL; int ret, x2apic_enabled = 0; - int dmar_table_init_ret = 0; + int dmar_table_init_ret; -#ifdef CONFIG_INTR_REMAP dmar_table_init_ret = dmar_table_init(); - if (dmar_table_init_ret) - pr_debug("dmar_table_init() failed with %d:\n", - dmar_table_init_ret); -#endif + if (dmar_table_init_ret && !x2apic_supported()) + return; ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { @@ -1709,7 +1691,7 @@ int __init APIC_init_uniprocessor(void) localise_nmi_watchdog(); #endif - setup_boot_clock(); + x86_init.timers.setup_percpu_clockev(); #ifdef CONFIG_X86_64 check_nmi_watchdog(); #endif @@ -1916,24 +1898,14 @@ void __cpuinit generic_processor_info(int apicid, int version) max_physical_apicid = apicid; #ifdef CONFIG_X86_32 - /* - * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y - * but we need to work other dependencies like SMP_SUSPEND etc - * before this can be done without some confusion. - * if (CPU_HOTPLUG_ENABLED || num_processors > 8) - * - Ashok Raj <ashok.raj@intel.com> - */ - if (max_physical_apicid >= 8) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (!APIC_XAPIC(version)) { - def_to_bigsmp = 0; - break; - } - /* If P4 and above fall through */ - case X86_VENDOR_AMD: + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (num_processors > 8) + def_to_bigsmp = 1; + break; + case X86_VENDOR_AMD: + if (max_physical_apicid >= 8) def_to_bigsmp = 1; - } } #endif diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c new file mode 100644 index 00000000000..d9acc3bee0f --- /dev/null +++ b/arch/x86/kernel/apic/apic_noop.c @@ -0,0 +1,200 @@ +/* + * NOOP APIC driver. + * + * Does almost nothing and should be substituted by a real apic driver via + * probe routine. + * + * Though in case if apic is disabled (for some reason) we try + * to not uglify the caller's code and allow to call (some) apic routines + * like self-ipi, etc... + */ + +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <asm/fixmap.h> +#include <asm/mpspec.h> +#include <asm/apicdef.h> +#include <asm/apic.h> +#include <asm/setup.h> + +#include <linux/smp.h> +#include <asm/ipi.h> + +#include <linux/interrupt.h> +#include <asm/acpi.h> +#include <asm/e820.h> + +static void noop_init_apic_ldr(void) { } +static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { } +static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { } +static void noop_send_IPI_allbutself(int vector) { } +static void noop_send_IPI_all(int vector) { } +static void noop_send_IPI_self(int vector) { } +static void noop_apic_wait_icr_idle(void) { } +static void noop_apic_icr_write(u32 low, u32 id) { } + +static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip) +{ + return -1; +} + +static u32 noop_safe_apic_wait_icr_idle(void) +{ + return 0; +} + +static u64 noop_apic_icr_read(void) +{ + return 0; +} + +static int noop_cpu_to_logical_apicid(int cpu) +{ + return 0; +} + +static int noop_phys_pkg_id(int cpuid_apic, int index_msb) +{ + return 0; +} + +static unsigned int noop_get_apic_id(unsigned long x) +{ + return 0; +} + +static int noop_probe(void) +{ + /* + * NOOP apic should not ever be + * enabled via probe routine + */ + return 0; +} + +static int noop_apic_id_registered(void) +{ + /* + * if we would be really "pedantic" + * we should pass read_apic_id() here + * but since NOOP suppose APIC ID = 0 + * lets save a few cycles + */ + return physid_isset(0, phys_cpu_present_map); +} + +static const struct cpumask *noop_target_cpus(void) +{ + /* only BSP here */ + return cpumask_of(0); +} + +static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid) +{ + return physid_isset(apicid, *map); +} + +static unsigned long noop_check_apicid_present(int bit) +{ + return physid_isset(bit, phys_cpu_present_map); +} + +static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) +{ + if (cpu != 0) + pr_warning("APIC: Vector allocated for non-BSP cpu\n"); + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); +} + +int noop_apicid_to_node(int logical_apicid) +{ + /* we're always on node 0 */ + return 0; +} + +static u32 noop_apic_read(u32 reg) +{ + WARN_ON_ONCE((cpu_has_apic && !disable_apic)); + return 0; +} + +static void noop_apic_write(u32 reg, u32 v) +{ + WARN_ON_ONCE((cpu_has_apic || !disable_apic)); +} + +struct apic apic_noop = { + .name = "noop", + .probe = noop_probe, + .acpi_madt_oem_check = NULL, + + .apic_id_registered = noop_apic_id_registered, + + .irq_delivery_mode = dest_LowestPrio, + /* logical delivery broadcast to all CPUs: */ + .irq_dest_mode = 1, + + .target_cpus = noop_target_cpus, + .disable_esr = 0, + .dest_logical = APIC_DEST_LOGICAL, + .check_apicid_used = noop_check_apicid_used, + .check_apicid_present = noop_check_apicid_present, + + .vector_allocation_domain = noop_vector_allocation_domain, + .init_apic_ldr = noop_init_apic_ldr, + + .ioapic_phys_id_map = default_ioapic_phys_id_map, + .setup_apic_routing = NULL, + .multi_timer_check = NULL, + .apicid_to_node = noop_apicid_to_node, + + .cpu_to_logical_apicid = noop_cpu_to_logical_apicid, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = physid_set_mask_of_physid, + + .setup_portio_remap = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .enable_apic_mode = NULL, + + .phys_pkg_id = noop_phys_pkg_id, + + .mps_oem_check = NULL, + + .get_apic_id = noop_get_apic_id, + .set_apic_id = NULL, + .apic_id_mask = 0x0F << 24, + + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + + .send_IPI_mask = noop_send_IPI_mask, + .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself, + .send_IPI_allbutself = noop_send_IPI_allbutself, + .send_IPI_all = noop_send_IPI_all, + .send_IPI_self = noop_send_IPI_self, + + .wakeup_secondary_cpu = noop_wakeup_secondary_cpu, + + /* should be safe */ + .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, + .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, + + .wait_for_init_deassert = NULL, + + .smp_callin_clear_local_apic = NULL, + .inquire_remote_apic = NULL, + + .read = noop_apic_read, + .write = noop_apic_write, + .icr_read = noop_apic_icr_read, + .icr_write = noop_apic_icr_write, + .wait_icr_idle = noop_apic_wait_icr_idle, + .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle, +}; diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 676cdac385c..38dcecfa581 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -35,7 +35,7 @@ static const struct cpumask *bigsmp_target_cpus(void) #endif } -static unsigned long bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid) +static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid) { return 0; } @@ -93,11 +93,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu) return BAD_APICID; } -static physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid) -{ - return physid_mask_of_physid(phys_apicid); -} - /* Mapping from cpu number to logical apicid */ static inline int bigsmp_cpu_to_logical_apicid(int cpu) { @@ -106,13 +101,13 @@ static inline int bigsmp_cpu_to_logical_apicid(int cpu) return cpu_physical_id(cpu); } -static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map) +static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ - return physids_promote(0xFFL); + physids_promote(0xFFL, retmap); } -static int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid) +static int bigsmp_check_phys_apicid_present(int phys_apicid) { return 1; } @@ -230,7 +225,7 @@ struct apic apic_bigsmp = { .apicid_to_node = bigsmp_apicid_to_node, .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid, .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, - .apicid_to_cpu_present = bigsmp_apicid_to_cpu_present, + .apicid_to_cpu_present = physid_set_mask_of_physid, .setup_portio_remap = NULL, .check_phys_apicid_present = bigsmp_check_phys_apicid_present, .enable_apic_mode = NULL, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 89174f847b4..e85f8fb7f8e 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -466,11 +466,11 @@ static const struct cpumask *es7000_target_cpus(void) return cpumask_of(smp_processor_id()); } -static unsigned long -es7000_check_apicid_used(physid_mask_t bitmap, int apicid) +static unsigned long es7000_check_apicid_used(physid_mask_t *map, int apicid) { return 0; } + static unsigned long es7000_check_apicid_present(int bit) { return physid_isset(bit, phys_cpu_present_map); @@ -539,14 +539,10 @@ static int es7000_cpu_present_to_apicid(int mps_cpu) static int cpu_id; -static physid_mask_t es7000_apicid_to_cpu_present(int phys_apicid) +static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap) { - physid_mask_t mask; - - mask = physid_mask_of_physid(cpu_id); + physid_set_mask_of_physid(cpu_id, retmap); ++cpu_id; - - return mask; } /* Mapping from cpu number to logical apicid */ @@ -561,10 +557,10 @@ static int es7000_cpu_to_logical_apicid(int cpu) #endif } -static physid_mask_t es7000_ioapic_phys_id_map(physid_mask_t phys_map) +static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ - return physids_promote(0xff); + physids_promote(0xFFL, retmap); } static int es7000_check_phys_apicid_present(int cpu_physical_apicid) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3c8f9e75d03..c0b4468683f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -60,8 +60,6 @@ #include <asm/irq_remapping.h> #include <asm/hpet.h> #include <asm/hw_irq.h> -#include <asm/uv/uv_hub.h> -#include <asm/uv/uv_irq.h> #include <asm/apic.h> @@ -96,6 +94,11 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; +/* Number of legacy interrupts */ +static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; +/* GSI interrupts */ +static int nr_irqs_gsi = NR_IRQS_LEGACY; + #if defined (CONFIG_MCA) || defined (CONFIG_EISA) int mp_bus_id_to_type[MAX_MP_BUSSES]; #endif @@ -135,20 +138,6 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node) return pin; } -/* - * This is performance-critical, we want to do it O(1) - * - * Most irqs are mapped 1:1 with pins. - */ -struct irq_cfg { - struct irq_pin_list *irq_2_pin; - cpumask_var_t domain; - cpumask_var_t old_domain; - unsigned move_cleanup_count; - u8 vector; - u8 move_in_progress : 1; -}; - /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ #ifdef CONFIG_SPARSE_IRQ static struct irq_cfg irq_cfgx[] = { @@ -173,6 +162,12 @@ static struct irq_cfg irq_cfgx[NR_IRQS] = { [15] = { .vector = IRQ15_VECTOR, }, }; +void __init io_apic_disable_legacy(void) +{ + nr_legacy_irqs = 0; + nr_irqs_gsi = 0; +} + int __init arch_early_irq_init(void) { struct irq_cfg *cfg; @@ -190,7 +185,7 @@ int __init arch_early_irq_init(void) desc->chip_data = &cfg[i]; zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); - if (i < NR_IRQS_LEGACY) + if (i < nr_legacy_irqs) cpumask_setall(cfg[i].domain); } @@ -198,7 +193,7 @@ int __init arch_early_irq_init(void) } #ifdef CONFIG_SPARSE_IRQ -static struct irq_cfg *irq_cfg(unsigned int irq) +struct irq_cfg *irq_cfg(unsigned int irq) { struct irq_cfg *cfg = NULL; struct irq_desc *desc; @@ -216,17 +211,14 @@ static struct irq_cfg *get_one_free_irq_cfg(int node) cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); if (cfg) { - if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { + if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { kfree(cfg); cfg = NULL; - } else if (!alloc_cpumask_var_node(&cfg->old_domain, + } else if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_ATOMIC, node)) { free_cpumask_var(cfg->domain); kfree(cfg); cfg = NULL; - } else { - cpumask_clear(cfg->domain); - cpumask_clear(cfg->old_domain); } } @@ -353,7 +345,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) /* end for move_irq_desc */ #else -static struct irq_cfg *irq_cfg(unsigned int irq) +struct irq_cfg *irq_cfg(unsigned int irq) { return irq < nr_irqs ? irq_cfgx + irq : NULL; } @@ -547,23 +539,41 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, add_pin_to_irq_node(cfg, node, newapic, newpin); } +static void __io_apic_modify_irq(struct irq_pin_list *entry, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) +{ + unsigned int reg, pin; + + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin * 2); + reg &= mask_and; + reg |= mask_or; + io_apic_modify(entry->apic, 0x10 + pin * 2, reg); + if (final) + final(entry); +} + static void io_apic_modify_irq(struct irq_cfg *cfg, int mask_and, int mask_or, void (*final)(struct irq_pin_list *entry)) { - int pin; struct irq_pin_list *entry; - for_each_irq_pin(entry, cfg->irq_2_pin) { - unsigned int reg; - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin * 2); - reg &= mask_and; - reg |= mask_or; - io_apic_modify(entry->apic, 0x10 + pin * 2, reg); - if (final) - final(entry); - } + for_each_irq_pin(entry, cfg->irq_2_pin) + __io_apic_modify_irq(entry, mask_and, mask_or, final); +} + +static void __mask_and_edge_IO_APIC_irq(struct irq_pin_list *entry) +{ + __io_apic_modify_irq(entry, ~IO_APIC_REDIR_LEVEL_TRIGGER, + IO_APIC_REDIR_MASKED, NULL); +} + +static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry) +{ + __io_apic_modify_irq(entry, ~IO_APIC_REDIR_MASKED, + IO_APIC_REDIR_LEVEL_TRIGGER, NULL); } static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) @@ -587,18 +597,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg) io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); } -static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER, - IO_APIC_REDIR_MASKED, NULL); -} - -static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, - IO_APIC_REDIR_LEVEL_TRIGGER, NULL); -} - static void mask_IO_APIC_irq_desc(struct irq_desc *desc) { struct irq_cfg *cfg = desc->chip_data; @@ -867,7 +865,7 @@ static int __init find_isa_irq_apic(int irq, int type) */ static int EISA_ELCR(unsigned int irq) { - if (irq < NR_IRQS_LEGACY) { + if (irq < nr_legacy_irqs) { unsigned int port = 0x4d0 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } @@ -1169,7 +1167,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) int cpu, err; cpumask_var_t tmp_mask; - if ((cfg->move_in_progress) || cfg->move_cleanup_count) + if (cfg->move_in_progress) return -EBUSY; if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) @@ -1229,8 +1227,7 @@ next: return err; } -static int -assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) { int err; unsigned long flags; @@ -1464,7 +1461,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq } ioapic_register_intr(irq, desc, trigger); - if (irq < NR_IRQS_LEGACY) + if (irq < nr_legacy_irqs) disable_8259A_irq(irq); ioapic_write_entry(apic_id, pin, entry); @@ -1591,9 +1588,6 @@ __apicdebuginit(void) print_IO_APIC(void) struct irq_desc *desc; unsigned int irq; - if (apic_verbosity == APIC_QUIET) - return; - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for (i = 0; i < nr_ioapics; i++) printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", @@ -1700,9 +1694,6 @@ __apicdebuginit(void) print_APIC_field(int base) { int i; - if (apic_verbosity == APIC_QUIET) - return; - printk(KERN_DEBUG); for (i = 0; i < 8; i++) @@ -1716,9 +1707,6 @@ __apicdebuginit(void) print_local_APIC(void *dummy) unsigned int i, v, ver, maxlvt; u64 icr; - if (apic_verbosity == APIC_QUIET) - return; - printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", smp_processor_id(), hard_smp_processor_id()); v = apic_read(APIC_ID); @@ -1816,13 +1804,19 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk("\n"); } -__apicdebuginit(void) print_all_local_APICs(void) +__apicdebuginit(void) print_local_APICs(int maxcpu) { int cpu; + if (!maxcpu) + return; + preempt_disable(); - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { + if (cpu >= maxcpu) + break; smp_call_function_single(cpu, print_local_APIC, NULL, 1); + } preempt_enable(); } @@ -1831,7 +1825,7 @@ __apicdebuginit(void) print_PIC(void) unsigned int v; unsigned long flags; - if (apic_verbosity == APIC_QUIET) + if (!nr_legacy_irqs) return; printk(KERN_DEBUG "\nprinting PIC contents\n"); @@ -1858,21 +1852,41 @@ __apicdebuginit(void) print_PIC(void) printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); } -__apicdebuginit(int) print_all_ICs(void) +static int __initdata show_lapic = 1; +static __init int setup_show_lapic(char *arg) +{ + int num = -1; + + if (strcmp(arg, "all") == 0) { + show_lapic = CONFIG_NR_CPUS; + } else { + get_option(&arg, &num); + if (num >= 0) + show_lapic = num; + } + + return 1; +} +__setup("show_lapic=", setup_show_lapic); + +__apicdebuginit(int) print_ICs(void) { + if (apic_verbosity == APIC_QUIET) + return 0; + print_PIC(); /* don't print out if apic is not there */ - if (!cpu_has_apic || disable_apic) + if (!cpu_has_apic && !apic_from_smp_config()) return 0; - print_all_local_APICs(); + print_local_APICs(show_lapic); print_IO_APIC(); return 0; } -fs_initcall(print_all_ICs); +fs_initcall(print_ICs); /* Where if anywhere is the i8259 connect in external int mode */ @@ -1894,6 +1908,10 @@ void __init enable_IO_APIC(void) spin_unlock_irqrestore(&ioapic_lock, flags); nr_ioapic_registers[apic] = reg_01.bits.entries+1; } + + if (!nr_legacy_irqs) + return; + for(apic = 0; apic < nr_ioapics; apic++) { int pin; /* See if any of the pins is in ExtINT mode */ @@ -1948,6 +1966,9 @@ void disable_IO_APIC(void) */ clear_IO_APIC(); + if (!nr_legacy_irqs) + return; + /* * If the i8259 is routed through an IOAPIC * Put that IOAPIC in virtual wire mode @@ -1981,7 +2002,7 @@ void disable_IO_APIC(void) /* * Use virtual wire A mode when interrupt remapping is enabled. */ - if (cpu_has_apic) + if (cpu_has_apic || apic_from_smp_config()) disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1); } @@ -1994,7 +2015,7 @@ void disable_IO_APIC(void) * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 */ -static void __init setup_ioapic_ids_from_mpc(void) +void __init setup_ioapic_ids_from_mpc(void) { union IO_APIC_reg_00 reg_00; physid_mask_t phys_id_present_map; @@ -2003,9 +2024,8 @@ static void __init setup_ioapic_ids_from_mpc(void) unsigned char old_id; unsigned long flags; - if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) + if (acpi_ioapic) return; - /* * Don't check I/O APIC IDs for xAPIC systems. They have * no meaning without the serial APIC bus. @@ -2017,7 +2037,7 @@ static void __init setup_ioapic_ids_from_mpc(void) * This is broken; anything with a real cpu count has to * circumvent this idiocy regardless. */ - phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map); + apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map); /* * Set the IOAPIC ID to the value stored in the MPC table. @@ -2044,7 +2064,7 @@ static void __init setup_ioapic_ids_from_mpc(void) * system must have a unique ID or we get lots of nice * 'stuck on smp_invalidate_needed IPI wait' messages. */ - if (apic->check_apicid_used(phys_id_present_map, + if (apic->check_apicid_used(&phys_id_present_map, mp_ioapics[apic_id].apicid)) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", apic_id, mp_ioapics[apic_id].apicid); @@ -2059,7 +2079,7 @@ static void __init setup_ioapic_ids_from_mpc(void) mp_ioapics[apic_id].apicid = i; } else { physid_mask_t tmp; - tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid); + apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp); apic_printk(APIC_VERBOSE, "Setting %d in the " "phys_id_present_map\n", mp_ioapics[apic_id].apicid); @@ -2179,7 +2199,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) struct irq_cfg *cfg; spin_lock_irqsave(&ioapic_lock, flags); - if (irq < NR_IRQS_LEGACY) { + if (irq < nr_legacy_irqs) { disable_8259A_irq(irq); if (i8259A_irq_pending(irq)) was_pending = 1; @@ -2214,20 +2234,16 @@ static int ioapic_retrigger_irq(unsigned int irq) */ #ifdef CONFIG_SMP -static void send_cleanup_vector(struct irq_cfg *cfg) +void send_cleanup_vector(struct irq_cfg *cfg) { cpumask_var_t cleanup_mask; if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { unsigned int i; - cfg->move_cleanup_count = 0; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - cfg->move_cleanup_count++; for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); } else { cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); - cfg->move_cleanup_count = cpumask_weight(cleanup_mask); apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); free_cpumask_var(cleanup_mask); } @@ -2258,15 +2274,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq } } -static int -assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); - /* * Either sets desc->affinity to a valid value, and returns * ->cpu_mask_to_apicid of that, or returns BAD_APICID and * leaves desc->affinity untouched. */ -static unsigned int +unsigned int set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; @@ -2419,8 +2432,6 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) cfg = irq_cfg(irq); spin_lock(&desc->lock); - if (!cfg->move_cleanup_count) - goto unlock; if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) goto unlock; @@ -2438,7 +2449,6 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) goto unlock; } __get_cpu_var(vector_irq)[vector] = -1; - cfg->move_cleanup_count--; unlock: spin_unlock(&desc->lock); } @@ -2446,21 +2456,33 @@ unlock: irq_exit(); } -static void irq_complete_move(struct irq_desc **descp) +static void __irq_complete_move(struct irq_desc **descp, unsigned vector) { struct irq_desc *desc = *descp; struct irq_cfg *cfg = desc->chip_data; - unsigned vector, me; + unsigned me; if (likely(!cfg->move_in_progress)) return; - vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) send_cleanup_vector(cfg); } + +static void irq_complete_move(struct irq_desc **descp) +{ + __irq_complete_move(descp, ~get_irq_regs()->orig_ax); +} + +void irq_force_complete_move(int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; + + __irq_complete_move(&desc, cfg->vector); +} #else static inline void irq_complete_move(struct irq_desc **descp) {} #endif @@ -2476,6 +2498,59 @@ static void ack_apic_edge(unsigned int irq) atomic_t irq_mis_count; +/* + * IO-APIC versions below 0x20 don't support EOI register. + * For the record, here is the information about various versions: + * 0Xh 82489DX + * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant + * 2Xh I/O(x)APIC which is PCI 2.2 Compliant + * 30h-FFh Reserved + * + * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic + * version as 0x2. This is an error with documentation and these ICH chips + * use io-apic's of version 0x20. + * + * For IO-APIC's with EOI register, we use that to do an explicit EOI. + * Otherwise, we simulate the EOI message manually by changing the trigger + * mode to edge and then back to level, with RTE being masked during this. +*/ +static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +{ + struct irq_pin_list *entry; + + for_each_irq_pin(entry, cfg->irq_2_pin) { + if (mp_ioapics[entry->apic].apicver >= 0x20) { + /* + * Intr-remapping uses pin number as the virtual vector + * in the RTE. Actual vector is programmed in + * intr-remapping table entry. Hence for the io-apic + * EOI we use the pin number. + */ + if (irq_remapped(irq)) + io_apic_eoi(entry->apic, entry->pin); + else + io_apic_eoi(entry->apic, cfg->vector); + } else { + __mask_and_edge_IO_APIC_irq(entry); + __unmask_and_level_IO_APIC_irq(entry); + } + } +} + +static void eoi_ioapic_irq(struct irq_desc *desc) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int irq; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + __eoi_ioapic_irq(irq, cfg); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + static void ack_apic_level(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -2511,6 +2586,19 @@ static void ack_apic_level(unsigned int irq) * level-triggered interrupt. We mask the source for the time of the * operation to prevent an edge-triggered interrupt escaping meanwhile. * The idea is from Manfred Spraul. --macro + * + * Also in the case when cpu goes offline, fixup_irqs() will forward + * any unhandled interrupt on the offlined cpu to the new cpu + * destination that is handling the corresponding interrupt. This + * interrupt forwarding is done via IPI's. Hence, in this case also + * level-triggered io-apic interrupt will be seen as an edge + * interrupt in the IRR. And we can't rely on the cpu's EOI + * to be broadcasted to the IO-APIC's which will clear the remoteIRR + * corresponding to the level-triggered interrupt. Hence on IO-APIC's + * supporting EOI register, we do an explicit EOI to clear the + * remote IRR and on IO-APIC's which don't have an EOI register, + * we use the above logic (mask+edge followed by unmask+level) from + * Manfred Spraul to clear the remote IRR. */ cfg = desc->chip_data; i = cfg->vector; @@ -2522,6 +2610,19 @@ static void ack_apic_level(unsigned int irq) */ ack_APIC_irq(); + /* + * Tail end of clearing remote IRR bit (either by delivering the EOI + * message via io-apic EOI register write or simulating it using + * mask+edge followed by unnask+level logic) manually when the + * level triggered interrupt is seen as the edge triggered interrupt + * at the cpu. + */ + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + + eoi_ioapic_irq(desc); + } + /* Now we can move and renable the irq */ if (unlikely(do_unmask_irq)) { /* Only migrate the irq if the ack has been received. @@ -2555,41 +2656,9 @@ static void ack_apic_level(unsigned int irq) move_masked_irq(irq); unmask_IO_APIC_irq_desc(desc); } - - /* Tail end of version 0x11 I/O APIC bug workaround */ - if (!(v & (1 << (i & 0x1f)))) { - atomic_inc(&irq_mis_count); - spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(cfg); - __unmask_and_level_IO_APIC_irq(cfg); - spin_unlock(&ioapic_lock); - } } #ifdef CONFIG_INTR_REMAP -static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) -{ - struct irq_pin_list *entry; - - for_each_irq_pin(entry, cfg->irq_2_pin) - io_apic_eoi(entry->apic, entry->pin); -} - -static void -eoi_ioapic_irq(struct irq_desc *desc) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int irq; - - irq = desc->irq; - cfg = desc->chip_data; - - spin_lock_irqsave(&ioapic_lock, flags); - __eoi_ioapic_irq(irq, cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - static void ir_ack_apic_edge(unsigned int irq) { ack_APIC_irq(); @@ -2657,7 +2726,7 @@ static inline void init_IO_APIC_traps(void) * so default to an old-fashioned 8259 * interrupt if we can.. */ - if (irq < NR_IRQS_LEGACY) + if (irq < nr_legacy_irqs) make_8259A_irq(irq); else /* Strange. Oh, well.. */ @@ -2993,7 +3062,7 @@ out: * the I/O APIC in all cases now. No actual device should request * it anyway. --macro */ -#define PIC_IRQS (1 << PIC_CASCADE_IR) +#define PIC_IRQS (1UL << PIC_CASCADE_IR) void __init setup_IO_APIC(void) { @@ -3001,21 +3070,19 @@ void __init setup_IO_APIC(void) /* * calling enable_IO_APIC() is moved to setup_local_APIC for BP */ - - io_apic_irqs = ~PIC_IRQS; + io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); /* * Set up IO-APIC IRQ routing. */ -#ifdef CONFIG_X86_32 - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); -#endif + x86_init.mpparse.setup_ioapic_ids(); + sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); - check_timer(); + if (nr_legacy_irqs) + check_timer(); } /* @@ -3116,7 +3183,6 @@ static int __init ioapic_init_sysfs(void) device_initcall(ioapic_init_sysfs); -static int nr_irqs_gsi = NR_IRQS_LEGACY; /* * Dynamic irq allocate and deallocation */ @@ -3146,6 +3212,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) continue; desc_new = move_irq_desc(desc_new, node); + cfg_new = desc_new->chip_data; if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; @@ -3697,75 +3764,6 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) } #endif /* CONFIG_HT_IRQ */ -#ifdef CONFIG_X86_UV -/* - * Re-target the irq to the specified CPU and enable the specified MMR located - * on the specified blade to allow the sending of MSIs to the specified CPU. - */ -int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, - unsigned long mmr_offset) -{ - const struct cpumask *eligible_cpu = cpumask_of(cpu); - struct irq_cfg *cfg; - int mmr_pnode; - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - unsigned long flags; - int err; - - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - - cfg = irq_cfg(irq); - - err = assign_irq_vector(irq, cfg, eligible_cpu); - if (err != 0) - return err; - - spin_lock_irqsave(&vector_lock, flags); - set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, - irq_name); - spin_unlock_irqrestore(&vector_lock, flags); - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); - - mmr_pnode = uv_blade_to_pnode(mmr_blade); - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); - - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - return irq; -} - -/* - * Disable the specified MMR located on the specified blade so that MSIs are - * longer allowed to be sent. - */ -void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) -{ - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - int mmr_pnode; - - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - entry->mask = 1; - - mmr_pnode = uv_blade_to_pnode(mmr_blade); - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); -} -#endif /* CONFIG_X86_64 */ - int __init io_apic_get_redir_entries (int ioapic) { union IO_APIC_reg_01 reg_01; @@ -3856,7 +3854,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, /* * IRQs < 16 are already in the irq_2_pin[] map */ - if (irq >= NR_IRQS_LEGACY) { + if (irq >= nr_legacy_irqs) { cfg = desc->chip_data; if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { printk(KERN_INFO "can not add pin %d for irq %d\n", @@ -3933,7 +3931,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) */ if (physids_empty(apic_id_map)) - apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map); + apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map); spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(ioapic, 0); @@ -3949,10 +3947,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) * Every APIC in a system must have a unique ID or we get lots of nice * 'stuck on smp_invalidate_needed IPI wait' messages. */ - if (apic->check_apicid_used(apic_id_map, apic_id)) { + if (apic->check_apicid_used(&apic_id_map, apic_id)) { for (i = 0; i < get_physical_broadcast(); i++) { - if (!apic->check_apicid_used(apic_id_map, i)) + if (!apic->check_apicid_used(&apic_id_map, i)) break; } @@ -3965,7 +3963,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) apic_id = i; } - tmp = apic->apicid_to_cpu_present(apic_id); + apic->apicid_to_cpu_present(apic_id, &tmp); physids_or(apic_id_map, apic_id_map, tmp); if (reg_00.bits.ID != apic_id) { @@ -4095,7 +4093,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics) for (i = 0; i < nr_ioapics; i++) { res[i].name = mem; res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; - sprintf(mem, "IOAPIC %u", i); + snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); mem += IOAPIC_RESOURCE_NAME_SIZE; } @@ -4129,18 +4127,17 @@ void __init ioapic_init_mappings(void) #ifdef CONFIG_X86_32 fake_ioapic_page: #endif - ioapic_phys = (unsigned long) - alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } set_fixmap_nocache(idx, ioapic_phys); - apic_printk(APIC_VERBOSE, - "mapped IOAPIC to %08lx (%08lx)\n", - __fix_to_virt(idx), ioapic_phys); + apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), + ioapic_phys); idx++; ioapic_res->start = ioapic_phys; - ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; ioapic_res++; } } diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index db7220220d0..7ff61d6a188 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu) static inline int mce_in_progress(void) { -#if defined(CONFIG_X86_NEW_MCE) +#if defined(CONFIG_X86_MCE) return atomic_read(&mce_entry) > 0; #endif return 0; @@ -508,14 +508,14 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) /* * proc handler for /proc/sys/kernel/nmi */ -int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, +int proc_nmi_enabled(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { int old_state; nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; old_state = nmi_watchdog_enabled; - proc_dointvec(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, buffer, length, ppos); if (!!old_state == !!nmi_watchdog_enabled) return 0; diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index ca96e68f0d2..07cdbdcd7a9 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -66,7 +66,6 @@ struct mpc_trans { unsigned short trans_reserved; }; -/* x86_quirks member */ static int mpc_record; static struct mpc_trans *translation_table[MAX_MPC_ENTRY]; @@ -130,10 +129,9 @@ void __cpuinit numaq_tsc_disable(void) } } -static int __init numaq_pre_time_init(void) +static void __init numaq_tsc_init(void) { numaq_tsc_disable(); - return 0; } static inline int generate_logical_apicid(int quad, int phys_apicid) @@ -177,6 +175,19 @@ static void mpc_oem_pci_bus(struct mpc_bus *m) quad_local_to_mp_bus_id[quad][local] = m->busid; } +/* + * Called from mpparse code. + * mode = 0: prescan + * mode = 1: one mpc entry scanned + */ +static void numaq_mpc_record(unsigned int mode) +{ + if (!mode) + mpc_record = 0; + else + mpc_record++; +} + static void __init MP_translation_info(struct mpc_trans *m) { printk(KERN_INFO @@ -206,9 +217,9 @@ static int __init mpf_checksum(unsigned char *mp, int len) /* * Read/parse the MPC oem tables */ -static void __init - smp_read_mpc_oem(struct mpc_oemtable *oemtable, unsigned short oemsize) +static void __init smp_read_mpc_oem(struct mpc_table *mpc) { + struct mpc_oemtable *oemtable = (void *)(long)mpc->oemptr; int count = sizeof(*oemtable); /* the header size */ unsigned char *oemptr = ((unsigned char *)oemtable) + count; @@ -250,29 +261,6 @@ static void __init } } -static int __init numaq_setup_ioapic_ids(void) -{ - /* so can skip it */ - return 1; -} - -static struct x86_quirks numaq_x86_quirks __initdata = { - .arch_pre_time_init = numaq_pre_time_init, - .arch_time_init = NULL, - .arch_pre_intr_init = NULL, - .arch_memory_setup = NULL, - .arch_intr_init = NULL, - .arch_trap_init = NULL, - .mach_get_smp_config = NULL, - .mach_find_smp_config = NULL, - .mpc_record = &mpc_record, - .mpc_apic_id = mpc_apic_id, - .mpc_oem_bus_info = mpc_oem_bus_info, - .mpc_oem_pci_bus = mpc_oem_pci_bus, - .smp_read_mpc_oem = smp_read_mpc_oem, - .setup_ioapic_ids = numaq_setup_ioapic_ids, -}; - static __init void early_check_numaq(void) { /* @@ -286,8 +274,15 @@ static __init void early_check_numaq(void) if (smp_found_config) early_get_smp_config(); - if (found_numaq) - x86_quirks = &numaq_x86_quirks; + if (found_numaq) { + x86_init.mpparse.mpc_record = numaq_mpc_record; + x86_init.mpparse.setup_ioapic_ids = x86_init_noop; + x86_init.mpparse.mpc_apic_id = mpc_apic_id; + x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem; + x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; + x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; + x86_init.timers.tsc_pre_init = numaq_tsc_init; + } } int __init get_memcfg_numaq(void) @@ -339,10 +334,9 @@ static inline const struct cpumask *numaq_target_cpus(void) return cpu_all_mask; } -static inline unsigned long -numaq_check_apicid_used(physid_mask_t bitmap, int apicid) +static unsigned long numaq_check_apicid_used(physid_mask_t *map, int apicid) { - return physid_isset(apicid, bitmap); + return physid_isset(apicid, *map); } static inline unsigned long numaq_check_apicid_present(int bit) @@ -376,10 +370,10 @@ static inline int numaq_multi_timer_check(int apic, int irq) return apic != 0 && irq == 0; } -static inline physid_mask_t numaq_ioapic_phys_id_map(physid_mask_t phys_map) +static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* We don't have a good way to do this yet - hack */ - return physids_promote(0xFUL); + return physids_promote(0xFUL, retmap); } static inline int numaq_cpu_to_logical_apicid(int cpu) @@ -407,18 +401,18 @@ static inline int numaq_apicid_to_node(int logical_apicid) return logical_apicid >> 4; } -static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid) +static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap) { int node = numaq_apicid_to_node(logical_apicid); int cpu = __ffs(logical_apicid & 0xf); - return physid_mask_of_physid(cpu + 4*node); + physid_set_mask_of_physid(cpu + 4*node, retmap); } /* Where the IO area was mapped on multiquad, always 0 otherwise */ void *xquad_portio; -static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid) +static inline int numaq_check_phys_apicid_present(int phys_apicid) { return 1; } diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 0c0182cc947..1a6559f6768 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -108,7 +108,7 @@ struct apic apic_default = { .apicid_to_node = default_apicid_to_node, .cpu_to_logical_apicid = default_cpu_to_logical_apicid, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = default_apicid_to_cpu_present, + .apicid_to_cpu_present = physid_set_mask_of_physid, .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, .enable_apic_mode = NULL, diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 65edc180fc8..c4cbd3080c1 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -64,16 +64,23 @@ void __init default_setup_apic_routing(void) apic = &apic_x2apic_phys; else apic = &apic_x2apic_cluster; - printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); } #endif if (apic == &apic_flat) { - if (max_physical_apicid >= 8) - apic = &apic_physflat; - printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (num_processors > 8) + apic = &apic_physflat; + break; + case X86_VENDOR_AMD: + if (max_physical_apicid >= 8) + apic = &apic_physflat; + } } + printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); + if (is_vsmp_box()) { /* need to update phys_pkg_id */ apic->phys_pkg_id = apicid_phys_pkg_id; diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index eafdfbd1ea9..9b419263d90 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -183,7 +183,7 @@ static const struct cpumask *summit_target_cpus(void) return cpumask_of(0); } -static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid) +static unsigned long summit_check_apicid_used(physid_mask_t *map, int apicid) { return 0; } @@ -261,18 +261,18 @@ static int summit_cpu_present_to_apicid(int mps_cpu) return BAD_APICID; } -static physid_mask_t summit_ioapic_phys_id_map(physid_mask_t phys_id_map) +static void summit_ioapic_phys_id_map(physid_mask_t *phys_id_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ - return physids_promote(0x0F); + physids_promote(0x0FL, retmap); } -static physid_mask_t summit_apicid_to_cpu_present(int apicid) +static void summit_apicid_to_cpu_present(int apicid, physid_mask_t *retmap) { - return physid_mask_of_physid(0); + physid_set_mask_of_physid(0, retmap); } -static int summit_check_phys_apicid_present(int boot_cpu_physical_apicid) +static int summit_check_phys_apicid_present(int physical_apicid) { return 1; } diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 601159374e8..130c4b93487 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -352,14 +352,14 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) { alias.v = uv_read_local_mmr(redir_addrs[i].alias); - if (alias.s.base == 0) { + if (alias.s.enable && alias.s.base == 0) { *size = (1UL << alias.s.m_alias); redirect.v = uv_read_local_mmr(redir_addrs[i].redirect); *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT; return; } } - BUG(); + *base = *size = 0; } enum map_type {map_wb, map_uc}; @@ -389,6 +389,16 @@ static __init void map_gru_high(int max_pnode) map_high("GRU", gru.s.base, shift, max_pnode, map_wb); } +static __init void map_mmr_high(int max_pnode) +{ + union uvh_rh_gam_mmr_overlay_config_mmr_u mmr; + int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT; + + mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); + if (mmr.s.enable) + map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); +} + static __init void map_mmioh_high(int max_pnode) { union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; @@ -399,6 +409,12 @@ static __init void map_mmioh_high(int max_pnode) map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); } +static __init void map_low_mmrs(void) +{ + init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); + init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); +} + static __init void uv_rtc_init(void) { long status; @@ -540,6 +556,8 @@ void __init uv_system_init(void) unsigned long mmr_base, present, paddr; unsigned short pnode_mask; + map_low_mmrs(); + m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; @@ -609,12 +627,12 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; uv_cpu_hub_info(cpu)->m_val = m_val; - uv_cpu_hub_info(cpu)->n_val = m_val; + uv_cpu_hub_info(cpu)->n_val = n_val; uv_cpu_hub_info(cpu)->numa_blade_id = blade; uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; uv_cpu_hub_info(cpu)->pnode = pnode; uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; - uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; + uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; @@ -643,6 +661,7 @@ void __init uv_system_init(void) } map_gru_high(max_pnode); + map_mmr_high(max_pnode); map_mmioh_high(max_pnode); uv_cpu_init(); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 151ace69a5a..b5b6b23bce5 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -204,7 +204,6 @@ #include <linux/module.h> #include <linux/poll.h> -#include <linux/smp_lock.h> #include <linux/types.h> #include <linux/stddef.h> #include <linux/timer.h> @@ -403,6 +402,7 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); static struct apm_user *user_list; static DEFINE_SPINLOCK(user_list_lock); +static DEFINE_MUTEX(apm_mutex); /* * Set up a segment that references the real mode segment 0x40 @@ -1531,7 +1531,7 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg) return -EPERM; switch (cmd) { case APM_IOC_STANDBY: - lock_kernel(); + mutex_lock(&apm_mutex); if (as->standbys_read > 0) { as->standbys_read--; as->standbys_pending--; @@ -1540,10 +1540,10 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg) queue_event(APM_USER_STANDBY, as); if (standbys_pending <= 0) standby(); - unlock_kernel(); + mutex_unlock(&apm_mutex); break; case APM_IOC_SUSPEND: - lock_kernel(); + mutex_lock(&apm_mutex); if (as->suspends_read > 0) { as->suspends_read--; as->suspends_pending--; @@ -1552,13 +1552,14 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg) queue_event(APM_USER_SUSPEND, as); if (suspends_pending <= 0) { ret = suspend(1); + mutex_unlock(&apm_mutex); } else { as->suspend_wait = 1; + mutex_unlock(&apm_mutex); wait_event_interruptible(apm_suspend_waitqueue, as->suspend_wait == 0); ret = as->suspend_result; } - unlock_kernel(); return ret; default: return -ENOTTY; @@ -1608,12 +1609,10 @@ static int do_open(struct inode *inode, struct file *filp) { struct apm_user *as; - lock_kernel(); as = kmalloc(sizeof(*as), GFP_KERNEL); if (as == NULL) { printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", sizeof(*as)); - unlock_kernel(); return -ENOMEM; } as->magic = APM_BIOS_MAGIC; @@ -1635,7 +1634,6 @@ static int do_open(struct inode *inode, struct file *filp) user_list = as; spin_unlock(&user_list_lock); filp->private_data = as; - unlock_kernel(); return 0; } diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index c1f253dac15..1d2cb383410 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -5,6 +5,7 @@ # Don't trace early stages of a secondary CPU boot ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_common.o = -pg +CFLAGS_REMOVE_perf_event.o = -pg endif # Make sure load_percpu_segment has no stackprotector @@ -13,7 +14,7 @@ CFLAGS_common.o := $(nostackp) obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o -obj-y += vmware.o hypervisor.o +obj-y += vmware.o hypervisor.o sched.o obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += bugs_64.o @@ -27,7 +28,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o +obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 22a47c82f3c..c910a716a71 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -184,7 +184,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) * approved Athlon */ WARN_ONCE(1, "WARNING: This combination of AMD" - "processors is not suitable for SMP.\n"); + " processors is not suitable for SMP.\n"); if (!test_taint(TAINT_UNSAFE_SMP)) add_taint(TAINT_UNSAFE_SMP); @@ -333,6 +333,16 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) #endif } +int amd_get_nb_id(int cpu) +{ + int id = 0; +#ifdef CONFIG_SMP + id = per_cpu(cpu_llc_id, cpu); +#endif + return id; +} +EXPORT_SYMBOL_GPL(amd_get_nb_id); + static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2055fc2b2e6..9053be5d95c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -13,7 +13,7 @@ #include <linux/io.h> #include <asm/stackprotector.h> -#include <asm/perf_counter.h> +#include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/hypervisor.h> #include <asm/processor.h> @@ -34,7 +34,6 @@ #include <asm/mce.h> #include <asm/msr.h> #include <asm/pat.h> -#include <linux/smp.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/uv/uv.h> @@ -838,10 +837,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } -#ifdef CONFIG_X86_MCE /* Init Machine Check Exception if available. */ - mcheck_init(c); -#endif + mcheck_cpu_init(c); select_idle_routine(c); @@ -870,7 +867,7 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif - init_hw_perf_counters(); + init_hw_perf_events(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 6b2a52dd040..dca325c0399 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -30,8 +30,8 @@ #include <asm/apic.h> #include <asm/desc.h> -static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); -static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); +static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr); +static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr); static DEFINE_PER_CPU(int, cpu_priv_count); static DEFINE_MUTEX(cpu_debug_lock); diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index ae9b503220c..8b581d3905c 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,7 +33,7 @@ #include <linux/cpufreq.h> #include <linux/compiler.h> #include <linux/dmi.h> -#include <trace/power.h> +#include <trace/events/power.h> #include <linux/acpi.h> #include <linux/io.h> @@ -60,7 +60,6 @@ enum { }; #define INTEL_MSR_RANGE (0xffff) -#define CPUID_6_ECX_APERFMPERF_CAPABILITY (0x1) struct acpi_cpufreq_data { struct acpi_processor_performance *acpi_data; @@ -71,13 +70,7 @@ struct acpi_cpufreq_data { static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); -struct acpi_msr_data { - u64 saved_aperf, saved_mperf; -}; - -static DEFINE_PER_CPU(struct acpi_msr_data, msr_data); - -DEFINE_TRACE(power_mark); +static DEFINE_PER_CPU(struct aperfmperf, old_perf); /* acpi_perf_data is a pointer to percpu data. */ static struct acpi_processor_performance *acpi_perf_data; @@ -244,23 +237,12 @@ static u32 get_cur_val(const struct cpumask *mask) return cmd.val; } -struct perf_pair { - union { - struct { - u32 lo; - u32 hi; - } split; - u64 whole; - } aperf, mperf; -}; - /* Called via smp_call_function_single(), on the target CPU */ static void read_measured_perf_ctrs(void *_cur) { - struct perf_pair *cur = _cur; + struct aperfmperf *am = _cur; - rdmsr(MSR_IA32_APERF, cur->aperf.split.lo, cur->aperf.split.hi); - rdmsr(MSR_IA32_MPERF, cur->mperf.split.lo, cur->mperf.split.hi); + get_aperfmperf(am); } /* @@ -279,63 +261,17 @@ static void read_measured_perf_ctrs(void *_cur) static unsigned int get_measured_perf(struct cpufreq_policy *policy, unsigned int cpu) { - struct perf_pair readin, cur; - unsigned int perf_percent; + struct aperfmperf perf; + unsigned long ratio; unsigned int retval; - if (smp_call_function_single(cpu, read_measured_perf_ctrs, &readin, 1)) + if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) return 0; - cur.aperf.whole = readin.aperf.whole - - per_cpu(msr_data, cpu).saved_aperf; - cur.mperf.whole = readin.mperf.whole - - per_cpu(msr_data, cpu).saved_mperf; - per_cpu(msr_data, cpu).saved_aperf = readin.aperf.whole; - per_cpu(msr_data, cpu).saved_mperf = readin.mperf.whole; - -#ifdef __i386__ - /* - * We dont want to do 64 bit divide with 32 bit kernel - * Get an approximate value. Return failure in case we cannot get - * an approximate value. - */ - if (unlikely(cur.aperf.split.hi || cur.mperf.split.hi)) { - int shift_count; - u32 h; + ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf); + per_cpu(old_perf, cpu) = perf; - h = max_t(u32, cur.aperf.split.hi, cur.mperf.split.hi); - shift_count = fls(h); - - cur.aperf.whole >>= shift_count; - cur.mperf.whole >>= shift_count; - } - - if (((unsigned long)(-1) / 100) < cur.aperf.split.lo) { - int shift_count = 7; - cur.aperf.split.lo >>= shift_count; - cur.mperf.split.lo >>= shift_count; - } - - if (cur.aperf.split.lo && cur.mperf.split.lo) - perf_percent = (cur.aperf.split.lo * 100) / cur.mperf.split.lo; - else - perf_percent = 0; - -#else - if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) { - int shift_count = 7; - cur.aperf.whole >>= shift_count; - cur.mperf.whole >>= shift_count; - } - - if (cur.aperf.whole && cur.mperf.whole) - perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole; - else - perf_percent = 0; - -#endif - - retval = (policy->cpuinfo.max_freq * perf_percent) / 100; + retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; return retval; } @@ -394,7 +330,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, unsigned int next_perf_state = 0; /* Index into perf table */ unsigned int i; int result = 0; - struct power_trace it; dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); @@ -426,7 +361,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - trace_power_mark(&it, POWER_PSTATE, next_perf_state); + trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency); switch (data->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: @@ -588,6 +523,27 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = { }, { } }; + +static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) +{ + /* Intel Xeon Processor 7100 Series Specification Update + * http://www.intel.com/Assets/PDF/specupdate/314554.pdf + * AL30: A Machine Check Exception (MCE) Occurring during an + * Enhanced Intel SpeedStep Technology Ratio Change May Cause + * Both Processor Cores to Lock Up. */ + if (c->x86_vendor == X86_VENDOR_INTEL) { + if ((c->x86 == 15) && + (c->x86_model == 6) && + (c->x86_mask == 8)) { + printk(KERN_INFO "acpi-cpufreq: Intel(R) " + "Xeon(R) 7100 Errata AL30, processors may " + "lock up on frequency changes: disabling " + "acpi-cpufreq.\n"); + return -ENODEV; + } + } + return 0; +} #endif static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) @@ -599,9 +555,20 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) unsigned int result = 0; struct cpuinfo_x86 *c = &cpu_data(policy->cpu); struct acpi_processor_performance *perf; +#ifdef CONFIG_SMP + static int blacklisted; +#endif dprintk("acpi_cpufreq_cpu_init\n"); +#ifdef CONFIG_SMP + if (blacklisted) + return blacklisted; + blacklisted = acpi_cpufreq_blacklist(c); + if (blacklisted) + return blacklisted; +#endif + data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); if (!data) return -ENOMEM; @@ -731,12 +698,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) acpi_processor_notify_smm(THIS_MODULE); /* Check for APERF/MPERF support in hardware */ - if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) { - unsigned int ecx; - ecx = cpuid_ecx(6); - if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY) - acpi_cpufreq_driver.getavg = get_measured_perf; - } + if (cpu_has(c, X86_FEATURE_APERFMPERF)) + acpi_cpufreq_driver.getavg = get_measured_perf; dprintk("CPU%u - ACPI performance management activated.\n", cpu); for (i = 0; i < perf->state_count; i++) diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index ce2ed3e4aad..cabd2fa3fc9 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -813,7 +813,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr)); break; case 1 ... 15: - longhaul_version = TYPE_LONGHAUL_V1; + longhaul_version = TYPE_LONGHAUL_V2; if (c->x86_mask < 8) { cpu_model = CPU_SAMUEL2; cpuname = "C3 'Samuel 2' [C5B]"; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 2a50ef89100..3f12dabeab5 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -605,9 +605,10 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, return 0; } -static void invalidate_entry(struct powernow_k8_data *data, unsigned int entry) +static void invalidate_entry(struct cpufreq_frequency_table *powernow_table, + unsigned int entry) { - data->powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; + powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; } static void print_basics(struct powernow_k8_data *data) @@ -854,6 +855,10 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) goto err_out; } + /* fill in data */ + data->numps = data->acpi_data.state_count; + powernow_k8_acpi_pst_values(data, 0); + if (cpu_family == CPU_HW_PSTATE) ret_val = fill_powernow_table_pstate(data, powernow_table); else @@ -866,11 +871,8 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) powernow_table[data->acpi_data.state_count].index = 0; data->powernow_table = powernow_table; - /* fill in data */ - data->numps = data->acpi_data.state_count; if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) print_basics(data); - powernow_k8_acpi_pst_values(data, 0); /* notify BIOS that we exist */ acpi_processor_notify_smm(THIS_MODULE); @@ -914,13 +916,13 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, "bad value %d.\n", i, index); printk(KERN_ERR PFX "Please report to BIOS " "manufacturer\n"); - invalidate_entry(data, i); + invalidate_entry(powernow_table, i); continue; } rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); if (!(hi & HW_PSTATE_VALID_MASK)) { dprintk("invalid pstate %d, ignoring\n", index); - invalidate_entry(data, i); + invalidate_entry(powernow_table, i); continue; } @@ -941,7 +943,6 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table) { int i; - int cntlofreq = 0; for (i = 0; i < data->acpi_data.state_count; i++) { u32 fid; @@ -970,7 +971,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, /* verify frequency is OK */ if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) { dprintk("invalid freq %u kHz, ignoring\n", freq); - invalidate_entry(data, i); + invalidate_entry(powernow_table, i); continue; } @@ -978,38 +979,17 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, * BIOSs are using "off" to indicate invalid */ if (vid == VID_OFF) { dprintk("invalid vid %u, ignoring\n", vid); - invalidate_entry(data, i); + invalidate_entry(powernow_table, i); continue; } - /* verify only 1 entry from the lo frequency table */ - if (fid < HI_FID_TABLE_BOTTOM) { - if (cntlofreq) { - /* if both entries are the same, - * ignore this one ... */ - if ((freq != powernow_table[cntlofreq].frequency) || - (index != powernow_table[cntlofreq].index)) { - printk(KERN_ERR PFX - "Too many lo freq table " - "entries\n"); - return 1; - } - - dprintk("double low frequency table entry, " - "ignoring it.\n"); - invalidate_entry(data, i); - continue; - } else - cntlofreq = i; - } - if (freq != (data->acpi_data.states[i].core_frequency * 1000)) { printk(KERN_INFO PFX "invalid freq entries " "%u kHz vs. %u kHz\n", freq, (unsigned int) (data->acpi_data.states[i].core_frequency * 1000)); - invalidate_entry(data, i); + invalidate_entry(powernow_table, i); continue; } } @@ -1042,7 +1022,7 @@ static int get_transition_latency(struct powernow_k8_data *data) * set it to 1 to avoid problems in the future. * For all others it's a BIOS bug. */ - if (!boot_cpu_data.x86 == 0x11) + if (boot_cpu_data.x86 != 0x11) printk(KERN_ERR FW_WARN PFX "Invalid zero transition " "latency\n"); max_latency = 1; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 6911e91fb4f..3ae5a7a3a50 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -232,28 +232,23 @@ static unsigned int speedstep_detect_chipset(void) return 0; } -struct get_freq_data { - unsigned int speed; - unsigned int processor; -}; - -static void get_freq_data(void *_data) +static void get_freq_data(void *_speed) { - struct get_freq_data *data = _data; + unsigned int *speed = _speed; - data->speed = speedstep_get_frequency(data->processor); + *speed = speedstep_get_frequency(speedstep_processor); } static unsigned int speedstep_get(unsigned int cpu) { - struct get_freq_data data = { .processor = cpu }; + unsigned int speed; /* You're supposed to ensure CPU is online. */ - if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0) + if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0) BUG(); - dprintk("detected %u kHz as current frequency\n", data.speed); - return data.speed; + dprintk("detected %u kHz as current frequency\n", speed); + return speed; } /** diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 93ba8eeb100..08be922de33 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -34,13 +34,6 @@ detect_hypervisor_vendor(struct cpuinfo_x86 *c) c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; } -unsigned long get_hypervisor_tsc_freq(void) -{ - if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) - return vmware_get_tsc_khz(); - return 0; -} - static inline void __cpuinit hypervisor_set_feature_bits(struct cpuinfo_x86 *c) { @@ -55,3 +48,10 @@ void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) detect_hypervisor_vendor(c); hypervisor_set_feature_bits(c); } + +void __init init_hypervisor_platform(void) +{ + init_hypervisor(&boot_cpu_data); + if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) + vmware_platform_setup(); +} diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 80a722a071b..40e1835b35e 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -350,6 +350,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } + if (c->cpuid_level > 6) { + unsigned ecx = cpuid_ecx(6); + if (ecx & 0x01) + set_cpu_cap(c, X86_FEATURE_APERFMPERF); + } + if (cpu_has_xmm2) set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); if (cpu_has_ds) { diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 188a1ca5ad2..4ac6d48fe11 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,11 +1,8 @@ -obj-y = mce.o +obj-y = mce.o mce-severity.o -obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o -obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o -obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c deleted file mode 100644 index b945d5dbc60..00000000000 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Athlon specific Machine Check Exception Reporting - * (C) Copyright 2002 Dave Jones <davej@redhat.com> - */ -#include <linux/interrupt.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/mce.h> -#include <asm/msr.h> - -/* Machine Check Handler For AMD Athlon/Duron: */ -static void k7_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? */ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - for (i = 1; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = '\0'; - addr[0] = '\0'; - - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - - /* Clear it: */ - wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - - -/* AMD K7 machine check is Intel like: */ -void amd_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - if (!cpu_has(c, X86_FEATURE_MCE)) - return; - - machine_check_vector = k7_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? */ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - /* - * Clear status for MC index 0 separately, we don't touch CTL, - * as some K7 Athlons cause spurious MCEs when its enabled: - */ - if (boot_cpu_data.x86 == 6) { - wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); - i = 1; - } else - i = 0; - - for (; i < nr_mce_banks; i++) { - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - } - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); -} diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index a3a235a53f0..472763d9209 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -18,7 +18,12 @@ #include <linux/string.h> #include <linux/fs.h> #include <linux/smp.h> +#include <linux/notifier.h> +#include <linux/kdebug.h> +#include <linux/cpu.h> +#include <linux/sched.h> #include <asm/mce.h> +#include <asm/apic.h> /* Update fake mce registers on current CPU. */ static void inject_mce(struct mce *m) @@ -39,44 +44,142 @@ static void inject_mce(struct mce *m) i->finished = 1; } -struct delayed_mce { - struct timer_list timer; - struct mce m; -}; +static void raise_poll(struct mce *m) +{ + unsigned long flags; + mce_banks_t b; -/* Inject mce on current CPU */ -static void raise_mce(unsigned long data) + memset(&b, 0xff, sizeof(mce_banks_t)); + local_irq_save(flags); + machine_check_poll(0, &b); + local_irq_restore(flags); + m->finished = 0; +} + +static void raise_exception(struct mce *m, struct pt_regs *pregs) { - struct delayed_mce *dm = (struct delayed_mce *)data; - struct mce *m = &dm->m; - int cpu = m->extcpu; + struct pt_regs regs; + unsigned long flags; - inject_mce(m); - if (m->status & MCI_STATUS_UC) { - struct pt_regs regs; + if (!pregs) { memset(®s, 0, sizeof(struct pt_regs)); regs.ip = m->ip; regs.cs = m->cs; + pregs = ®s; + } + /* in mcheck exeception handler, irq will be disabled */ + local_irq_save(flags); + do_machine_check(pregs, 0); + local_irq_restore(flags); + m->finished = 0; +} + +static cpumask_t mce_inject_cpumask; + +static int mce_raise_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct die_args *args = (struct die_args *)data; + int cpu = smp_processor_id(); + struct mce *m = &__get_cpu_var(injectm); + if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) + return NOTIFY_DONE; + cpu_clear(cpu, mce_inject_cpumask); + if (m->inject_flags & MCJ_EXCEPTION) + raise_exception(m, args->regs); + else if (m->status) + raise_poll(m); + return NOTIFY_STOP; +} + +static struct notifier_block mce_raise_nb = { + .notifier_call = mce_raise_notify, + .priority = 1000, +}; + +/* Inject mce on current CPU */ +static int raise_local(void) +{ + struct mce *m = &__get_cpu_var(injectm); + int context = MCJ_CTX(m->inject_flags); + int ret = 0; + int cpu = m->extcpu; + + if (m->inject_flags & MCJ_EXCEPTION) { printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); - do_machine_check(®s, 0); + switch (context) { + case MCJ_CTX_IRQ: + /* + * Could do more to fake interrupts like + * calling irq_enter, but the necessary + * machinery isn't exported currently. + */ + /*FALL THROUGH*/ + case MCJ_CTX_PROCESS: + raise_exception(m, NULL); + break; + default: + printk(KERN_INFO "Invalid MCE context\n"); + ret = -EINVAL; + } printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); - } else { - mce_banks_t b; - memset(&b, 0xff, sizeof(mce_banks_t)); + } else if (m->status) { printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); - machine_check_poll(0, &b); + raise_poll(m); mce_notify_irq(); - printk(KERN_INFO "Finished machine check poll on CPU %d\n", - cpu); - } - kfree(dm); + printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu); + } else + m->finished = 0; + + return ret; +} + +static void raise_mce(struct mce *m) +{ + int context = MCJ_CTX(m->inject_flags); + + inject_mce(m); + + if (context == MCJ_CTX_RANDOM) + return; + +#ifdef CONFIG_X86_LOCAL_APIC + if (m->inject_flags & MCJ_NMI_BROADCAST) { + unsigned long start; + int cpu; + get_online_cpus(); + mce_inject_cpumask = cpu_online_map; + cpu_clear(get_cpu(), mce_inject_cpumask); + for_each_online_cpu(cpu) { + struct mce *mcpu = &per_cpu(injectm, cpu); + if (!mcpu->finished || + MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) + cpu_clear(cpu, mce_inject_cpumask); + } + if (!cpus_empty(mce_inject_cpumask)) + apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR); + start = jiffies; + while (!cpus_empty(mce_inject_cpumask)) { + if (!time_before(jiffies, start + 2*HZ)) { + printk(KERN_ERR + "Timeout waiting for mce inject NMI %lx\n", + *cpus_addr(mce_inject_cpumask)); + break; + } + cpu_relax(); + } + raise_local(); + put_cpu(); + put_online_cpus(); + } else +#endif + raise_local(); } /* Error injection interface */ static ssize_t mce_write(struct file *filp, const char __user *ubuf, size_t usize, loff_t *off) { - struct delayed_mce *dm; struct mce m; if (!capable(CAP_SYS_ADMIN)) @@ -96,19 +199,12 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) return -EINVAL; - dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL); - if (!dm) - return -ENOMEM; - /* * Need to give user space some time to set everything up, * so do it a jiffie or two later everywhere. - * Should we use a hrtimer here for better synchronization? */ - memcpy(&dm->m, &m, sizeof(struct mce)); - setup_timer(&dm->timer, raise_mce, (unsigned long)dm); - dm->timer.expires = jiffies + 2; - add_timer_on(&dm->timer, m.extcpu); + schedule_timeout(2); + raise_mce(&m); return usize; } @@ -116,6 +212,7 @@ static int inject_init(void) { printk(KERN_INFO "Machine check injector initialized\n"); mce_chrdev_ops.write = mce_write; + register_die_notifier(&mce_raise_nb); return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 54dcb8ff12e..32996f9fab6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,3 +1,4 @@ +#include <linux/sysdev.h> #include <asm/mce.h> enum severity_level { @@ -10,6 +11,20 @@ enum severity_level { MCE_PANIC_SEVERITY, }; +#define ATTR_LEN 16 + +/* One object for each MCE bank, shared by all CPUs */ +struct mce_bank { + u64 ctl; /* subevents to enable */ + unsigned char init; /* initialise bank? */ + struct sysdev_attribute attr; /* sysdev attribute */ + char attrname[ATTR_LEN]; /* attribute name */ +}; + int mce_severity(struct mce *a, int tolerant, char **msg); +struct dentry *mce_get_debugfs_dir(void); extern int mce_ser; + +extern struct mce_bank *mce_banks; + diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index ff0807f9705..8a85dd1b1aa 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -139,6 +139,7 @@ int mce_severity(struct mce *a, int tolerant, char **msg) } } +#ifdef CONFIG_DEBUG_FS static void *s_start(struct seq_file *f, loff_t *pos) { if (*pos >= ARRAY_SIZE(severities)) @@ -197,7 +198,7 @@ static int __init severities_debugfs_init(void) { struct dentry *dmce = NULL, *fseverities_coverage = NULL; - dmce = debugfs_create_dir("mce", NULL); + dmce = mce_get_debugfs_dir(); if (dmce == NULL) goto err_out; fseverities_coverage = debugfs_create_file("severities-coverage", @@ -209,10 +210,7 @@ static int __init severities_debugfs_init(void) return 0; err_out: - if (fseverities_coverage) - debugfs_remove(fseverities_coverage); - if (dmce) - debugfs_remove(dmce); return -ENOMEM; } late_initcall(severities_debugfs_init); +#endif diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 01213048f62..0bcaa387586 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -34,6 +34,7 @@ #include <linux/smp.h> #include <linux/fs.h> #include <linux/mm.h> +#include <linux/debugfs.h> #include <asm/processor.h> #include <asm/hw_irq.h> @@ -45,21 +46,11 @@ #include "mce-internal.h" -/* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) -{ - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", - smp_processor_id()); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = - unexpected_machine_check; +#define CREATE_TRACE_POINTS +#include <trace/events/mce.h> int mce_disabled __read_mostly; -#ifdef CONFIG_X86_NEW_MCE - #define MISC_MCELOG_MINOR 227 #define SPINUNIT 100 /* 100ns */ @@ -77,7 +68,6 @@ DEFINE_PER_CPU(unsigned, mce_exception_count); */ static int tolerant __read_mostly = 1; static int banks __read_mostly; -static u64 *bank __read_mostly; static int rip_msr __read_mostly; static int mce_bootlog __read_mostly = -1; static int monarch_timeout __read_mostly = -1; @@ -87,28 +77,43 @@ int mce_cmci_disabled __read_mostly; int mce_ignore_ce __read_mostly; int mce_ser __read_mostly; +struct mce_bank *mce_banks __read_mostly; + /* User mode helper program triggered by machine check event */ static unsigned long mce_need_notify; static char mce_helper[128]; static char *mce_helper_argv[2] = { mce_helper, NULL }; -static unsigned long dont_init_banks; - static DECLARE_WAIT_QUEUE_HEAD(mce_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; +/* + * CPU/chipset specific EDAC code can register a notifier call here to print + * MCE errors in a human-readable form. + */ +ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); +EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); + +static int default_decode_mce(struct notifier_block *nb, unsigned long val, + void *data) +{ + pr_emerg("No human readable MCE decoding support on this CPU type.\n"); + pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); + + return NOTIFY_STOP; +} + +static struct notifier_block mce_dec_nb = { + .notifier_call = default_decode_mce, + .priority = -1, +}; /* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL }; -static inline int skip_bank_init(int i) -{ - return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); -} - static DEFINE_PER_CPU(struct work_struct, mce_work); /* Do initial initialization of a struct mce */ @@ -147,6 +152,9 @@ void mce_log(struct mce *mce) { unsigned next, entry; + /* Emit the trace record: */ + trace_mce_record(mce); + mce->finished = 0; wmb(); for (;;) { @@ -185,47 +193,58 @@ void mce_log(struct mce *mce) static void print_mce(struct mce *m) { - printk(KERN_EMERG - "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", + pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); + if (m->ip) { - printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", - !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", - m->cs, m->ip); + pr_emerg("RIP%s %02x:<%016Lx> ", + !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", + m->cs, m->ip); + if (m->cs == __KERNEL_CS) print_symbol("{%s}", m->ip); - printk(KERN_CONT "\n"); + pr_cont("\n"); } - printk(KERN_EMERG "TSC %llx ", m->tsc); + + pr_emerg("TSC %llx ", m->tsc); if (m->addr) - printk(KERN_CONT "ADDR %llx ", m->addr); + pr_cont("ADDR %llx ", m->addr); if (m->misc) - printk(KERN_CONT "MISC %llx ", m->misc); - printk(KERN_CONT "\n"); - printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", - m->cpuvendor, m->cpuid, m->time, m->socketid, - m->apicid); + pr_cont("MISC %llx ", m->misc); + + pr_cont("\n"); + pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", + m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); + + /* + * Print out human-readable details about the MCE error, + * (if the CPU has an implementation for that) + */ + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); } static void print_mce_head(void) { - printk(KERN_EMERG "\nHARDWARE ERROR\n"); + pr_emerg("\nHARDWARE ERROR\n"); } static void print_mce_tail(void) { - printk(KERN_EMERG "This is not a software problem!\n" - "Run through mcelog --ascii to decode and contact your hardware vendor\n"); + pr_emerg("This is not a software problem!\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ static atomic_t mce_paniced; +static int fake_panic; +static atomic_t mce_fake_paniced; + /* Panic in progress. Enable interrupts and wait for final IPI */ static void wait_for_panic(void) { long timeout = PANIC_TIMEOUT*USEC_PER_SEC; + preempt_disable(); local_irq_enable(); while (timeout-- > 0) @@ -239,15 +258,21 @@ static void mce_panic(char *msg, struct mce *final, char *exp) { int i; - /* - * Make sure only one CPU runs in machine check panic - */ - if (atomic_add_return(1, &mce_paniced) > 1) - wait_for_panic(); - barrier(); + if (!fake_panic) { + /* + * Make sure only one CPU runs in machine check panic + */ + if (atomic_inc_return(&mce_paniced) > 1) + wait_for_panic(); + barrier(); - bust_spinlocks(1); - console_verbose(); + bust_spinlocks(1); + console_verbose(); + } else { + /* Don't log too much for fake panic */ + if (atomic_inc_return(&mce_fake_paniced) > 1) + return; + } print_mce_head(); /* First print corrected ones that are still unlogged */ for (i = 0; i < MCE_LOG_LEN; i++) { @@ -274,9 +299,12 @@ static void mce_panic(char *msg, struct mce *final, char *exp) print_mce_tail(); if (exp) printk(KERN_EMERG "Machine check: %s\n", exp); - if (panic_timeout == 0) - panic_timeout = mce_panic_timeout; - panic(msg); + if (!fake_panic) { + if (panic_timeout == 0) + panic_timeout = mce_panic_timeout; + panic(msg); + } else + printk(KERN_EMERG "Fake kernel panic: %s\n", msg); } /* Support code for software error injection */ @@ -284,13 +312,14 @@ static void mce_panic(char *msg, struct mce *final, char *exp) static int msr_to_offset(u32 msr) { unsigned bank = __get_cpu_var(injectm.bank); + if (msr == rip_msr) return offsetof(struct mce, ip); - if (msr == MSR_IA32_MC0_STATUS + bank*4) + if (msr == MSR_IA32_MCx_STATUS(bank)) return offsetof(struct mce, status); - if (msr == MSR_IA32_MC0_ADDR + bank*4) + if (msr == MSR_IA32_MCx_ADDR(bank)) return offsetof(struct mce, addr); - if (msr == MSR_IA32_MC0_MISC + bank*4) + if (msr == MSR_IA32_MCx_MISC(bank)) return offsetof(struct mce, misc); if (msr == MSR_IA32_MCG_STATUS) return offsetof(struct mce, mcgstatus); @@ -301,13 +330,25 @@ static int msr_to_offset(u32 msr) static u64 mce_rdmsrl(u32 msr) { u64 v; + if (__get_cpu_var(injectm).finished) { int offset = msr_to_offset(msr); + if (offset < 0) return 0; return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); } - rdmsrl(msr, v); + + if (rdmsrl_safe(msr, &v)) { + WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr); + /* + * Return zero in case the access faulted. This should + * not happen normally but can happen if the CPU does + * something weird, or if the code is buggy. + */ + v = 0; + } + return v; } @@ -315,6 +356,7 @@ static void mce_wrmsrl(u32 msr, u64 v) { if (__get_cpu_var(injectm).finished) { int offset = msr_to_offset(msr); + if (offset >= 0) *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; return; @@ -411,7 +453,7 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) m->ip = mce_rdmsrl(rip_msr); } -#ifdef CONFIG_X86_LOCAL_APIC +#ifdef CONFIG_X86_LOCAL_APIC /* * Called after interrupts have been reenabled again * when a MCE happened during an interrupts off region @@ -495,7 +537,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); for (i = 0; i < banks; i++) { - if (!bank[i] || !test_bit(i, *b)) + if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; m.misc = 0; @@ -504,7 +546,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.tsc = 0; barrier(); - m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if (!(m.status & MCI_STATUS_VAL)) continue; @@ -519,9 +561,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) continue; if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); + m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); if (!(flags & MCP_TIMESTAMP)) m.tsc = 0; @@ -537,7 +579,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) /* * Clear state for this bank. */ - mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } /* @@ -558,7 +600,7 @@ static int mce_no_way_out(struct mce *m, char **msg) int i; for (i = 0; i < banks; i++) { - m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) return 1; } @@ -618,7 +660,7 @@ out: * This way we prevent any potential data corruption in a unrecoverable case * and also makes sure always all CPU's errors are examined. * - * Also this detects the case of an machine check event coming from outer + * Also this detects the case of a machine check event coming from outer * space (not detected by any CPUs) In this case some external agent wants * us to shut down, so panic too. * @@ -671,7 +713,7 @@ static void mce_reign(void) * No machine check event found. Must be some external * source or one CPU is hung. Panic. */ - if (!m && tolerant < 3) + if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3) mce_panic("Machine check from unknown source", NULL, NULL); /* @@ -705,7 +747,7 @@ static int mce_start(int *no_way_out) * global_nwo should be updated before mce_callin */ smp_wmb(); - order = atomic_add_return(1, &mce_callin); + order = atomic_inc_return(&mce_callin); /* * Wait for everyone. @@ -842,7 +884,7 @@ static void mce_clear_state(unsigned long *toclear) for (i = 0; i < banks; i++) { if (test_bit(i, toclear)) - mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } } @@ -895,11 +937,11 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_setup(&m); m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - no_way_out = mce_no_way_out(&m, &msg); - final = &__get_cpu_var(mces_seen); *final = m; + no_way_out = mce_no_way_out(&m, &msg); + barrier(); /* @@ -916,14 +958,14 @@ void do_machine_check(struct pt_regs *regs, long error_code) order = mce_start(&no_way_out); for (i = 0; i < banks; i++) { __clear_bit(i, toclear); - if (!bank[i]) + if (!mce_banks[i].ctl) continue; m.misc = 0; m.addr = 0; m.bank = i; - m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if ((m.status & MCI_STATUS_VAL) == 0) continue; @@ -964,9 +1006,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) kill_it = 1; if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); + m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); /* * Action optional error. Queue address for later processing. @@ -1091,10 +1133,10 @@ void mce_log_therm_throt_event(__u64 status) */ static int check_interval = 5 * 60; /* 5 minutes */ -static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ +static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); -static void mcheck_timer(unsigned long data) +static void mce_start_timer(unsigned long data) { struct timer_list *t = &per_cpu(mce_timer, data); int *n; @@ -1110,7 +1152,7 @@ static void mcheck_timer(unsigned long data) * Alert userspace if needed. If we logged an MCE, reduce the * polling interval, otherwise increase the polling interval. */ - n = &__get_cpu_var(next_interval); + n = &__get_cpu_var(mce_next_interval); if (mce_notify_irq()) *n = max(*n/2, HZ/100); else @@ -1159,10 +1201,26 @@ int mce_notify_irq(void) } EXPORT_SYMBOL_GPL(mce_notify_irq); +static int __cpuinit __mcheck_cpu_mce_banks_init(void) +{ + int i; + + mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); + if (!mce_banks) + return -ENOMEM; + for (i = 0; i < banks; i++) { + struct mce_bank *b = &mce_banks[i]; + + b->ctl = -1ULL; + b->init = 1; + } + return 0; +} + /* * Initialize Machine Checks for a CPU. */ -static int mce_cap_init(void) +static int __cpuinit __mcheck_cpu_cap_init(void) { unsigned b; u64 cap; @@ -1170,7 +1228,8 @@ static int mce_cap_init(void) rdmsrl(MSR_IA32_MCG_CAP, cap); b = cap & MCG_BANKCNT_MASK; - printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); + if (!banks) + printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); if (b > MAX_NR_BANKS) { printk(KERN_WARNING @@ -1182,11 +1241,11 @@ static int mce_cap_init(void) /* Don't support asymmetric configurations today */ WARN_ON(banks != 0 && b != banks); banks = b; - if (!bank) { - bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); - if (!bank) - return -ENOMEM; - memset(bank, 0xff, banks * sizeof(u64)); + if (!mce_banks) { + int err = __mcheck_cpu_mce_banks_init(); + + if (err) + return err; } /* Use accurate RIP reporting if available. */ @@ -1199,7 +1258,7 @@ static int mce_cap_init(void) return 0; } -static void mce_init(void) +static void __mcheck_cpu_init_generic(void) { mce_banks_t all_banks; u64 cap; @@ -1218,15 +1277,17 @@ static void mce_init(void) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { - if (skip_bank_init(i)) + struct mce_bank *b = &mce_banks[i]; + + if (!b->init) continue; - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); + wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } } /* Add per CPU specific workarounds here */ -static int mce_cpu_quirks(struct cpuinfo_x86 *c) +static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { if (c->x86_vendor == X86_VENDOR_UNKNOWN) { pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); @@ -1241,7 +1302,7 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c) * trips off incorrectly with the IOMMU & 3ware * & Cerberus: */ - clear_bit(10, (unsigned long *)&bank[4]); + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); } if (c->x86 <= 17 && mce_bootlog < 0) { /* @@ -1255,7 +1316,7 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c) * by default. */ if (c->x86 == 6 && banks > 0) - bank[0] = 0; + mce_banks[0].ctl = 0; } if (c->x86_vendor == X86_VENDOR_INTEL) { @@ -1268,8 +1329,8 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c) * valid event later, merely don't write CTL0. */ - if (c->x86 == 6 && c->x86_model < 0x1A) - __set_bit(0, &dont_init_banks); + if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) + mce_banks[0].init = 0; /* * All newer Intel systems support MCE broadcasting. Enable @@ -1294,7 +1355,7 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c) return 0; } -static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) +static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) { if (c->x86 != 5) return; @@ -1308,7 +1369,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) } } -static void mce_cpu_features(struct cpuinfo_x86 *c) +static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) { switch (c->x86_vendor) { case X86_VENDOR_INTEL: @@ -1322,10 +1383,10 @@ static void mce_cpu_features(struct cpuinfo_x86 *c) } } -static void mce_init_timer(void) +static void __mcheck_cpu_init_timer(void) { struct timer_list *t = &__get_cpu_var(mce_timer); - int *n = &__get_cpu_var(next_interval); + int *n = &__get_cpu_var(mce_next_interval); if (mce_ignore_ce) return; @@ -1333,36 +1394,48 @@ static void mce_init_timer(void) *n = check_interval * HZ; if (!*n) return; - setup_timer(t, mcheck_timer, smp_processor_id()); + setup_timer(t, mce_start_timer, smp_processor_id()); t->expires = round_jiffies(jiffies + *n); add_timer_on(t, smp_processor_id()); } +/* Handle unconfigured int18 (should never happen) */ +static void unexpected_machine_check(struct pt_regs *regs, long error_code) +{ + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *, long error_code) = + unexpected_machine_check; + /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off: */ -void __cpuinit mcheck_init(struct cpuinfo_x86 *c) +void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) { if (mce_disabled) return; - mce_ancient_init(c); + __mcheck_cpu_ancient_init(c); if (!mce_available(c)) return; - if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) { + if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { mce_disabled = 1; return; } machine_check_vector = do_machine_check; - mce_init(); - mce_cpu_features(c); - mce_init_timer(); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_vendor(c); + __mcheck_cpu_init_timer(); INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); + } /* @@ -1551,8 +1624,10 @@ static struct miscdevice mce_log_device = { */ static int __init mcheck_enable(char *str) { - if (*str == 0) + if (*str == 0) { enable_p5_mce(); + return 1; + } if (*str == '=') str++; if (!strcmp(str, "off")) @@ -1580,6 +1655,15 @@ static int __init mcheck_enable(char *str) } __setup("mce", mcheck_enable); +int __init mcheck_init(void) +{ + atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb); + + mcheck_intel_therm_init(); + + return 0; +} + /* * Sysfs support */ @@ -1588,25 +1672,27 @@ __setup("mce", mcheck_enable); * Disable machine checks on suspend and shutdown. We can't really handle * them later. */ -static int mce_disable(void) +static int mce_disable_error_reporting(void) { int i; for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + struct mce_bank *b = &mce_banks[i]; + + if (b->init) + wrmsrl(MSR_IA32_MCx_CTL(i), 0); } return 0; } static int mce_suspend(struct sys_device *dev, pm_message_t state) { - return mce_disable(); + return mce_disable_error_reporting(); } static int mce_shutdown(struct sys_device *dev) { - return mce_disable(); + return mce_disable_error_reporting(); } /* @@ -1616,8 +1702,8 @@ static int mce_shutdown(struct sys_device *dev) */ static int mce_resume(struct sys_device *dev) { - mce_init(); - mce_cpu_features(¤t_cpu_data); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_vendor(¤t_cpu_data); return 0; } @@ -1627,8 +1713,8 @@ static void mce_cpu_restart(void *data) del_timer_sync(&__get_cpu_var(mce_timer)); if (!mce_available(¤t_cpu_data)) return; - mce_init(); - mce_init_timer(); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_timer(); } /* Reinit MCEs after user configuration changes */ @@ -1654,7 +1740,7 @@ static void mce_enable_ce(void *all) cmci_reenable(); cmci_recheck(); if (all) - mce_init_timer(); + __mcheck_cpu_init_timer(); } static struct sysdev_class mce_sysclass = { @@ -1669,14 +1755,15 @@ DEFINE_PER_CPU(struct sys_device, mce_dev); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -static struct sysdev_attribute *bank_attrs; +static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) +{ + return container_of(attr, struct mce_bank, attr); +} static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, char *buf) { - u64 b = bank[attr - bank_attrs]; - - return sprintf(buf, "%llx\n", b); + return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); } static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, @@ -1687,7 +1774,7 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, if (strict_strtoull(buf, 0, &new) < 0) return -EINVAL; - bank[attr - bank_attrs] = new; + attr_to_bank(attr)->ctl = new; mce_restart(); return size; @@ -1829,7 +1916,7 @@ static __cpuinit int mce_create_device(unsigned int cpu) } for (j = 0; j < banks; j++) { err = sysdev_create_file(&per_cpu(mce_dev, cpu), - &bank_attrs[j]); + &mce_banks[j].attr); if (err) goto error2; } @@ -1838,10 +1925,10 @@ static __cpuinit int mce_create_device(unsigned int cpu) return 0; error2: while (--j >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); sysdev_unregister(&per_cpu(mce_dev, cpu)); @@ -1859,29 +1946,32 @@ static __cpuinit void mce_remove_device(unsigned int cpu) sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); for (i = 0; i < banks; i++) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); sysdev_unregister(&per_cpu(mce_dev, cpu)); cpumask_clear_cpu(cpu, mce_dev_initialized); } /* Make sure there are no machine checks on offlined CPUs. */ -static void mce_disable_cpu(void *h) +static void __cpuinit mce_disable_cpu(void *h) { unsigned long action = *(unsigned long *)h; int i; if (!mce_available(¤t_cpu_data)) return; + if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + struct mce_bank *b = &mce_banks[i]; + + if (b->init) + wrmsrl(MSR_IA32_MCx_CTL(i), 0); } } -static void mce_reenable_cpu(void *h) +static void __cpuinit mce_reenable_cpu(void *h) { unsigned long action = *(unsigned long *)h; int i; @@ -1892,8 +1982,10 @@ static void mce_reenable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_reenable(); for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) - wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); + struct mce_bank *b = &mce_banks[i]; + + if (b->init) + wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); } } @@ -1925,7 +2017,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: t->expires = round_jiffies(jiffies + - __get_cpu_var(next_interval)); + __get_cpu_var(mce_next_interval)); add_timer_on(t, cpu); smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); break; @@ -1941,38 +2033,24 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = { .notifier_call = mce_cpu_callback, }; -static __init int mce_init_banks(void) +static __init void mce_init_banks(void) { int i; - bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, - GFP_KERNEL); - if (!bank_attrs) - return -ENOMEM; - for (i = 0; i < banks; i++) { - struct sysdev_attribute *a = &bank_attrs[i]; + struct mce_bank *b = &mce_banks[i]; + struct sysdev_attribute *a = &b->attr; - a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); - if (!a->attr.name) - goto nomem; + a->attr.name = b->attrname; + snprintf(b->attrname, ATTR_LEN, "bank%d", i); a->attr.mode = 0644; a->show = show_bank; a->store = set_bank; } - return 0; - -nomem: - while (--i >= 0) - kfree(bank_attrs[i].attr.name); - kfree(bank_attrs); - bank_attrs = NULL; - - return -ENOMEM; } -static __init int mce_init_device(void) +static __init int mcheck_init_device(void) { int err; int i = 0; @@ -1982,9 +2060,7 @@ static __init int mce_init_device(void) zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); - err = mce_init_banks(); - if (err) - return err; + mce_init_banks(); err = sysdev_class_register(&mce_sysclass); if (err) @@ -2002,59 +2078,67 @@ static __init int mce_init_device(void) return err; } -device_initcall(mce_init_device); - -#else /* CONFIG_X86_OLD_MCE: */ +device_initcall(mcheck_init_device); -int nr_mce_banks; -EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ +/* + * Old style boot options parsing. Only for compatibility. + */ +static int __init mcheck_disable(char *str) +{ + mce_disabled = 1; + return 1; +} +__setup("nomce", mcheck_disable); -/* This has to be run for each processor */ -void mcheck_init(struct cpuinfo_x86 *c) +#ifdef CONFIG_DEBUG_FS +struct dentry *mce_get_debugfs_dir(void) { - if (mce_disabled) - return; + static struct dentry *dmce; - switch (c->x86_vendor) { - case X86_VENDOR_AMD: - amd_mcheck_init(c); - break; + if (!dmce) + dmce = debugfs_create_dir("mce", NULL); - case X86_VENDOR_INTEL: - if (c->x86 == 5) - intel_p5_mcheck_init(c); - if (c->x86 == 6) - intel_p6_mcheck_init(c); - if (c->x86 == 15) - intel_p4_mcheck_init(c); - break; + return dmce; +} - case X86_VENDOR_CENTAUR: - if (c->x86 == 5) - winchip_mcheck_init(c); - break; +static void mce_reset(void) +{ + cpu_missing = 0; + atomic_set(&mce_fake_paniced, 0); + atomic_set(&mce_executing, 0); + atomic_set(&mce_callin, 0); + atomic_set(&global_nwo, 0); +} - default: - break; - } - printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); +static int fake_panic_get(void *data, u64 *val) +{ + *val = fake_panic; + return 0; } -static int __init mcheck_enable(char *str) +static int fake_panic_set(void *data, u64 val) { - mce_p5_enabled = 1; - return 1; + mce_reset(); + fake_panic = val; + return 0; } -__setup("mce", mcheck_enable); -#endif /* CONFIG_X86_OLD_MCE */ +DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, + fake_panic_set, "%llu\n"); -/* - * Old style boot options parsing. Only for compatibility. - */ -static int __init mcheck_disable(char *str) +static int __init mcheck_debugfs_init(void) { - mce_disabled = 1; - return 1; + struct dentry *dmce, *ffake_panic; + + dmce = mce_get_debugfs_dir(); + if (!dmce) + return -ENOMEM; + ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL, + &fake_panic_fops); + if (!ffake_panic) + return -ENOMEM; + + return 0; } -__setup("nomce", mcheck_disable); +late_initcall(mcheck_debugfs_init); +#endif diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 1fecba404fd..83a3d1f4efc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -69,7 +69,7 @@ struct threshold_bank { struct threshold_block *blocks; cpumask_var_t cpus; }; -static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); +static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); #ifdef CONFIG_SMP static unsigned char shared_bank[NR_BANKS] = { @@ -489,8 +489,9 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) int i, err = 0; struct threshold_bank *b = NULL; char name[32]; +#ifdef CONFIG_SMP struct cpuinfo_x86 *c = &cpu_data(cpu); - +#endif sprintf(name, "threshold_bank%i", bank); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index e1acec0f7a3..7c785634af2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -8,6 +8,7 @@ #include <linux/init.h> #include <linux/interrupt.h> #include <linux/percpu.h> +#include <linux/sched.h> #include <asm/apic.h> #include <asm/processor.h> #include <asm/msr.h> @@ -90,7 +91,7 @@ static void cmci_discover(int banks, int boot) if (test_bit(i, owned)) continue; - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Already owned by someone else? */ if (val & CMCI_EN) { @@ -101,8 +102,8 @@ static void cmci_discover(int banks, int boot) } val |= CMCI_EN | CMCI_THRESHOLD; - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + wrmsrl(MSR_IA32_MCx_CTL2(i), val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Did the enable bit stick? -- the bank supports CMCI */ if (val & CMCI_EN) { @@ -152,9 +153,9 @@ void cmci_clear(void) if (!test_bit(i, __get_cpu_var(mce_banks_owned))) continue; /* Disable CMCI */ - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + wrmsrl(MSR_IA32_MCx_CTL2(i), val); __clear_bit(i, __get_cpu_var(mce_banks_owned)); } spin_unlock_irqrestore(&cmci_discover_lock, flags); diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c deleted file mode 100644 index f5f2d6f71fb..00000000000 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Non Fatal Machine Check Exception Reporting - * - * (C) Copyright 2002 Dave Jones. <davej@redhat.com> - * - * This file contains routines to check for non-fatal MCEs every 15s - * - */ -#include <linux/interrupt.h> -#include <linux/workqueue.h> -#include <linux/jiffies.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/mce.h> -#include <asm/msr.h> - -static int firstbank; - -#define MCE_RATE (15*HZ) /* timer rate is 15s */ - -static void mce_checkregs(void *info) -{ - u32 low, high; - int i; - - for (i = firstbank; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - - if (!(high & (1<<31))) - continue; - - printk(KERN_INFO "MCE: The hardware reports a non fatal, " - "correctable incident occurred on CPU %d.\n", - smp_processor_id()); - - printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); - - /* - * Scrub the error so we don't pick it up in MCE_RATE - * seconds time: - */ - wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } -} - -static void mce_work_fn(struct work_struct *work); -static DECLARE_DELAYED_WORK(mce_work, mce_work_fn); - -static void mce_work_fn(struct work_struct *work) -{ - on_each_cpu(mce_checkregs, NULL, 1); - schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); -} - -static int __init init_nonfatal_mce_checker(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - /* Check for MCE support */ - if (!cpu_has(c, X86_FEATURE_MCE)) - return -ENODEV; - - /* Check for PPro style MCA */ - if (!cpu_has(c, X86_FEATURE_MCA)) - return -ENODEV; - - /* Some Athlons misbehave when we frob bank 0 */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 6) - firstbank = 1; - else - firstbank = 0; - - /* - * Check for non-fatal errors every MCE_RATE s - */ - schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); - printk(KERN_INFO "Machine check exception polling timer started.\n"); - - return 0; -} -module_init(init_nonfatal_mce_checker); - -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c deleted file mode 100644 index 4482aea9aa2..00000000000 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ /dev/null @@ -1,163 +0,0 @@ -/* - * P4 specific Machine Check Exception Reporting - */ -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/processor.h> -#include <asm/mce.h> -#include <asm/msr.h> - -/* as supported by the P4/Xeon family */ -struct intel_mce_extended_msrs { - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 esi; - u32 edi; - u32 ebp; - u32 esp; - u32 eflags; - u32 eip; - /* u32 *reserved[]; */ -}; - -static int mce_num_extended_msrs; - -/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) -{ - u32 h; - - rdmsr(MSR_IA32_MCG_EAX, r->eax, h); - rdmsr(MSR_IA32_MCG_EBX, r->ebx, h); - rdmsr(MSR_IA32_MCG_ECX, r->ecx, h); - rdmsr(MSR_IA32_MCG_EDX, r->edx, h); - rdmsr(MSR_IA32_MCG_ESI, r->esi, h); - rdmsr(MSR_IA32_MCG_EDI, r->edi, h); - rdmsr(MSR_IA32_MCG_EBP, r->ebp, h); - rdmsr(MSR_IA32_MCG_ESP, r->esp, h); - rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h); - rdmsr(MSR_IA32_MCG_EIP, r->eip, h); -} - -static void intel_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? */ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - if (mce_num_extended_msrs > 0) { - struct intel_mce_extended_msrs dbg; - - intel_get_extended_msrs(&dbg); - - printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" - "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" - "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", - smp_processor_id(), dbg.eip, dbg.eflags, - dbg.eax, dbg.ebx, dbg.ecx, dbg.edx, - dbg.esi, dbg.edi, dbg.ebp, dbg.esp); - } - - for (i = 0; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = addr[0] = '\0'; - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - - /* - * Do not clear the MSR_IA32_MCi_STATUS if the error is not - * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error. - */ - for (i = 0; i < nr_mce_banks; i++) { - u32 msr; - msr = MSR_IA32_MC0_STATUS+i*4; - rdmsr(msr, low, high); - if (high&(1<<31)) { - /* Clear it */ - wrmsr(msr, 0UL, 0UL); - /* Serialize */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - -void intel_p4_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - machine_check_vector = intel_machine_check; - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? */ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - for (i = 0; i < nr_mce_banks; i++) { - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - } - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); - - /* Check for P4/Xeon extended MCE MSRs */ - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<9)) {/* MCG_EXT_P */ - mce_num_extended_msrs = (l >> 16) & 0xff; - printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)" - " available\n", - smp_processor_id(), mce_num_extended_msrs); - -#ifdef CONFIG_X86_MCE_P4THERMAL - /* Check for P4/Xeon Thermal monitor */ - intel_init_thermal(c); -#endif - } -} diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c deleted file mode 100644 index 01e4f817818..00000000000 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ /dev/null @@ -1,127 +0,0 @@ -/* - * P6 specific Machine Check Exception Reporting - * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> - */ -#include <linux/interrupt.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/mce.h> -#include <asm/msr.h> - -/* Machine Check Handler For PII/PIII */ -static void intel_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? */ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - for (i = 0; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = '\0'; - addr[0] = '\0'; - - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - /* - * Do not clear the MSR_IA32_MCi_STATUS if the error is not - * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error: - */ - for (i = 0; i < nr_mce_banks; i++) { - unsigned int msr; - - msr = MSR_IA32_MC0_STATUS+i*4; - rdmsr(msr, low, high); - if (high & (1<<31)) { - /* Clear it: */ - wrmsr(msr, 0UL, 0UL); - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - -/* Set up machine check reporting for processors with Intel style MCE: */ -void intel_p6_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - /* Check for MCE support */ - if (!cpu_has(c, X86_FEATURE_MCE)) - return; - - /* Check for PPro style MCA */ - if (!cpu_has(c, X86_FEATURE_MCA)) - return; - - /* Ok machine check is available */ - machine_check_vector = intel_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? */ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - /* - * Following the example in IA-32 SDM Vol 3: - * - MC0_CTL should not be written - * - Status registers on all banks should be cleared on reset - */ - for (i = 1; i < nr_mce_banks; i++) - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - - for (i = 0; i < nr_mce_banks; i++) - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); -} diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 5957a93e517..4fef985fc22 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -34,20 +34,33 @@ /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) -static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; -static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); -static DEFINE_PER_CPU(bool, thermal_throttle_active); +/* + * Current thermal throttling state: + */ +struct thermal_state { + bool is_throttled; + + u64 next_check; + unsigned long throttle_count; + unsigned long last_throttle_count; +}; + +static DEFINE_PER_CPU(struct thermal_state, thermal_state); + +static atomic_t therm_throt_en = ATOMIC_INIT(0); -static atomic_t therm_throt_en = ATOMIC_INIT(0); +static u32 lvtthmr_init __read_mostly; #ifdef CONFIG_SYSFS #define define_therm_throt_sysdev_one_ro(_name) \ static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) #define define_therm_throt_sysdev_show_func(name) \ -static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ - struct sysdev_attribute *attr, \ - char *buf) \ + \ +static ssize_t therm_throt_sysdev_show_##name( \ + struct sys_device *dev, \ + struct sysdev_attribute *attr, \ + char *buf) \ { \ unsigned int cpu = dev->id; \ ssize_t ret; \ @@ -55,7 +68,7 @@ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ preempt_disable(); /* CPU hotplug */ \ if (cpu_online(cpu)) \ ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_throttle_##name, cpu)); \ + per_cpu(thermal_state, cpu).name); \ else \ ret = 0; \ preempt_enable(); \ @@ -63,11 +76,11 @@ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ return ret; \ } -define_therm_throt_sysdev_show_func(count); -define_therm_throt_sysdev_one_ro(count); +define_therm_throt_sysdev_show_func(throttle_count); +define_therm_throt_sysdev_one_ro(throttle_count); static struct attribute *thermal_throttle_attrs[] = { - &attr_count.attr, + &attr_throttle_count.attr, NULL }; @@ -93,33 +106,39 @@ static struct attribute_group thermal_throttle_attr_group = { * 1 : Event should be logged further, and a message has been * printed to the syslog. */ -static int therm_throt_process(int curr) +static int therm_throt_process(bool is_throttled) { - unsigned int cpu = smp_processor_id(); - __u64 tmp_jiffs = get_jiffies_64(); - bool was_throttled = __get_cpu_var(thermal_throttle_active); - bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr; + struct thermal_state *state; + unsigned int this_cpu; + bool was_throttled; + u64 now; + + this_cpu = smp_processor_id(); + now = get_jiffies_64(); + state = &per_cpu(thermal_state, this_cpu); + + was_throttled = state->is_throttled; + state->is_throttled = is_throttled; if (is_throttled) - __get_cpu_var(thermal_throttle_count)++; + state->throttle_count++; - if (!(was_throttled ^ is_throttled) && - time_before64(tmp_jiffs, __get_cpu_var(next_check))) + if (time_before64(now, state->next_check) && + state->throttle_count != state->last_throttle_count) return 0; - __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; + state->next_check = now + CHECK_INTERVAL; + state->last_throttle_count = state->throttle_count; /* if we just entered the thermal event */ if (is_throttled) { - printk(KERN_CRIT "CPU%d: Temperature above threshold, " - "cpu clock throttled (total events = %lu)\n", - cpu, __get_cpu_var(thermal_throttle_count)); + printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); add_taint(TAINT_MACHINE_CHECK); return 1; } if (was_throttled) { - printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); + printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); return 1; } @@ -213,7 +232,7 @@ static void intel_thermal_interrupt(void) __u64 msr_val; rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) + if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) mce_log_therm_throt_event(msr_val); } @@ -237,6 +256,18 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) ack_APIC_irq(); } +void __init mcheck_intel_therm_init(void) +{ + /* + * This function is only called on boot CPU. Save the init thermal + * LVT value on BSP and use that value to restore APs' thermal LVT + * entry BIOS programmed later + */ + if (cpu_has(&boot_cpu_data, X86_FEATURE_ACPI) && + cpu_has(&boot_cpu_data, X86_FEATURE_ACC)) + lvtthmr_init = apic_read(APIC_LVTTHMR); +} + void intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); @@ -253,16 +284,26 @@ void intel_init_thermal(struct cpuinfo_x86 *c) * since it might be delivered via SMI already: */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); - h = apic_read(APIC_LVTTHMR); + + /* + * The initial value of thermal LVT entries on all APs always reads + * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI + * sequence to them and LVT registers are reset to 0s except for + * the mask bits which are set to 1s when APs receive INIT IPI. + * Always restore the value that BIOS has programmed on AP based on + * BSP's info we saved since BIOS is always setting the same value + * for all threads/cores + */ + apic_write(APIC_LVTTHMR, lvtthmr_init); + + h = lvtthmr_init; + if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu); return; } - if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) - tm2 = 1; - /* Check whether a vector already exists */ if (h & APIC_VECTOR_MASK) { printk(KERN_DEBUG @@ -271,6 +312,16 @@ void intel_init_thermal(struct cpuinfo_x86 *c) return; } + /* early Pentium M models use different method for enabling TM2 */ + if (cpu_has(c, X86_FEATURE_TM2)) { + if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) { + rdmsr(MSR_THERM2_CTL, l, h); + if (l & MSR_THERM2_CTL_TM_SELECT) + tm2 = 1; + } else if (l & MSR_IA32_MISC_ENABLE_TM2) + tm2 = 1; + } + /* We'll mask the thermal vector in the lapic till we're ready: */ h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; apic_write(APIC_LVTTHMR, h); diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 315738c74aa..73c86db5acb 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -846,7 +846,7 @@ int __init mtrr_cleanup(unsigned address_bits) sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); range_sums = sum_ranges(range, nr_range); - printk(KERN_INFO "total RAM coverred: %ldM\n", + printk(KERN_INFO "total RAM covered: %ldM\n", range_sums >> (20 - PAGE_SHIFT)); if (mtrr_chunk_size && mtrr_gran_size) { diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index 08b6ea4c62b..3c1b12d461d 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -96,17 +96,24 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) unsigned long long base, size; char *ptr; char line[LINE_SIZE]; + int length; size_t linelen; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!len) - return -EINVAL; memset(line, 0, LINE_SIZE); - if (len > LINE_SIZE) - len = LINE_SIZE; - if (copy_from_user(line, buf, len - 1)) + + length = len; + length--; + + if (length > LINE_SIZE - 1) + length = LINE_SIZE - 1; + + if (length < 0) + return -EINVAL; + + if (copy_from_user(line, buf, length)) return -EFAULT; linelen = strlen(line); @@ -126,8 +133,8 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) return -EINVAL; base = simple_strtoull(line + 5, &ptr, 0); - for (; isspace(*ptr); ++ptr) - ; + while (isspace(*ptr)) + ptr++; if (strncmp(ptr, "size=", 5)) return -EINVAL; @@ -135,14 +142,14 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) size = simple_strtoull(ptr + 5, &ptr, 0); if ((base & 0xfff) || (size & 0xfff)) return -EINVAL; - for (; isspace(*ptr); ++ptr) - ; + while (isspace(*ptr)) + ptr++; if (strncmp(ptr, "type=", 5)) return -EINVAL; ptr += 5; - for (; isspace(*ptr); ++ptr) - ; + while (isspace(*ptr)) + ptr++; for (i = 0; i < MTRR_NUM_TYPES; ++i) { if (strcmp(ptr, mtrr_strings[i])) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 7af0f88a416..84e83de5457 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -58,6 +58,7 @@ unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; +static bool mtrr_aps_delayed_init; static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; @@ -163,7 +164,10 @@ static void ipi_handler(void *info) if (data->smp_reg != ~0U) { mtrr_if->set(data->smp_reg, data->smp_base, data->smp_size, data->smp_type); - } else { + } else if (mtrr_aps_delayed_init) { + /* + * Initialize the MTRRs inaddition to the synchronisation. + */ mtrr_if->set_all(); } @@ -265,6 +269,8 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ */ if (reg != ~0U) mtrr_if->set(reg, base, size, type); + else if (!mtrr_aps_delayed_init) + mtrr_if->set_all(); /* Wait for the others */ while (atomic_read(&data.count)) @@ -721,9 +727,7 @@ void __init mtrr_bp_init(void) void mtrr_ap_init(void) { - unsigned long flags; - - if (!mtrr_if || !use_intel()) + if (!use_intel() || mtrr_aps_delayed_init) return; /* * Ideally we should hold mtrr_mutex here to avoid mtrr entries @@ -738,11 +742,7 @@ void mtrr_ap_init(void) * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug * lock to prevent mtrr entry changes */ - local_irq_save(flags); - - mtrr_if->set_all(); - - local_irq_restore(flags); + set_mtrr(~0U, 0, 0, 0); } /** @@ -753,6 +753,34 @@ void mtrr_save_state(void) smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); } +void set_mtrr_aps_delayed_init(void) +{ + if (!use_intel()) + return; + + mtrr_aps_delayed_init = true; +} + +/* + * MTRR initialization for all AP's + */ +void mtrr_aps_init(void) +{ + if (!use_intel()) + return; + + set_mtrr(~0U, 0, 0, 0); + mtrr_aps_delayed_init = false; +} + +void mtrr_bp_restore(void) +{ + if (!use_intel()) + return; + + mtrr_if->set_all(); +} + static int __init mtrr_init_finialize(void) { if (!mtrr_if) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_event.c index f9cd0849bd4..c1bbed1021d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1,5 +1,5 @@ /* - * Performance counter x86 architecture code + * Performance events x86 architecture code * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar @@ -11,7 +11,7 @@ * For licencing details see kernel-base/COPYING */ -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/capability.h> #include <linux/notifier.h> #include <linux/hardirq.h> @@ -27,19 +27,19 @@ #include <asm/stacktrace.h> #include <asm/nmi.h> -static u64 perf_counter_mask __read_mostly; +static u64 perf_event_mask __read_mostly; -/* The maximal number of PEBS counters: */ -#define MAX_PEBS_COUNTERS 4 +/* The maximal number of PEBS events: */ +#define MAX_PEBS_EVENTS 4 /* The size of a BTS record in bytes: */ #define BTS_RECORD_SIZE 24 /* The size of a per-cpu BTS buffer in bytes: */ -#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 1024) +#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048) /* The BTS overflow threshold in bytes from the end of the buffer: */ -#define BTS_OVFL_TH (BTS_RECORD_SIZE * 64) +#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128) /* @@ -65,11 +65,11 @@ struct debug_store { u64 pebs_index; u64 pebs_absolute_maximum; u64 pebs_interrupt_threshold; - u64 pebs_counter_reset[MAX_PEBS_COUNTERS]; + u64 pebs_event_reset[MAX_PEBS_EVENTS]; }; -struct cpu_hw_counters { - struct perf_counter *counters[X86_PMC_IDX_MAX]; +struct cpu_hw_events { + struct perf_event *events[X86_PMC_IDX_MAX]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; @@ -77,6 +77,18 @@ struct cpu_hw_counters { struct debug_store *ds; }; +struct event_constraint { + unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + int code; +}; + +#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) } +#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 } + +#define for_each_event_constraint(e, c) \ + for ((e) = (c); (e)->idxmsk[0]; (e)++) + + /* * struct x86_pmu - generic x86 pmu */ @@ -86,30 +98,34 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *); void (*disable_all)(void); void (*enable_all)(void); - void (*enable)(struct hw_perf_counter *, int); - void (*disable)(struct hw_perf_counter *, int); + void (*enable)(struct hw_perf_event *, int); + void (*disable)(struct hw_perf_event *, int); unsigned eventsel; unsigned perfctr; u64 (*event_map)(int); u64 (*raw_event)(u64); int max_events; - int num_counters; - int num_counters_fixed; - int counter_bits; - u64 counter_mask; + int num_events; + int num_events_fixed; + int event_bits; + u64 event_mask; int apic; u64 max_period; u64 intel_ctrl; void (*enable_bts)(u64 config); void (*disable_bts)(void); + int (*get_event_idx)(struct cpu_hw_events *cpuc, + struct hw_perf_event *hwc); }; static struct x86_pmu x86_pmu __read_mostly; -static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { +static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; +static const struct event_constraint *event_constraints; + /* * Not sure about some of these */ @@ -124,37 +140,47 @@ static const u64 p6_perfmon_event_map[] = [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, }; -static u64 p6_pmu_event_map(int event) +static u64 p6_pmu_event_map(int hw_event) { - return p6_perfmon_event_map[event]; + return p6_perfmon_event_map[hw_event]; } /* - * Counter setting that is specified not to count anything. + * Event setting that is specified not to count anything. * We use this to effectively disable a counter. * * L2_RQSTS with 0 MESI unit mask. */ -#define P6_NOP_COUNTER 0x0000002EULL +#define P6_NOP_EVENT 0x0000002EULL -static u64 p6_pmu_raw_event(u64 event) +static u64 p6_pmu_raw_event(u64 hw_event) { #define P6_EVNTSEL_EVENT_MASK 0x000000FFULL #define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL #define P6_EVNTSEL_EDGE_MASK 0x00040000ULL #define P6_EVNTSEL_INV_MASK 0x00800000ULL -#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL +#define P6_EVNTSEL_REG_MASK 0xFF000000ULL #define P6_EVNTSEL_MASK \ (P6_EVNTSEL_EVENT_MASK | \ P6_EVNTSEL_UNIT_MASK | \ P6_EVNTSEL_EDGE_MASK | \ P6_EVNTSEL_INV_MASK | \ - P6_EVNTSEL_COUNTER_MASK) + P6_EVNTSEL_REG_MASK) - return event & P6_EVNTSEL_MASK; + return hw_event & P6_EVNTSEL_MASK; } +static const struct event_constraint intel_p6_event_constraints[] = +{ + EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ + EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ + EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT_END +}; /* * Intel PerfMon v3. Used on Core2 and later. @@ -170,16 +196,45 @@ static const u64 intel_perfmon_event_map[] = [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, }; -static u64 intel_pmu_event_map(int event) +static const struct event_constraint intel_core_event_constraints[] = +{ + EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ + EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ + EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ + EVENT_CONSTRAINT_END +}; + +static const struct event_constraint intel_nehalem_event_constraints[] = +{ + EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ + EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ + EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ + EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ + EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ + EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */ + EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */ + EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */ + EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */ + EVENT_CONSTRAINT_END +}; + +static u64 intel_pmu_event_map(int hw_event) { - return intel_perfmon_event_map[event]; + return intel_perfmon_event_map[hw_event]; } /* - * Generalized hw caching related event table, filled + * Generalized hw caching related hw_event table, filled * in on a per model basis. A value of 0 means - * 'not supported', -1 means 'event makes no sense on - * this CPU', any other value means the raw event + * 'not supported', -1 means 'hw_event makes no sense on + * this CPU', any other value means the raw hw_event * ID. */ @@ -190,7 +245,7 @@ static u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; -static const u64 nehalem_hw_cache_event_ids +static __initconst u64 nehalem_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -281,7 +336,7 @@ static const u64 nehalem_hw_cache_event_ids }, }; -static const u64 core2_hw_cache_event_ids +static __initconst u64 core2_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -372,7 +427,7 @@ static const u64 core2_hw_cache_event_ids }, }; -static const u64 atom_hw_cache_event_ids +static __initconst u64 atom_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -463,25 +518,25 @@ static const u64 atom_hw_cache_event_ids }, }; -static u64 intel_pmu_raw_event(u64 event) +static u64 intel_pmu_raw_event(u64 hw_event) { #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL #define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL #define CORE_EVNTSEL_INV_MASK 0x00800000ULL -#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL +#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL #define CORE_EVNTSEL_MASK \ (CORE_EVNTSEL_EVENT_MASK | \ CORE_EVNTSEL_UNIT_MASK | \ CORE_EVNTSEL_EDGE_MASK | \ CORE_EVNTSEL_INV_MASK | \ - CORE_EVNTSEL_COUNTER_MASK) + CORE_EVNTSEL_REG_MASK) - return event & CORE_EVNTSEL_MASK; + return hw_event & CORE_EVNTSEL_MASK; } -static const u64 amd_hw_cache_event_ids +static __initconst u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -585,39 +640,39 @@ static const u64 amd_perfmon_event_map[] = [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, }; -static u64 amd_pmu_event_map(int event) +static u64 amd_pmu_event_map(int hw_event) { - return amd_perfmon_event_map[event]; + return amd_perfmon_event_map[hw_event]; } -static u64 amd_pmu_raw_event(u64 event) +static u64 amd_pmu_raw_event(u64 hw_event) { #define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL #define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL #define K7_EVNTSEL_EDGE_MASK 0x000040000ULL #define K7_EVNTSEL_INV_MASK 0x000800000ULL -#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL +#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL #define K7_EVNTSEL_MASK \ (K7_EVNTSEL_EVENT_MASK | \ K7_EVNTSEL_UNIT_MASK | \ K7_EVNTSEL_EDGE_MASK | \ K7_EVNTSEL_INV_MASK | \ - K7_EVNTSEL_COUNTER_MASK) + K7_EVNTSEL_REG_MASK) - return event & K7_EVNTSEL_MASK; + return hw_event & K7_EVNTSEL_MASK; } /* - * Propagate counter elapsed time into the generic counter. - * Can only be executed on the CPU where the counter is active. + * Propagate event elapsed time into the generic event. + * Can only be executed on the CPU where the event is active. * Returns the delta events processed. */ static u64 -x86_perf_counter_update(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +x86_perf_event_update(struct perf_event *event, + struct hw_perf_event *hwc, int idx) { - int shift = 64 - x86_pmu.counter_bits; + int shift = 64 - x86_pmu.event_bits; u64 prev_raw_count, new_raw_count; s64 delta; @@ -625,15 +680,15 @@ x86_perf_counter_update(struct perf_counter *counter, return 0; /* - * Careful: an NMI might modify the previous counter value. + * Careful: an NMI might modify the previous event value. * * Our tactic to handle this is to first atomically read and * exchange a new raw count - then add that new-prev delta - * count to the generic counter atomically: + * count to the generic event atomically: */ again: prev_raw_count = atomic64_read(&hwc->prev_count); - rdmsrl(hwc->counter_base + idx, new_raw_count); + rdmsrl(hwc->event_base + idx, new_raw_count); if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) @@ -642,7 +697,7 @@ again: /* * Now we have the new raw value and have updated the prev * timestamp already. We can now calculate the elapsed delta - * (counter-)time and add that to the generic counter. + * (event-)time and add that to the generic event. * * Careful, not all hw sign-extends above the physical width * of the count. @@ -650,13 +705,13 @@ again: delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &counter->count); + atomic64_add(delta, &event->count); atomic64_sub(delta, &hwc->period_left); return new_raw_count; } -static atomic_t active_counters; +static atomic_t active_events; static DEFINE_MUTEX(pmc_reserve_mutex); static bool reserve_pmc_hardware(void) @@ -667,12 +722,12 @@ static bool reserve_pmc_hardware(void) if (nmi_watchdog == NMI_LOCAL_APIC) disable_lapic_nmi_watchdog(); - for (i = 0; i < x86_pmu.num_counters; i++) { + for (i = 0; i < x86_pmu.num_events; i++) { if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) goto perfctr_fail; } - for (i = 0; i < x86_pmu.num_counters; i++) { + for (i = 0; i < x86_pmu.num_events; i++) { if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) goto eventsel_fail; } @@ -685,7 +740,7 @@ eventsel_fail: for (i--; i >= 0; i--) release_evntsel_nmi(x86_pmu.eventsel + i); - i = x86_pmu.num_counters; + i = x86_pmu.num_events; perfctr_fail: for (i--; i >= 0; i--) @@ -703,7 +758,7 @@ static void release_pmc_hardware(void) #ifdef CONFIG_X86_LOCAL_APIC int i; - for (i = 0; i < x86_pmu.num_counters; i++) { + for (i = 0; i < x86_pmu.num_events; i++) { release_perfctr_nmi(x86_pmu.perfctr + i); release_evntsel_nmi(x86_pmu.eventsel + i); } @@ -720,7 +775,7 @@ static inline bool bts_available(void) static inline void init_debug_store_on_cpu(int cpu) { - struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; if (!ds) return; @@ -732,7 +787,7 @@ static inline void init_debug_store_on_cpu(int cpu) static inline void fini_debug_store_on_cpu(int cpu) { - if (!per_cpu(cpu_hw_counters, cpu).ds) + if (!per_cpu(cpu_hw_events, cpu).ds) return; wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); @@ -751,12 +806,12 @@ static void release_bts_hardware(void) fini_debug_store_on_cpu(cpu); for_each_possible_cpu(cpu) { - struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; if (!ds) continue; - per_cpu(cpu_hw_counters, cpu).ds = NULL; + per_cpu(cpu_hw_events, cpu).ds = NULL; kfree((void *)(unsigned long)ds->bts_buffer_base); kfree(ds); @@ -796,7 +851,7 @@ static int reserve_bts_hardware(void) ds->bts_interrupt_threshold = ds->bts_absolute_maximum - BTS_OVFL_TH; - per_cpu(cpu_hw_counters, cpu).ds = ds; + per_cpu(cpu_hw_events, cpu).ds = ds; err = 0; } @@ -812,9 +867,9 @@ static int reserve_bts_hardware(void) return err; } -static void hw_perf_counter_destroy(struct perf_counter *counter) +static void hw_perf_event_destroy(struct perf_event *event) { - if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { + if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { release_pmc_hardware(); release_bts_hardware(); mutex_unlock(&pmc_reserve_mutex); @@ -827,7 +882,7 @@ static inline int x86_pmu_initialized(void) } static inline int -set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) +set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) { unsigned int cache_type, cache_op, cache_result; u64 config, val; @@ -880,7 +935,7 @@ static void intel_pmu_enable_bts(u64 config) static void intel_pmu_disable_bts(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); unsigned long debugctlmsr; if (!cpuc->ds) @@ -898,10 +953,10 @@ static void intel_pmu_disable_bts(void) /* * Setup the hardware configuration for a given attr_type */ -static int __hw_perf_counter_init(struct perf_counter *counter) +static int __hw_perf_event_init(struct perf_event *event) { - struct perf_counter_attr *attr = &counter->attr; - struct hw_perf_counter *hwc = &counter->hw; + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; u64 config; int err; @@ -909,27 +964,31 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -ENODEV; err = 0; - if (!atomic_inc_not_zero(&active_counters)) { + if (!atomic_inc_not_zero(&active_events)) { mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&active_counters) == 0) { + if (atomic_read(&active_events) == 0) { if (!reserve_pmc_hardware()) err = -EBUSY; else err = reserve_bts_hardware(); } if (!err) - atomic_inc(&active_counters); + atomic_inc(&active_events); mutex_unlock(&pmc_reserve_mutex); } if (err) return err; + event->destroy = hw_perf_event_destroy; + /* * Generate PMC IRQs: * (keep 'enabled' bit clear for now) */ hwc->config = ARCH_PERFMON_EVENTSEL_INT; + hwc->idx = -1; + /* * Count user and OS events unless requested not to. */ @@ -946,17 +1005,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * If we have a PMU initialized but no APIC * interrupts, we cannot sample hardware - * counters (user-space has to fall back and - * sample via a hrtimer based software counter): + * events (user-space has to fall back and + * sample via a hrtimer based software event): */ if (!x86_pmu.apic) return -EOPNOTSUPP; } - counter->destroy = hw_perf_counter_destroy; - /* - * Raw event type provide the config in the event structure + * Raw hw_event type provide the config in the hw_event structure */ if (attr->type == PERF_TYPE_RAW) { hwc->config |= x86_pmu.raw_event(attr->config); @@ -1001,7 +1058,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) static void p6_pmu_disable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); u64 val; if (!cpuc->enabled) @@ -1018,7 +1075,7 @@ static void p6_pmu_disable_all(void) static void intel_pmu_disable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); if (!cpuc->enabled) return; @@ -1034,7 +1091,7 @@ static void intel_pmu_disable_all(void) static void amd_pmu_disable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; if (!cpuc->enabled) @@ -1043,12 +1100,12 @@ static void amd_pmu_disable_all(void) cpuc->enabled = 0; /* * ensure we write the disable before we start disabling the - * counters proper, so that amd_pmu_enable_counter() does the + * events proper, so that amd_pmu_enable_event() does the * right thing. */ barrier(); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for (idx = 0; idx < x86_pmu.num_events; idx++) { u64 val; if (!test_bit(idx, cpuc->active_mask)) @@ -1070,7 +1127,7 @@ void hw_perf_disable(void) static void p6_pmu_enable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); unsigned long val; if (cpuc->enabled) @@ -1087,7 +1144,7 @@ static void p6_pmu_enable_all(void) static void intel_pmu_enable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); if (cpuc->enabled) return; @@ -1098,19 +1155,19 @@ static void intel_pmu_enable_all(void) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { - struct perf_counter *counter = - cpuc->counters[X86_PMC_IDX_FIXED_BTS]; + struct perf_event *event = + cpuc->events[X86_PMC_IDX_FIXED_BTS]; - if (WARN_ON_ONCE(!counter)) + if (WARN_ON_ONCE(!event)) return; - intel_pmu_enable_bts(counter->hw.config); + intel_pmu_enable_bts(event->hw.config); } } static void amd_pmu_enable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; if (cpuc->enabled) @@ -1119,14 +1176,14 @@ static void amd_pmu_enable_all(void) cpuc->enabled = 1; barrier(); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct perf_counter *counter = cpuc->counters[idx]; + for (idx = 0; idx < x86_pmu.num_events; idx++) { + struct perf_event *event = cpuc->events[idx]; u64 val; if (!test_bit(idx, cpuc->active_mask)) continue; - val = counter->hw.config; + val = event->hw.config; val |= ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } @@ -1153,19 +1210,19 @@ static inline void intel_pmu_ack_status(u64 ack) wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) { (void)checking_wrmsrl(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); } -static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) { (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); } static inline void -intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) +intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; @@ -1178,10 +1235,10 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) } static inline void -p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - u64 val = P6_NOP_COUNTER; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + u64 val = P6_NOP_EVENT; if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -1190,7 +1247,7 @@ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) } static inline void -intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) { if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { intel_pmu_disable_bts(); @@ -1202,24 +1259,24 @@ intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) return; } - x86_pmu_disable_counter(hwc, idx); + x86_pmu_disable_event(hwc, idx); } static inline void -amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +amd_pmu_disable_event(struct hw_perf_event *hwc, int idx) { - x86_pmu_disable_counter(hwc, idx); + x86_pmu_disable_event(hwc, idx); } -static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); +static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. - * To be called with the counter disabled in hw: + * To be called with the event disabled in hw: */ static int -x86_perf_counter_set_period(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +x86_perf_event_set_period(struct perf_event *event, + struct hw_perf_event *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); s64 period = hwc->sample_period; @@ -1245,7 +1302,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, ret = 1; } /* - * Quirk: certain CPUs dont like it if just 1 event is left: + * Quirk: certain CPUs dont like it if just 1 hw_event is left: */ if (unlikely(left < 2)) left = 2; @@ -1253,24 +1310,24 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (left > x86_pmu.max_period) left = x86_pmu.max_period; - per_cpu(prev_left[idx], smp_processor_id()) = left; + per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; /* - * The hw counter starts counting from this counter offset, + * The hw event starts counting from this event offset, * mark it to be able to extra future deltas: */ atomic64_set(&hwc->prev_count, (u64)-left); - err = checking_wrmsrl(hwc->counter_base + idx, - (u64)(-left) & x86_pmu.counter_mask); + err = checking_wrmsrl(hwc->event_base + idx, + (u64)(-left) & x86_pmu.event_mask); - perf_counter_update_userpage(counter); + perf_event_update_userpage(event); return ret; } static inline void -intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) +intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; @@ -1295,9 +1352,9 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) err = checking_wrmsrl(hwc->config_base, ctrl_val); } -static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); u64 val; val = hwc->config; @@ -1308,10 +1365,10 @@ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) } -static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) { if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - if (!__get_cpu_var(cpu_hw_counters).enabled) + if (!__get_cpu_var(cpu_hw_events).enabled) return; intel_pmu_enable_bts(hwc->config); @@ -1323,134 +1380,189 @@ static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) return; } - x86_pmu_enable_counter(hwc, idx); + x86_pmu_enable_event(hwc, idx); } -static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); if (cpuc->enabled) - x86_pmu_enable_counter(hwc, idx); + x86_pmu_enable_event(hwc, idx); } -static int -fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) +static int fixed_mode_idx(struct hw_perf_event *hwc) { - unsigned int event; + unsigned int hw_event; - event = hwc->config & ARCH_PERFMON_EVENT_MASK; + hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely((event == + if (unlikely((hw_event == x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && (hwc->sample_period == 1))) return X86_PMC_IDX_FIXED_BTS; - if (!x86_pmu.num_counters_fixed) + if (!x86_pmu.num_events_fixed) return -1; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) + /* + * fixed counters do not take all possible filters + */ + if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK) + return -1; + + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; } /* - * Find a PMC slot for the freshly enabled / scheduled in counter: + * generic counter allocator: get next free counter */ -static int x86_pmu_enable(struct perf_counter *counter) +static int +gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; int idx; - idx = fixed_mode_idx(counter, hwc); + idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events); + return idx == x86_pmu.num_events ? -1 : idx; +} + +/* + * intel-specific counter allocator: check event constraints + */ +static int +intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) +{ + const struct event_constraint *event_constraint; + int i, code; + + if (!event_constraints) + goto skip; + + code = hwc->config & CORE_EVNTSEL_EVENT_MASK; + + for_each_event_constraint(event_constraint, event_constraints) { + if (code == event_constraint->code) { + for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) { + if (!test_and_set_bit(i, cpuc->used_mask)) + return i; + } + return -1; + } + } +skip: + return gen_get_event_idx(cpuc, hwc); +} + +static int +x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) +{ + int idx; + + idx = fixed_mode_idx(hwc); if (idx == X86_PMC_IDX_FIXED_BTS) { /* BTS is already occupied. */ if (test_and_set_bit(idx, cpuc->used_mask)) return -EAGAIN; hwc->config_base = 0; - hwc->counter_base = 0; + hwc->event_base = 0; hwc->idx = idx; } else if (idx >= 0) { /* - * Try to get the fixed counter, if that is already taken - * then try to get a generic counter: + * Try to get the fixed event, if that is already taken + * then try to get a generic event: */ if (test_and_set_bit(idx, cpuc->used_mask)) goto try_generic; hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; /* - * We set it so that counter_base + idx in wrmsr/rdmsr maps to + * We set it so that event_base + idx in wrmsr/rdmsr maps to * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: */ - hwc->counter_base = + hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; hwc->idx = idx; } else { idx = hwc->idx; - /* Try to get the previous generic counter again */ - if (test_and_set_bit(idx, cpuc->used_mask)) { + /* Try to get the previous generic event again */ + if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) { try_generic: - idx = find_first_zero_bit(cpuc->used_mask, - x86_pmu.num_counters); - if (idx == x86_pmu.num_counters) + idx = x86_pmu.get_event_idx(cpuc, hwc); + if (idx == -1) return -EAGAIN; set_bit(idx, cpuc->used_mask); hwc->idx = idx; } - hwc->config_base = x86_pmu.eventsel; - hwc->counter_base = x86_pmu.perfctr; + hwc->config_base = x86_pmu.eventsel; + hwc->event_base = x86_pmu.perfctr; } - perf_counters_lapic_init(); + return idx; +} + +/* + * Find a PMC slot for the freshly enabled / scheduled in event: + */ +static int x86_pmu_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx; + + idx = x86_schedule_event(cpuc, hwc); + if (idx < 0) + return idx; + + perf_events_lapic_init(); x86_pmu.disable(hwc, idx); - cpuc->counters[idx] = counter; + cpuc->events[idx] = event; set_bit(idx, cpuc->active_mask); - x86_perf_counter_set_period(counter, hwc, idx); + x86_perf_event_set_period(event, hwc, idx); x86_pmu.enable(hwc, idx); - perf_counter_update_userpage(counter); + perf_event_update_userpage(event); return 0; } -static void x86_pmu_unthrottle(struct perf_counter *counter) +static void x86_pmu_unthrottle(struct perf_event *event) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || - cpuc->counters[hwc->idx] != counter)) + cpuc->events[hwc->idx] != event)) return; x86_pmu.enable(hwc, hwc->idx); } -void perf_counter_print_debug(void) +void perf_event_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; - struct cpu_hw_counters *cpuc; + struct cpu_hw_events *cpuc; unsigned long flags; int cpu, idx; - if (!x86_pmu.num_counters) + if (!x86_pmu.num_events) return; local_irq_save(flags); cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_counters, cpu); + cpuc = &per_cpu(cpu_hw_events, cpu); if (x86_pmu.version >= 2) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); @@ -1466,11 +1578,11 @@ void perf_counter_print_debug(void) } pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for (idx = 0; idx < x86_pmu.num_events; idx++) { rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); rdmsrl(x86_pmu.perfctr + idx, pmc_count); - prev_left = per_cpu(prev_left[idx], cpu); + prev_left = per_cpu(pmc_prev_left[idx], cpu); pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); @@ -1479,7 +1591,7 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", @@ -1488,8 +1600,7 @@ void perf_counter_print_debug(void) local_irq_restore(flags); } -static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc, - struct perf_sample_data *data) +static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) { struct debug_store *ds = cpuc->ds; struct bts_record { @@ -1497,11 +1608,14 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc, u64 to; u64 flags; }; - struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS]; - unsigned long orig_ip = data->regs->ip; + struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; struct bts_record *at, *top; + struct perf_output_handle handle; + struct perf_event_header header; + struct perf_sample_data data; + struct pt_regs regs; - if (!counter) + if (!event) return; if (!ds) @@ -1510,26 +1624,45 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc, at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; top = (struct bts_record *)(unsigned long)ds->bts_index; + if (top <= at) + return; + ds->bts_index = ds->bts_buffer_base; + + data.period = event->hw.last_period; + data.addr = 0; + regs.ip = 0; + + /* + * Prepare a generic sample, i.e. fill in the invariant fields. + * We will overwrite the from and to address before we output + * the sample. + */ + perf_prepare_sample(&header, &data, event, ®s); + + if (perf_output_begin(&handle, event, + header.size * (top - at), 1, 1)) + return; + for (; at < top; at++) { - data->regs->ip = at->from; - data->addr = at->to; + data.ip = at->from; + data.addr = at->to; - perf_counter_output(counter, 1, data); + perf_output_sample(&handle, &header, &data, event); } - data->regs->ip = orig_ip; - data->addr = 0; + perf_output_end(&handle); /* There's new data available. */ - counter->pending_kill = POLL_IN; + event->hw.interrupts++; + event->pending_kill = POLL_IN; } -static void x86_pmu_disable(struct perf_counter *counter) +static void x86_pmu_disable(struct perf_event *event) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; /* @@ -1541,67 +1674,63 @@ static void x86_pmu_disable(struct perf_counter *counter) /* * Make sure the cleared pointer becomes visible before we - * (potentially) free the counter: + * (potentially) free the event: */ barrier(); /* - * Drain the remaining delta count out of a counter + * Drain the remaining delta count out of a event * that we are disabling: */ - x86_perf_counter_update(counter, hwc, idx); + x86_perf_event_update(event, hwc, idx); /* Drain the remaining BTS records. */ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - struct perf_sample_data data; - struct pt_regs regs; + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) + intel_pmu_drain_bts_buffer(cpuc); - data.regs = ®s; - intel_pmu_drain_bts_buffer(cpuc, &data); - } - cpuc->counters[idx] = NULL; + cpuc->events[idx] = NULL; clear_bit(idx, cpuc->used_mask); - perf_counter_update_userpage(counter); + perf_event_update_userpage(event); } /* - * Save and restart an expired counter. Called by NMI contexts, - * so it has to be careful about preempting normal counter ops: + * Save and restart an expired event. Called by NMI contexts, + * so it has to be careful about preempting normal event ops: */ -static int intel_pmu_save_and_restart(struct perf_counter *counter) +static int intel_pmu_save_and_restart(struct perf_event *event) { - struct hw_perf_counter *hwc = &counter->hw; + struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; int ret; - x86_perf_counter_update(counter, hwc, idx); - ret = x86_perf_counter_set_period(counter, hwc, idx); + x86_perf_event_update(event, hwc, idx); + ret = x86_perf_event_set_period(event, hwc, idx); - if (counter->state == PERF_COUNTER_STATE_ACTIVE) - intel_pmu_enable_counter(hwc, idx); + if (event->state == PERF_EVENT_STATE_ACTIVE) + intel_pmu_enable_event(hwc, idx); return ret; } static void intel_pmu_reset(void) { - struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds; + struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; unsigned long flags; int idx; - if (!x86_pmu.num_counters) + if (!x86_pmu.num_events) return; local_irq_save(flags); printk("clearing PMU state on CPU#%d\n", smp_processor_id()); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for (idx = 0; idx < x86_pmu.num_events; idx++) { checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); } if (ds) @@ -1613,39 +1742,38 @@ static void intel_pmu_reset(void) static int p6_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; - struct cpu_hw_counters *cpuc; - struct perf_counter *counter; - struct hw_perf_counter *hwc; + struct cpu_hw_events *cpuc; + struct perf_event *event; + struct hw_perf_event *hwc; int idx, handled = 0; u64 val; - data.regs = regs; data.addr = 0; - cpuc = &__get_cpu_var(cpu_hw_counters); + cpuc = &__get_cpu_var(cpu_hw_events); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for (idx = 0; idx < x86_pmu.num_events; idx++) { if (!test_bit(idx, cpuc->active_mask)) continue; - counter = cpuc->counters[idx]; - hwc = &counter->hw; + event = cpuc->events[idx]; + hwc = &event->hw; - val = x86_perf_counter_update(counter, hwc, idx); - if (val & (1ULL << (x86_pmu.counter_bits - 1))) + val = x86_perf_event_update(event, hwc, idx); + if (val & (1ULL << (x86_pmu.event_bits - 1))) continue; /* - * counter overflow + * event overflow */ handled = 1; - data.period = counter->hw.last_period; + data.period = event->hw.last_period; - if (!x86_perf_counter_set_period(counter, hwc, idx)) + if (!x86_perf_event_set_period(event, hwc, idx)) continue; - if (perf_counter_overflow(counter, 1, &data)) - p6_pmu_disable_counter(hwc, idx); + if (perf_event_overflow(event, 1, &data, regs)) + p6_pmu_disable_event(hwc, idx); } if (handled) @@ -1661,17 +1789,16 @@ static int p6_pmu_handle_irq(struct pt_regs *regs) static int intel_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; - struct cpu_hw_counters *cpuc; + struct cpu_hw_events *cpuc; int bit, loops; u64 ack, status; - data.regs = regs; data.addr = 0; - cpuc = &__get_cpu_var(cpu_hw_counters); + cpuc = &__get_cpu_var(cpu_hw_events); perf_disable(); - intel_pmu_drain_bts_buffer(cpuc, &data); + intel_pmu_drain_bts_buffer(cpuc); status = intel_pmu_get_status(); if (!status) { perf_enable(); @@ -1681,8 +1808,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) loops = 0; again: if (++loops > 100) { - WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); - perf_counter_print_debug(); + WARN_ONCE(1, "perfevents: irq loop stuck!\n"); + perf_event_print_debug(); intel_pmu_reset(); perf_enable(); return 1; @@ -1691,19 +1818,19 @@ again: inc_irq_stat(apic_perf_irqs); ack = status; for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { - struct perf_counter *counter = cpuc->counters[bit]; + struct perf_event *event = cpuc->events[bit]; clear_bit(bit, (unsigned long *) &status); if (!test_bit(bit, cpuc->active_mask)) continue; - if (!intel_pmu_save_and_restart(counter)) + if (!intel_pmu_save_and_restart(event)) continue; - data.period = counter->hw.last_period; + data.period = event->hw.last_period; - if (perf_counter_overflow(counter, 1, &data)) - intel_pmu_disable_counter(&counter->hw, bit); + if (perf_event_overflow(event, 1, &data, regs)) + intel_pmu_disable_event(&event->hw, bit); } intel_pmu_ack_status(ack); @@ -1723,39 +1850,38 @@ again: static int amd_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; - struct cpu_hw_counters *cpuc; - struct perf_counter *counter; - struct hw_perf_counter *hwc; + struct cpu_hw_events *cpuc; + struct perf_event *event; + struct hw_perf_event *hwc; int idx, handled = 0; u64 val; - data.regs = regs; data.addr = 0; - cpuc = &__get_cpu_var(cpu_hw_counters); + cpuc = &__get_cpu_var(cpu_hw_events); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for (idx = 0; idx < x86_pmu.num_events; idx++) { if (!test_bit(idx, cpuc->active_mask)) continue; - counter = cpuc->counters[idx]; - hwc = &counter->hw; + event = cpuc->events[idx]; + hwc = &event->hw; - val = x86_perf_counter_update(counter, hwc, idx); - if (val & (1ULL << (x86_pmu.counter_bits - 1))) + val = x86_perf_event_update(event, hwc, idx); + if (val & (1ULL << (x86_pmu.event_bits - 1))) continue; /* - * counter overflow + * event overflow */ handled = 1; - data.period = counter->hw.last_period; + data.period = event->hw.last_period; - if (!x86_perf_counter_set_period(counter, hwc, idx)) + if (!x86_perf_event_set_period(event, hwc, idx)) continue; - if (perf_counter_overflow(counter, 1, &data)) - amd_pmu_disable_counter(hwc, idx); + if (perf_event_overflow(event, 1, &data, regs)) + amd_pmu_disable_event(hwc, idx); } if (handled) @@ -1769,18 +1895,21 @@ void smp_perf_pending_interrupt(struct pt_regs *regs) irq_enter(); ack_APIC_irq(); inc_irq_stat(apic_pending_irqs); - perf_counter_do_pending(); + perf_event_do_pending(); irq_exit(); } -void set_perf_counter_pending(void) +void set_perf_event_pending(void) { #ifdef CONFIG_X86_LOCAL_APIC + if (!x86_pmu.apic || !x86_pmu_initialized()) + return; + apic->send_IPI_self(LOCAL_PENDING_VECTOR); #endif } -void perf_counters_lapic_init(void) +void perf_events_lapic_init(void) { #ifdef CONFIG_X86_LOCAL_APIC if (!x86_pmu.apic || !x86_pmu_initialized()) @@ -1794,13 +1923,13 @@ void perf_counters_lapic_init(void) } static int __kprobes -perf_counter_nmi_handler(struct notifier_block *self, +perf_event_nmi_handler(struct notifier_block *self, unsigned long cmd, void *__args) { struct die_args *args = __args; struct pt_regs *regs; - if (!atomic_read(&active_counters)) + if (!atomic_read(&active_events)) return NOTIFY_DONE; switch (cmd) { @@ -1819,7 +1948,7 @@ perf_counter_nmi_handler(struct notifier_block *self, #endif /* * Can't rely on the handled return value to say it was our NMI, two - * counters could trigger 'simultaneously' raising two back-to-back NMIs. + * events could trigger 'simultaneously' raising two back-to-back NMIs. * * If the first NMI handles both, the latter will be empty and daze * the CPU. @@ -1829,19 +1958,19 @@ perf_counter_nmi_handler(struct notifier_block *self, return NOTIFY_STOP; } -static __read_mostly struct notifier_block perf_counter_nmi_notifier = { - .notifier_call = perf_counter_nmi_handler, +static __read_mostly struct notifier_block perf_event_nmi_notifier = { + .notifier_call = perf_event_nmi_handler, .next = NULL, .priority = 1 }; -static struct x86_pmu p6_pmu = { +static __initconst struct x86_pmu p6_pmu = { .name = "p6", .handle_irq = p6_pmu_handle_irq, .disable_all = p6_pmu_disable_all, .enable_all = p6_pmu_enable_all, - .enable = p6_pmu_enable_counter, - .disable = p6_pmu_disable_counter, + .enable = p6_pmu_enable_event, + .disable = p6_pmu_disable_event, .eventsel = MSR_P6_EVNTSEL0, .perfctr = MSR_P6_PERFCTR0, .event_map = p6_pmu_event_map, @@ -1850,25 +1979,26 @@ static struct x86_pmu p6_pmu = { .apic = 1, .max_period = (1ULL << 31) - 1, .version = 0, - .num_counters = 2, + .num_events = 2, /* - * Counters have 40 bits implemented. However they are designed such + * Events have 40 bits implemented. However they are designed such * that bits [32-39] are sign extensions of bit 31. As such the - * effective width of a counter for P6-like PMU is 32 bits only. + * effective width of a event for P6-like PMU is 32 bits only. * * See IA-32 Intel Architecture Software developer manual Vol 3B */ - .counter_bits = 32, - .counter_mask = (1ULL << 32) - 1, + .event_bits = 32, + .event_mask = (1ULL << 32) - 1, + .get_event_idx = intel_get_event_idx, }; -static struct x86_pmu intel_pmu = { +static __initconst struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, .disable_all = intel_pmu_disable_all, .enable_all = intel_pmu_enable_all, - .enable = intel_pmu_enable_counter, - .disable = intel_pmu_disable_counter, + .enable = intel_pmu_enable_event, + .disable = intel_pmu_disable_event, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, .perfctr = MSR_ARCH_PERFMON_PERFCTR0, .event_map = intel_pmu_event_map, @@ -1878,34 +2008,36 @@ static struct x86_pmu intel_pmu = { /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of - * the generic counter period: + * the generic event period: */ .max_period = (1ULL << 31) - 1, .enable_bts = intel_pmu_enable_bts, .disable_bts = intel_pmu_disable_bts, + .get_event_idx = intel_get_event_idx, }; -static struct x86_pmu amd_pmu = { +static __initconst struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, .disable_all = amd_pmu_disable_all, .enable_all = amd_pmu_enable_all, - .enable = amd_pmu_enable_counter, - .disable = amd_pmu_disable_counter, + .enable = amd_pmu_enable_event, + .disable = amd_pmu_disable_event, .eventsel = MSR_K7_EVNTSEL0, .perfctr = MSR_K7_PERFCTR0, .event_map = amd_pmu_event_map, .raw_event = amd_pmu_raw_event, .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = 4, - .counter_bits = 48, - .counter_mask = (1ULL << 48) - 1, + .num_events = 4, + .event_bits = 48, + .event_mask = (1ULL << 48) - 1, .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, + .get_event_idx = gen_get_event_idx, }; -static int p6_pmu_init(void) +static __init int p6_pmu_init(void) { switch (boot_cpu_data.x86_model) { case 1: @@ -1915,10 +2047,12 @@ static int p6_pmu_init(void) case 7: case 8: case 11: /* Pentium III */ + event_constraints = intel_p6_event_constraints; break; case 9: case 13: /* Pentium M */ + event_constraints = intel_p6_event_constraints; break; default: pr_cont("unsupported p6 CPU model %d ", @@ -1937,7 +2071,7 @@ static int p6_pmu_init(void) return 0; } -static int intel_pmu_init(void) +static __init int intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; @@ -1956,7 +2090,7 @@ static int intel_pmu_init(void) /* * Check whether the Architectural PerfMon supports - * Branch Misses Retired Event or not. + * Branch Misses Retired hw_event or not. */ cpuid(10, &eax.full, &ebx, &unused, &edx.full); if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) @@ -1968,15 +2102,15 @@ static int intel_pmu_init(void) x86_pmu = intel_pmu; x86_pmu.version = version; - x86_pmu.num_counters = eax.split.num_counters; - x86_pmu.counter_bits = eax.split.bit_width; - x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.num_events = eax.split.num_events; + x86_pmu.event_bits = eax.split.bit_width; + x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; /* - * Quirk: v2 perfmon does not report fixed-purpose counters, so - * assume at least 3 counters: + * Quirk: v2 perfmon does not report fixed-purpose events, so + * assume at least 3 events: */ - x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); + x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); /* * Install the hw-cache-events table: @@ -1990,12 +2124,14 @@ static int intel_pmu_init(void) sizeof(hw_cache_event_ids)); pr_cont("Core2 events, "); + event_constraints = intel_core_event_constraints; break; default: case 26: memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + event_constraints = intel_nehalem_event_constraints; pr_cont("Nehalem/Corei7 events, "); break; case 28: @@ -2008,7 +2144,7 @@ static int intel_pmu_init(void) return 0; } -static int amd_pmu_init(void) +static __init int amd_pmu_init(void) { /* Performance-monitoring supported from K7 and later: */ if (boot_cpu_data.x86 < 6) @@ -2023,11 +2159,11 @@ static int amd_pmu_init(void) return 0; } -void __init init_hw_perf_counters(void) +void __init init_hw_perf_events(void) { int err; - pr_info("Performance Counters: "); + pr_info("Performance Events: "); switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: @@ -2040,45 +2176,45 @@ void __init init_hw_perf_counters(void) return; } if (err != 0) { - pr_cont("no PMU driver, software counters only.\n"); + pr_cont("no PMU driver, software events only.\n"); return; } pr_cont("%s PMU driver.\n", x86_pmu.name); - if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { - WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - x86_pmu.num_counters, X86_PMC_MAX_GENERIC); - x86_pmu.num_counters = X86_PMC_MAX_GENERIC; + if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { + WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", + x86_pmu.num_events, X86_PMC_MAX_GENERIC); + x86_pmu.num_events = X86_PMC_MAX_GENERIC; } - perf_counter_mask = (1 << x86_pmu.num_counters) - 1; - perf_max_counters = x86_pmu.num_counters; + perf_event_mask = (1 << x86_pmu.num_events) - 1; + perf_max_events = x86_pmu.num_events; - if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { - WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", - x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); - x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; + if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) { + WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", + x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED); + x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED; } - perf_counter_mask |= - ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; - x86_pmu.intel_ctrl = perf_counter_mask; + perf_event_mask |= + ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED; + x86_pmu.intel_ctrl = perf_event_mask; - perf_counters_lapic_init(); - register_die_notifier(&perf_counter_nmi_notifier); + perf_events_lapic_init(); + register_die_notifier(&perf_event_nmi_notifier); - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.counter_bits); - pr_info("... generic counters: %d\n", x86_pmu.num_counters); - pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed); - pr_info("... counter mask: %016Lx\n", perf_counter_mask); + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.event_bits); + pr_info("... generic registers: %d\n", x86_pmu.num_events); + pr_info("... value mask: %016Lx\n", x86_pmu.event_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); + pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); + pr_info("... event mask: %016Lx\n", perf_event_mask); } -static inline void x86_pmu_read(struct perf_counter *counter) +static inline void x86_pmu_read(struct perf_event *event) { - x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); + x86_perf_event_update(event, &event->hw, event->hw.idx); } static const struct pmu pmu = { @@ -2088,13 +2224,52 @@ static const struct pmu pmu = { .unthrottle = x86_pmu_unthrottle, }; -const struct pmu *hw_perf_counter_init(struct perf_counter *counter) +static int +validate_event(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct hw_perf_event fake_event = event->hw; + + if (event->pmu && event->pmu != &pmu) + return 0; + + return x86_schedule_event(cpuc, &fake_event) >= 0; +} + +static int validate_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + struct cpu_hw_events fake_pmu; + + memset(&fake_pmu, 0, sizeof(fake_pmu)); + + if (!validate_event(&fake_pmu, leader)) + return -ENOSPC; + + list_for_each_entry(sibling, &leader->sibling_list, group_entry) { + if (!validate_event(&fake_pmu, sibling)) + return -ENOSPC; + } + + if (!validate_event(&fake_pmu, event)) + return -ENOSPC; + + return 0; +} + +const struct pmu *hw_perf_event_init(struct perf_event *event) { int err; - err = __hw_perf_counter_init(counter); - if (err) + err = __hw_perf_event_init(event); + if (!err) { + if (event->group_leader != event) + err = validate_group(event); + } + if (err) { + if (event->destroy) + event->destroy(event); return ERR_PTR(err); + } return &pmu; } @@ -2110,8 +2285,8 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) entry->ip[entry->nr++] = ip; } -static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); -static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); static DEFINE_PER_CPU(int, in_nmi_frame); @@ -2264,9 +2439,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) struct perf_callchain_entry *entry; if (in_nmi()) - entry = &__get_cpu_var(nmi_entry); + entry = &__get_cpu_var(pmc_nmi_entry); else - entry = &__get_cpu_var(irq_entry); + entry = &__get_cpu_var(pmc_irq_entry); entry->nr = 0; @@ -2275,7 +2450,7 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } -void hw_perf_counter_setup_online(int cpu) +void hw_perf_event_setup_online(int cpu) { init_debug_store_on_cpu(cpu); } diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 392bea43b89..898df9719af 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -20,7 +20,7 @@ #include <linux/kprobes.h> #include <asm/apic.h> -#include <asm/perf_counter.h> +#include <asm/perf_event.h> struct nmi_watchdog_ctlblk { unsigned int cccr_msr; @@ -712,7 +712,7 @@ static void probe_nmi_watchdog(void) switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && - boot_cpu_data.x86 != 16) + boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17) return; wd_ops = &k7_wd_ops; break; diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c new file mode 100644 index 00000000000..a640ae5ad20 --- /dev/null +++ b/arch/x86/kernel/cpu/sched.c @@ -0,0 +1,55 @@ +#include <linux/sched.h> +#include <linux/math64.h> +#include <linux/percpu.h> +#include <linux/irqflags.h> + +#include <asm/cpufeature.h> +#include <asm/processor.h> + +#ifdef CONFIG_SMP + +static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched); + +static unsigned long scale_aperfmperf(void) +{ + struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched); + unsigned long ratio, flags; + + local_irq_save(flags); + get_aperfmperf(&val); + local_irq_restore(flags); + + ratio = calc_aperfmperf_ratio(old, &val); + *old = val; + + return ratio; +} + +unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu) +{ + /* + * do aperf/mperf on the cpu level because it includes things + * like turbo mode, which are relevant to full cores. + */ + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) + return scale_aperfmperf(); + + /* + * maybe have something cpufreq here + */ + + return default_scale_freq_power(sd, cpu); +} + +unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu) +{ + /* + * aperf/mperf already includes the smt gain + */ + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) + return SCHED_LOAD_SCALE; + + return default_scale_smt_power(sd, cpu); +} + +#endif diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index bc24f514ec9..1cbed97b59c 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -24,6 +24,7 @@ #include <linux/dmi.h> #include <asm/div64.h> #include <asm/vmware.h> +#include <asm/x86_init.h> #define CPUID_VMWARE_INFO_LEAF 0x40000000 #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 @@ -47,21 +48,35 @@ static inline int __vmware_platform(void) return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; } -static unsigned long __vmware_get_tsc_khz(void) +static unsigned long vmware_get_tsc_khz(void) { uint64_t tsc_hz; uint32_t eax, ebx, ecx, edx; VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); - if (ebx == UINT_MAX) - return 0; tsc_hz = eax | (((uint64_t)ebx) << 32); do_div(tsc_hz, 1000); BUG_ON(tsc_hz >> 32); + printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", + (unsigned long) tsc_hz / 1000, + (unsigned long) tsc_hz % 1000); return tsc_hz; } +void __init vmware_platform_setup(void) +{ + uint32_t eax, ebx, ecx, edx; + + VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); + + if (ebx != UINT_MAX) + x86_platform.calibrate_tsc = vmware_get_tsc_khz; + else + printk(KERN_WARNING + "Failed to get TSC freq from the hypervisor\n"); +} + /* * While checking the dmi string infomation, just checking the product * serial key should be enough, as this will always have a VMware @@ -87,12 +102,6 @@ int vmware_platform(void) return 0; } -unsigned long vmware_get_tsc_khz(void) -{ - BUG_ON(!vmware_platform()); - return __vmware_get_tsc_khz(); -} - /* * VMware hypervisor takes care of exporting a reliable TSC to the guest. * Still, due to timing difference when running on virtual cpus, the TSC can diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 48e8e6558b2..7ef24a79699 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -177,7 +177,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier = .notifier_call = cpuid_class_cpu_callback, }; -static char *cpuid_nodename(struct device *dev) +static char *cpuid_devnode(struct device *dev, mode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); } @@ -198,7 +198,7 @@ static int __init cpuid_init(void) err = PTR_ERR(cpuid_class); goto out_chrdev; } - cpuid_class->nodename = cpuid_nodename; + cpuid_class->devnode = cpuid_devnode; for_each_online_cpu(i) { err = cpuid_device_create(i); if (err != 0) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 5e409dc298a..a4849c10a77 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -27,8 +27,7 @@ #include <asm/cpu.h> #include <asm/reboot.h> #include <asm/virtext.h> -#include <asm/iommu.h> - +#include <asm/x86_init.h> #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) @@ -106,7 +105,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #endif #ifdef CONFIG_X86_64 - pci_iommu_shutdown(); + x86_platform.iommu_shutdown(); #endif crash_save_cpu(regs, safe_smp_processor_id()); diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index f7cdb3b457a..cd97ce18c29 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -16,6 +16,22 @@ static void *kdump_buf_page; /* Stores the physical address of elf header of crash image. */ unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; +static inline bool is_crashed_pfn_valid(unsigned long pfn) +{ +#ifndef CONFIG_X86_PAE + /* + * non-PAE kdump kernel executed from a PAE one will crop high pte + * bits and poke unwanted space counting again from address 0, we + * don't want that. pte must fit into unsigned long. In fact the + * test checks high 12 bits for being zero (pfn will be shifted left + * by PAGE_SHIFT). + */ + return pte_pfn(pfn_pte(pfn, __pgprot(0))) == pfn; +#else + return true; +#endif +} + /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied @@ -41,6 +57,9 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!csize) return 0; + if (!is_crashed_pfn_valid(pfn)) + return -EFAULT; + vaddr = kmap_atomic_pfn(pfn, KM_PTE0); if (!userbuf) { diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 2d8a371d433..b8ce165dde5 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -268,11 +268,12 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) show_registers(regs); #ifdef CONFIG_X86_32 - sp = (unsigned long) (®s->sp); - savesegment(ss, ss); - if (user_mode(regs)) { + if (user_mode_vm(regs)) { sp = regs->sp; ss = regs->ss & 0xffff; + } else { + sp = kernel_stack_pointer(regs); + savesegment(ss, ss); } printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); print_symbol("%s", regs->ip); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index bca5fba91c9..f7dd2a7c3bf 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -5,7 +5,6 @@ #include <linux/kallsyms.h> #include <linux/kprobes.h> #include <linux/uaccess.h> -#include <linux/utsname.h> #include <linux/hardirq.h> #include <linux/kdebug.h> #include <linux/module.h> diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 54b0a327676..a071e6be177 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -5,7 +5,6 @@ #include <linux/kallsyms.h> #include <linux/kprobes.h> #include <linux/uaccess.h> -#include <linux/utsname.h> #include <linux/hardirq.h> #include <linux/kdebug.h> #include <linux/module.h> diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 147005a1cc3..d17d482a04f 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1331,7 +1331,7 @@ void __init e820_reserve_resources(void) struct resource *res; u64 end; - res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); + res = alloc_bootmem(sizeof(struct resource) * e820.nr_map); e820_res = res; for (i = 0; i < e820.nr_map; i++) { end = e820.map[i].addr + e820.map[i].size - 1; @@ -1378,8 +1378,8 @@ static unsigned long ram_alignment(resource_size_t pos) if (mb < 16) return 1024*1024; - /* To 32MB for anything above that */ - return 32*1024*1024; + /* To 64MB for anything above that */ + return 64*1024*1024; } #define MAX_RESOURCE_SIZE ((resource_size_t)-1) @@ -1455,28 +1455,11 @@ char *__init default_machine_specific_memory_setup(void) return who; } -char *__init __attribute__((weak)) machine_specific_memory_setup(void) -{ - if (x86_quirks->arch_memory_setup) { - char *who = x86_quirks->arch_memory_setup(); - - if (who) - return who; - } - return default_machine_specific_memory_setup(); -} - -/* Overridden in paravirt.c if CONFIG_PARAVIRT */ -char * __init __attribute__((weak)) memory_setup(void) -{ - return machine_specific_memory_setup(); -} - void __init setup_memory_map(void) { char *who; - who = memory_setup(); + who = x86_init.resources.memory_setup(); memcpy(&e820_saved, &e820, sizeof(struct e820map)); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); e820_print_map(who); diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 335f049d110..b9c830c12b4 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -160,721 +160,6 @@ static struct console early_serial_console = { .index = -1, }; -#ifdef CONFIG_EARLY_PRINTK_DBGP - -static struct ehci_caps __iomem *ehci_caps; -static struct ehci_regs __iomem *ehci_regs; -static struct ehci_dbg_port __iomem *ehci_debug; -static unsigned int dbgp_endpoint_out; - -struct ehci_dev { - u32 bus; - u32 slot; - u32 func; -}; - -static struct ehci_dev ehci_dev; - -#define USB_DEBUG_DEVNUM 127 - -#define DBGP_DATA_TOGGLE 0x8800 - -static inline u32 dbgp_pid_update(u32 x, u32 tok) -{ - return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff); -} - -static inline u32 dbgp_len_update(u32 x, u32 len) -{ - return (x & ~0x0f) | (len & 0x0f); -} - -/* - * USB Packet IDs (PIDs) - */ - -/* token */ -#define USB_PID_OUT 0xe1 -#define USB_PID_IN 0x69 -#define USB_PID_SOF 0xa5 -#define USB_PID_SETUP 0x2d -/* handshake */ -#define USB_PID_ACK 0xd2 -#define USB_PID_NAK 0x5a -#define USB_PID_STALL 0x1e -#define USB_PID_NYET 0x96 -/* data */ -#define USB_PID_DATA0 0xc3 -#define USB_PID_DATA1 0x4b -#define USB_PID_DATA2 0x87 -#define USB_PID_MDATA 0x0f -/* Special */ -#define USB_PID_PREAMBLE 0x3c -#define USB_PID_ERR 0x3c -#define USB_PID_SPLIT 0x78 -#define USB_PID_PING 0xb4 -#define USB_PID_UNDEF_0 0xf0 - -#define USB_PID_DATA_TOGGLE 0x88 -#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE) - -#define PCI_CAP_ID_EHCI_DEBUG 0xa - -#define HUB_ROOT_RESET_TIME 50 /* times are in msec */ -#define HUB_SHORT_RESET_TIME 10 -#define HUB_LONG_RESET_TIME 200 -#define HUB_RESET_TIMEOUT 500 - -#define DBGP_MAX_PACKET 8 - -static int dbgp_wait_until_complete(void) -{ - u32 ctrl; - int loop = 0x100000; - - do { - ctrl = readl(&ehci_debug->control); - /* Stop when the transaction is finished */ - if (ctrl & DBGP_DONE) - break; - } while (--loop > 0); - - if (!loop) - return -1; - - /* - * Now that we have observed the completed transaction, - * clear the done bit. - */ - writel(ctrl | DBGP_DONE, &ehci_debug->control); - return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl); -} - -static void __init dbgp_mdelay(int ms) -{ - int i; - - while (ms--) { - for (i = 0; i < 1000; i++) - outb(0x1, 0x80); - } -} - -static void dbgp_breath(void) -{ - /* Sleep to give the debug port a chance to breathe */ -} - -static int dbgp_wait_until_done(unsigned ctrl) -{ - u32 pids, lpid; - int ret; - int loop = 3; - -retry: - writel(ctrl | DBGP_GO, &ehci_debug->control); - ret = dbgp_wait_until_complete(); - pids = readl(&ehci_debug->pids); - lpid = DBGP_PID_GET(pids); - - if (ret < 0) - return ret; - - /* - * If the port is getting full or it has dropped data - * start pacing ourselves, not necessary but it's friendly. - */ - if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET)) - dbgp_breath(); - - /* If I get a NACK reissue the transmission */ - if (lpid == USB_PID_NAK) { - if (--loop > 0) - goto retry; - } - - return ret; -} - -static void dbgp_set_data(const void *buf, int size) -{ - const unsigned char *bytes = buf; - u32 lo, hi; - int i; - - lo = hi = 0; - for (i = 0; i < 4 && i < size; i++) - lo |= bytes[i] << (8*i); - for (; i < 8 && i < size; i++) - hi |= bytes[i] << (8*(i - 4)); - writel(lo, &ehci_debug->data03); - writel(hi, &ehci_debug->data47); -} - -static void __init dbgp_get_data(void *buf, int size) -{ - unsigned char *bytes = buf; - u32 lo, hi; - int i; - - lo = readl(&ehci_debug->data03); - hi = readl(&ehci_debug->data47); - for (i = 0; i < 4 && i < size; i++) - bytes[i] = (lo >> (8*i)) & 0xff; - for (; i < 8 && i < size; i++) - bytes[i] = (hi >> (8*(i - 4))) & 0xff; -} - -static int dbgp_bulk_write(unsigned devnum, unsigned endpoint, - const char *bytes, int size) -{ - u32 pids, addr, ctrl; - int ret; - - if (size > DBGP_MAX_PACKET) - return -1; - - addr = DBGP_EPADDR(devnum, endpoint); - - pids = readl(&ehci_debug->pids); - pids = dbgp_pid_update(pids, USB_PID_OUT); - - ctrl = readl(&ehci_debug->control); - ctrl = dbgp_len_update(ctrl, size); - ctrl |= DBGP_OUT; - ctrl |= DBGP_GO; - - dbgp_set_data(bytes, size); - writel(addr, &ehci_debug->address); - writel(pids, &ehci_debug->pids); - - ret = dbgp_wait_until_done(ctrl); - if (ret < 0) - return ret; - - return ret; -} - -static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, - int size) -{ - u32 pids, addr, ctrl; - int ret; - - if (size > DBGP_MAX_PACKET) - return -1; - - addr = DBGP_EPADDR(devnum, endpoint); - - pids = readl(&ehci_debug->pids); - pids = dbgp_pid_update(pids, USB_PID_IN); - - ctrl = readl(&ehci_debug->control); - ctrl = dbgp_len_update(ctrl, size); - ctrl &= ~DBGP_OUT; - ctrl |= DBGP_GO; - - writel(addr, &ehci_debug->address); - writel(pids, &ehci_debug->pids); - ret = dbgp_wait_until_done(ctrl); - if (ret < 0) - return ret; - - if (size > ret) - size = ret; - dbgp_get_data(data, size); - return ret; -} - -static int __init dbgp_control_msg(unsigned devnum, int requesttype, - int request, int value, int index, void *data, int size) -{ - u32 pids, addr, ctrl; - struct usb_ctrlrequest req; - int read; - int ret; - - read = (requesttype & USB_DIR_IN) != 0; - if (size > (read ? DBGP_MAX_PACKET:0)) - return -1; - - /* Compute the control message */ - req.bRequestType = requesttype; - req.bRequest = request; - req.wValue = cpu_to_le16(value); - req.wIndex = cpu_to_le16(index); - req.wLength = cpu_to_le16(size); - - pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP); - addr = DBGP_EPADDR(devnum, 0); - - ctrl = readl(&ehci_debug->control); - ctrl = dbgp_len_update(ctrl, sizeof(req)); - ctrl |= DBGP_OUT; - ctrl |= DBGP_GO; - - /* Send the setup message */ - dbgp_set_data(&req, sizeof(req)); - writel(addr, &ehci_debug->address); - writel(pids, &ehci_debug->pids); - ret = dbgp_wait_until_done(ctrl); - if (ret < 0) - return ret; - - /* Read the result */ - return dbgp_bulk_read(devnum, 0, data, size); -} - - -/* Find a PCI capability */ -static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap) -{ - u8 pos; - int bytes; - - if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & - PCI_STATUS_CAP_LIST)) - return 0; - - pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); - for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { - u8 id; - - pos &= ~3; - id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); - if (id == 0xff) - break; - if (id == cap) - return pos; - - pos = read_pci_config_byte(num, slot, func, - pos+PCI_CAP_LIST_NEXT); - } - return 0; -} - -static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func) -{ - u32 class; - - class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); - if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI) - return 0; - - return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG); -} - -static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc) -{ - u32 bus, slot, func; - - for (bus = 0; bus < 256; bus++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { - unsigned cap; - - cap = __find_dbgp(bus, slot, func); - - if (!cap) - continue; - if (ehci_num-- != 0) - continue; - *rbus = bus; - *rslot = slot; - *rfunc = func; - return cap; - } - } - } - return 0; -} - -static int __init ehci_reset_port(int port) -{ - u32 portsc; - u32 delay_time, delay; - int loop; - - /* Reset the usb debug port */ - portsc = readl(&ehci_regs->port_status[port - 1]); - portsc &= ~PORT_PE; - portsc |= PORT_RESET; - writel(portsc, &ehci_regs->port_status[port - 1]); - - delay = HUB_ROOT_RESET_TIME; - for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT; - delay_time += delay) { - dbgp_mdelay(delay); - - portsc = readl(&ehci_regs->port_status[port - 1]); - if (portsc & PORT_RESET) { - /* force reset to complete */ - loop = 2; - writel(portsc & ~(PORT_RWC_BITS | PORT_RESET), - &ehci_regs->port_status[port - 1]); - do { - portsc = readl(&ehci_regs->port_status[port-1]); - } while ((portsc & PORT_RESET) && (--loop > 0)); - } - - /* Device went away? */ - if (!(portsc & PORT_CONNECT)) - return -ENOTCONN; - - /* bomb out completely if something weird happend */ - if ((portsc & PORT_CSC)) - return -EINVAL; - - /* If we've finished resetting, then break out of the loop */ - if (!(portsc & PORT_RESET) && (portsc & PORT_PE)) - return 0; - } - return -EBUSY; -} - -static int __init ehci_wait_for_port(int port) -{ - u32 status; - int ret, reps; - - for (reps = 0; reps < 3; reps++) { - dbgp_mdelay(100); - status = readl(&ehci_regs->status); - if (status & STS_PCD) { - ret = ehci_reset_port(port); - if (ret == 0) - return 0; - } - } - return -ENOTCONN; -} - -#ifdef DBGP_DEBUG -# define dbgp_printk early_printk -#else -static inline void dbgp_printk(const char *fmt, ...) { } -#endif - -typedef void (*set_debug_port_t)(int port); - -static void __init default_set_debug_port(int port) -{ -} - -static set_debug_port_t __initdata set_debug_port = default_set_debug_port; - -static void __init nvidia_set_debug_port(int port) -{ - u32 dword; - dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, - 0x74); - dword &= ~(0x0f<<12); - dword |= ((port & 0x0f)<<12); - write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74, - dword); - dbgp_printk("set debug port to %d\n", port); -} - -static void __init detect_set_debug_port(void) -{ - u32 vendorid; - - vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, - 0x00); - - if ((vendorid & 0xffff) == 0x10de) { - dbgp_printk("using nvidia set_debug_port\n"); - set_debug_port = nvidia_set_debug_port; - } -} - -static int __init ehci_setup(void) -{ - struct usb_debug_descriptor dbgp_desc; - u32 cmd, ctrl, status, portsc, hcs_params; - u32 debug_port, new_debug_port = 0, n_ports; - u32 devnum; - int ret, i; - int loop; - int port_map_tried; - int playtimes = 3; - -try_next_time: - port_map_tried = 0; - -try_next_port: - - hcs_params = readl(&ehci_caps->hcs_params); - debug_port = HCS_DEBUG_PORT(hcs_params); - n_ports = HCS_N_PORTS(hcs_params); - - dbgp_printk("debug_port: %d\n", debug_port); - dbgp_printk("n_ports: %d\n", n_ports); - - for (i = 1; i <= n_ports; i++) { - portsc = readl(&ehci_regs->port_status[i-1]); - dbgp_printk("portstatus%d: %08x\n", i, portsc); - } - - if (port_map_tried && (new_debug_port != debug_port)) { - if (--playtimes) { - set_debug_port(new_debug_port); - goto try_next_time; - } - return -1; - } - - loop = 10; - /* Reset the EHCI controller */ - cmd = readl(&ehci_regs->command); - cmd |= CMD_RESET; - writel(cmd, &ehci_regs->command); - do { - cmd = readl(&ehci_regs->command); - } while ((cmd & CMD_RESET) && (--loop > 0)); - - if (!loop) { - dbgp_printk("can not reset ehci\n"); - return -1; - } - dbgp_printk("ehci reset done\n"); - - /* Claim ownership, but do not enable yet */ - ctrl = readl(&ehci_debug->control); - ctrl |= DBGP_OWNER; - ctrl &= ~(DBGP_ENABLED | DBGP_INUSE); - writel(ctrl, &ehci_debug->control); - - /* Start the ehci running */ - cmd = readl(&ehci_regs->command); - cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET); - cmd |= CMD_RUN; - writel(cmd, &ehci_regs->command); - - /* Ensure everything is routed to the EHCI */ - writel(FLAG_CF, &ehci_regs->configured_flag); - - /* Wait until the controller is no longer halted */ - loop = 10; - do { - status = readl(&ehci_regs->status); - } while ((status & STS_HALT) && (--loop > 0)); - - if (!loop) { - dbgp_printk("ehci can be started\n"); - return -1; - } - dbgp_printk("ehci started\n"); - - /* Wait for a device to show up in the debug port */ - ret = ehci_wait_for_port(debug_port); - if (ret < 0) { - dbgp_printk("No device found in debug port\n"); - goto next_debug_port; - } - dbgp_printk("ehci wait for port done\n"); - - /* Enable the debug port */ - ctrl = readl(&ehci_debug->control); - ctrl |= DBGP_CLAIM; - writel(ctrl, &ehci_debug->control); - ctrl = readl(&ehci_debug->control); - if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) { - dbgp_printk("No device in debug port\n"); - writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control); - goto err; - } - dbgp_printk("debug ported enabled\n"); - - /* Completely transfer the debug device to the debug controller */ - portsc = readl(&ehci_regs->port_status[debug_port - 1]); - portsc &= ~PORT_PE; - writel(portsc, &ehci_regs->port_status[debug_port - 1]); - - dbgp_mdelay(100); - - /* Find the debug device and make it device number 127 */ - for (devnum = 0; devnum <= 127; devnum++) { - ret = dbgp_control_msg(devnum, - USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE, - USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0, - &dbgp_desc, sizeof(dbgp_desc)); - if (ret > 0) - break; - } - if (devnum > 127) { - dbgp_printk("Could not find attached debug device\n"); - goto err; - } - if (ret < 0) { - dbgp_printk("Attached device is not a debug device\n"); - goto err; - } - dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint; - - /* Move the device to 127 if it isn't already there */ - if (devnum != USB_DEBUG_DEVNUM) { - ret = dbgp_control_msg(devnum, - USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, - USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0); - if (ret < 0) { - dbgp_printk("Could not move attached device to %d\n", - USB_DEBUG_DEVNUM); - goto err; - } - devnum = USB_DEBUG_DEVNUM; - dbgp_printk("debug device renamed to 127\n"); - } - - /* Enable the debug interface */ - ret = dbgp_control_msg(USB_DEBUG_DEVNUM, - USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, - USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0); - if (ret < 0) { - dbgp_printk(" Could not enable the debug device\n"); - goto err; - } - dbgp_printk("debug interface enabled\n"); - - /* Perform a small write to get the even/odd data state in sync - */ - ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1); - if (ret < 0) { - dbgp_printk("dbgp_bulk_write failed: %d\n", ret); - goto err; - } - dbgp_printk("small write doned\n"); - - return 0; -err: - /* Things didn't work so remove my claim */ - ctrl = readl(&ehci_debug->control); - ctrl &= ~(DBGP_CLAIM | DBGP_OUT); - writel(ctrl, &ehci_debug->control); - return -1; - -next_debug_port: - port_map_tried |= (1<<(debug_port - 1)); - new_debug_port = ((debug_port-1+1)%n_ports) + 1; - if (port_map_tried != ((1<<n_ports) - 1)) { - set_debug_port(new_debug_port); - goto try_next_port; - } - if (--playtimes) { - set_debug_port(new_debug_port); - goto try_next_time; - } - - return -1; -} - -static int __init early_dbgp_init(char *s) -{ - u32 debug_port, bar, offset; - u32 bus, slot, func, cap; - void __iomem *ehci_bar; - u32 dbgp_num; - u32 bar_val; - char *e; - int ret; - u8 byte; - - if (!early_pci_allowed()) - return -1; - - dbgp_num = 0; - if (*s) - dbgp_num = simple_strtoul(s, &e, 10); - dbgp_printk("dbgp_num: %d\n", dbgp_num); - - cap = find_dbgp(dbgp_num, &bus, &slot, &func); - if (!cap) - return -1; - - dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot, - func); - - debug_port = read_pci_config(bus, slot, func, cap); - bar = (debug_port >> 29) & 0x7; - bar = (bar * 4) + 0xc; - offset = (debug_port >> 16) & 0xfff; - dbgp_printk("bar: %02x offset: %03x\n", bar, offset); - if (bar != PCI_BASE_ADDRESS_0) { - dbgp_printk("only debug ports on bar 1 handled.\n"); - - return -1; - } - - bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); - dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset); - if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) { - dbgp_printk("only simple 32bit mmio bars supported\n"); - - return -1; - } - - /* double check if the mem space is enabled */ - byte = read_pci_config_byte(bus, slot, func, 0x04); - if (!(byte & 0x2)) { - byte |= 0x02; - write_pci_config_byte(bus, slot, func, 0x04, byte); - dbgp_printk("mmio for ehci enabled\n"); - } - - /* - * FIXME I don't have the bar size so just guess PAGE_SIZE is more - * than enough. 1K is the biggest I have seen. - */ - set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK); - ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE); - ehci_bar += bar_val & ~PAGE_MASK; - dbgp_printk("ehci_bar: %p\n", ehci_bar); - - ehci_caps = ehci_bar; - ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase)); - ehci_debug = ehci_bar + offset; - ehci_dev.bus = bus; - ehci_dev.slot = slot; - ehci_dev.func = func; - - detect_set_debug_port(); - - ret = ehci_setup(); - if (ret < 0) { - dbgp_printk("ehci_setup failed\n"); - ehci_debug = NULL; - - return -1; - } - - return 0; -} - -static void early_dbgp_write(struct console *con, const char *str, u32 n) -{ - int chunk, ret; - - if (!ehci_debug) - return; - while (n > 0) { - chunk = n; - if (chunk > DBGP_MAX_PACKET) - chunk = DBGP_MAX_PACKET; - ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, - dbgp_endpoint_out, str, chunk); - str += chunk; - n -= chunk; - } -} - -static struct console early_dbgp_console = { - .name = "earlydbg", - .write = early_dbgp_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; -#endif - /* Direct interface for emergencies */ static struct console *early_console = &early_vga_console; static int __initdata early_console_initialized; @@ -891,10 +176,24 @@ asmlinkage void early_printk(const char *fmt, ...) va_end(ap); } +static inline void early_console_register(struct console *con, int keep_early) +{ + if (early_console->index != -1) { + printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n", + con->name); + return; + } + early_console = con; + if (keep_early) + early_console->flags &= ~CON_BOOT; + else + early_console->flags |= CON_BOOT; + register_console(early_console); +} static int __init setup_early_printk(char *buf) { - int keep_early; + int keep; if (!buf) return 0; @@ -903,42 +202,37 @@ static int __init setup_early_printk(char *buf) return 0; early_console_initialized = 1; - keep_early = (strstr(buf, "keep") != NULL); - - if (!strncmp(buf, "serial", 6)) { - early_serial_init(buf + 6); - early_console = &early_serial_console; - } else if (!strncmp(buf, "ttyS", 4)) { - early_serial_init(buf); - early_console = &early_serial_console; - } else if (!strncmp(buf, "vga", 3) - && boot_params.screen_info.orig_video_isVGA == 1) { - max_xpos = boot_params.screen_info.orig_video_cols; - max_ypos = boot_params.screen_info.orig_video_lines; - current_ypos = boot_params.screen_info.orig_y; - early_console = &early_vga_console; + keep = (strstr(buf, "keep") != NULL); + + while (*buf != '\0') { + if (!strncmp(buf, "serial", 6)) { + buf += 6; + early_serial_init(buf); + early_console_register(&early_serial_console, keep); + if (!strncmp(buf, ",ttyS", 5)) + buf += 5; + } + if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf + 4); + early_console_register(&early_serial_console, keep); + } + if (!strncmp(buf, "vga", 3) && + boot_params.screen_info.orig_video_isVGA == 1) { + max_xpos = boot_params.screen_info.orig_video_cols; + max_ypos = boot_params.screen_info.orig_video_lines; + current_ypos = boot_params.screen_info.orig_y; + early_console_register(&early_vga_console, keep); + } #ifdef CONFIG_EARLY_PRINTK_DBGP - } else if (!strncmp(buf, "dbgp", 4)) { - if (early_dbgp_init(buf+4) < 0) - return 0; - early_console = &early_dbgp_console; - /* - * usb subsys will reset ehci controller, so don't keep - * that early console - */ - keep_early = 0; + if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4)) + early_console_register(&early_dbgp_console, keep); #endif #ifdef CONFIG_HVC_XEN - } else if (!strncmp(buf, "xen", 3)) { - early_console = &xenboot_console; + if (!strncmp(buf, "xen", 3)) + early_console_register(&xenboot_console, keep); #endif + buf++; } - - if (keep_early) - early_console->flags &= ~CON_BOOT; - else - early_console->flags |= CON_BOOT; - register_console(early_console); return 0; } diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index fe26ba3e345..cdcfb122f25 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -42,6 +42,7 @@ #include <asm/time.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> +#include <asm/x86_init.h> #define EFI_DEBUG 1 #define PFX "EFI: " @@ -453,6 +454,11 @@ void __init efi_init(void) if (add_efi_memmap) do_add_efi_memmap(); +#ifdef CONFIG_X86_32 + x86_platform.get_wallclock = efi_get_time; + x86_platform.set_wallclock = efi_set_rtc_mmss; +#endif + /* Setup for EFI runtime service */ reboot_type = BOOT_EFI; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c097e7d607c..50b9c220e12 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -334,6 +334,10 @@ ENTRY(ret_from_fork) END(ret_from_fork) /* + * Interrupt exit functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" +/* * Return to user mode is not as complex as all this looks, * but we want the default path for a system call return to * go as quickly as possible which is why some of this is @@ -383,6 +387,10 @@ need_resched: END(resume_kernel) #endif CFI_ENDPROC +/* + * End of kprobes section + */ + .popsection /* SYSENTER_RETURN points to after the "sysenter" instruction in the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ @@ -513,6 +521,10 @@ sysexit_audit: PTGS_TO_GS_EX ENDPROC(ia32_sysenter_target) +/* + * syscall stub including irq exit should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" # system call handler stub ENTRY(system_call) RING0_INT_FRAME # can't unwind into user space anyway @@ -705,6 +717,10 @@ syscall_badsys: jmp resume_userspace END(syscall_badsys) CFI_ENDPROC +/* + * End of kprobes section + */ + .popsection /* * System calls that need a pt_regs pointer. @@ -814,6 +830,10 @@ common_interrupt: ENDPROC(common_interrupt) CFI_ENDPROC +/* + * Irq entries should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" #define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ RING0_INT_FRAME; \ @@ -980,6 +1000,10 @@ ENTRY(spurious_interrupt_bug) jmp error_code CFI_ENDPROC END(spurious_interrupt_bug) +/* + * End of kprobes section + */ + .popsection ENTRY(kernel_thread_helper) pushl $0 # fake return address for unwinder @@ -1185,17 +1209,14 @@ END(ftrace_graph_caller) .globl return_to_handler return_to_handler: - pushl $0 pushl %eax - pushl %ecx pushl %edx movl %ebp, %eax call ftrace_return_to_handler - movl %eax, 0xc(%esp) + movl %eax, %ecx popl %edx - popl %ecx popl %eax - ret + jmp *%ecx #endif .section .rodata,"a" diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c251be74510..4deb8fc849d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -146,7 +146,7 @@ ENTRY(ftrace_graph_caller) END(ftrace_graph_caller) GLOBAL(return_to_handler) - subq $80, %rsp + subq $24, %rsp /* Save the return values */ movq %rax, (%rsp) @@ -155,11 +155,11 @@ GLOBAL(return_to_handler) call ftrace_return_to_handler - movq %rax, 72(%rsp) + movq %rax, %rdi movq 8(%rsp), %rdx movq (%rsp), %rax - addq $72, %rsp - retq + addq $24, %rsp + jmp *%rdi #endif @@ -536,20 +536,13 @@ sysret_signal: bt $TIF_SYSCALL_AUDIT,%edx jc sysret_audit #endif - /* edx: work flags (arg3) */ - leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 - xorl %esi,%esi # oldset -> arg2 - SAVE_REST - FIXUP_TOP_OF_STACK %r11 - call do_notify_resume - RESTORE_TOP_OF_STACK %r11 - RESTORE_REST - movl $_TIF_WORK_MASK,%edi - /* Use IRET because user could have changed frame. This - works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp int_with_check + /* + * We have a signal, or exit tracing or single-step. + * These all wind up with the iret return path anyway, + * so just join that path right now. + */ + FIXUP_TOP_OF_STACK %r11, -ARGOFFSET + jmp int_check_syscall_exit_work badsys: movq $-ENOSYS,RAX-ARGOFFSET(%rsp) @@ -654,6 +647,7 @@ int_careful: int_very_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) +int_check_syscall_exit_work: SAVE_REST /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx @@ -809,6 +803,10 @@ END(interrupt) call \func .endm +/* + * Interrupt entry/exit should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" /* * The interrupt stubs push (~vector+0x80) onto the stack and * then jump to common_interrupt. @@ -947,6 +945,10 @@ ENTRY(retint_kernel) CFI_ENDPROC END(common_interrupt) +/* + * End of kprobes section + */ + .popsection /* * APIC interrupts. @@ -1021,7 +1023,7 @@ apicinterrupt ERROR_APIC_VECTOR \ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS apicinterrupt LOCAL_PENDING_VECTOR \ perf_pending_interrupt smp_perf_pending_interrupt #endif @@ -1497,12 +1499,17 @@ error_kernelspace: leaq irq_return(%rip),%rcx cmpq %rcx,RIP+8(%rsp) je error_swapgs - movl %ecx,%ecx /* zero extend */ - cmpq %rcx,RIP+8(%rsp) - je error_swapgs + movl %ecx,%eax /* zero extend */ + cmpq %rax,RIP+8(%rsp) + je bstep_iret cmpq $gs_change,RIP+8(%rsp) je error_swapgs jmp error_sti + +bstep_iret: + /* Fix truncated RIP */ + movq %rcx,RIP+8(%rsp) + jmp error_swapgs END(error_entry) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 9dbb527e165..5a1b9758fd6 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -9,6 +9,8 @@ * the dangers of modifying code on the run. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/spinlock.h> #include <linux/hardirq.h> #include <linux/uaccess.h> @@ -336,15 +338,15 @@ int __init ftrace_dyn_arch_init(void *data) switch (faulted) { case 0: - pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n"); + pr_info("converting mcount calls to 0f 1f 44 00 00\n"); memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE); break; case 1: - pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n"); + pr_info("converting mcount calls to 66 66 66 66 90\n"); memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE); break; case 2: - pr_info("ftrace: converting mcount calls to jmp . + 5\n"); + pr_info("converting mcount calls to jmp . + 5\n"); memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE); break; } @@ -468,82 +470,10 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, #ifdef CONFIG_FTRACE_SYSCALLS -extern unsigned long __start_syscalls_metadata[]; -extern unsigned long __stop_syscalls_metadata[]; extern unsigned long *sys_call_table; -static struct syscall_metadata **syscalls_metadata; - -static struct syscall_metadata *find_syscall_meta(unsigned long *syscall) -{ - struct syscall_metadata *start; - struct syscall_metadata *stop; - char str[KSYM_SYMBOL_LEN]; - - - start = (struct syscall_metadata *)__start_syscalls_metadata; - stop = (struct syscall_metadata *)__stop_syscalls_metadata; - kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str); - - for ( ; start < stop; start++) { - if (start->name && !strcmp(start->name, str)) - return start; - } - return NULL; -} - -struct syscall_metadata *syscall_nr_to_meta(int nr) -{ - if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) - return NULL; - - return syscalls_metadata[nr]; -} - -int syscall_name_to_nr(char *name) +unsigned long __init arch_syscall_addr(int nr) { - int i; - - if (!syscalls_metadata) - return -1; - - for (i = 0; i < NR_syscalls; i++) { - if (syscalls_metadata[i]) { - if (!strcmp(syscalls_metadata[i]->name, name)) - return i; - } - } - return -1; -} - -void set_syscall_enter_id(int num, int id) -{ - syscalls_metadata[num]->enter_id = id; -} - -void set_syscall_exit_id(int num, int id) -{ - syscalls_metadata[num]->exit_id = id; -} - -static int __init arch_init_ftrace_syscalls(void) -{ - int i; - struct syscall_metadata *meta; - unsigned long **psys_syscall_table = &sys_call_table; - - syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * - NR_syscalls, GFP_KERNEL); - if (!syscalls_metadata) { - WARN_ON(1); - return -ENOMEM; - } - - for (i = 0; i < NR_syscalls; i++) { - meta = find_syscall_meta(psys_syscall_table[i]); - syscalls_metadata[i] = meta; - } - return 0; + return (unsigned long)(&sys_call_table)[nr]; } -arch_initcall(arch_init_ftrace_syscalls); #endif diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 3f8579f8d42..4f8e2507e8f 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -11,8 +11,21 @@ #include <asm/setup.h> #include <asm/sections.h> #include <asm/e820.h> -#include <asm/bios_ebda.h> +#include <asm/page.h> #include <asm/trampoline.h> +#include <asm/apic.h> +#include <asm/io_apic.h> +#include <asm/bios_ebda.h> + +static void __init i386_default_early_setup(void) +{ + /* Initilize 32bit specific setup functions */ + x86_init.resources.probe_roms = probe_roms; + x86_init.resources.reserve_resources = i386_reserve_resources; + x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; + + reserve_ebda_region(); +} void __init i386_start_kernel(void) { @@ -29,7 +42,16 @@ void __init i386_start_kernel(void) reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif - reserve_ebda_region(); + + /* Call the subarch specific early setup function */ + switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_MRST: + x86_mrst_early_setup(); + break; + default: + i386_default_early_setup(); + break; + } /* * At this point everything still needed from the boot loader diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 70eaa852c73..0b06cd778fd 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -23,8 +23,8 @@ #include <asm/sections.h> #include <asm/kdebug.h> #include <asm/e820.h> -#include <asm/bios_ebda.h> #include <asm/trampoline.h> +#include <asm/bios_ebda.h> static void __init zap_identity_mappings(void) { diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 7ffec6b3b33..050c278481b 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -79,7 +79,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE) * any particular GDT layout, because we load our own as soon as we * can. */ -.section .text.head,"ax",@progbits +__HEAD ENTRY(startup_32) /* test KEEP_SEGMENTS flag to see if the bootloader is asking us to not reload segments */ @@ -157,6 +157,7 @@ subarch_entries: .long default_entry /* normal x86/PC */ .long lguest_entry /* lguest hypervisor */ .long xen_entry /* Xen hypervisor */ + .long default_entry /* Moorestown MID */ num_subarch_entries = (. - subarch_entries) / 4 .previous #endif /* CONFIG_PARAVIRT */ @@ -607,7 +608,7 @@ ENTRY(initial_code) /* * BSS section */ -.section ".bss.page_aligned","wa" +__PAGE_ALIGNED_BSS .align PAGE_SIZE_asm #ifdef CONFIG_X86_PAE swapper_pg_pmd: @@ -625,7 +626,7 @@ ENTRY(empty_zero_page) * This starts the data section. */ #ifdef CONFIG_X86_PAE -.section ".data.page_aligned","wa" +__PAGE_ALIGNED_DATA /* Page-aligned for the benefit of paravirt? */ .align PAGE_SIZE_asm ENTRY(swapper_pg_dir) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index fa54f78e2a0..22db86a3764 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -40,7 +40,7 @@ L4_START_KERNEL = pgd_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map) .text - .section .text.head + __HEAD .code64 .globl startup_64 startup_64: @@ -212,8 +212,8 @@ ENTRY(secondary_startup_64) */ lgdt early_gdt_descr(%rip) - /* set up data segments. actually 0 would do too */ - movl $__KERNEL_DS,%eax + /* set up data segments */ + xorl %eax,%eax movl %eax,%ds movl %eax,%ss movl %eax,%es @@ -418,7 +418,7 @@ ENTRY(phys_base) ENTRY(idt_table) .skip IDT_ENTRIES * 16 - .section .bss.page_aligned, "aw", @nobits + __PAGE_ALIGNED_BSS .align PAGE_SIZE ENTRY(empty_zero_page) .skip PAGE_SIZE diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c new file mode 100644 index 00000000000..d42f65ac492 --- /dev/null +++ b/arch/x86/kernel/hw_breakpoint.c @@ -0,0 +1,555 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2007 Alan Stern + * Copyright (C) 2009 IBM Corporation + * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com> + * + * Authors: Alan Stern <stern@rowland.harvard.edu> + * K.Prasad <prasad@linux.vnet.ibm.com> + * Frederic Weisbecker <fweisbec@gmail.com> + */ + +/* + * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, + * using the CPU's debug registers. + */ + +#include <linux/perf_event.h> +#include <linux/hw_breakpoint.h> +#include <linux/irqflags.h> +#include <linux/notifier.h> +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/percpu.h> +#include <linux/kdebug.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/smp.h> + +#include <asm/hw_breakpoint.h> +#include <asm/processor.h> +#include <asm/debugreg.h> + +/* Per cpu debug control register value */ +DEFINE_PER_CPU(unsigned long, cpu_dr7); +EXPORT_PER_CPU_SYMBOL(cpu_dr7); + +/* Per cpu debug address registers values */ +static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]); + +/* + * Stores the breakpoints currently in use on each breakpoint address + * register for each cpus + */ +static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]); + + +static inline unsigned long +__encode_dr7(int drnum, unsigned int len, unsigned int type) +{ + unsigned long bp_info; + + bp_info = (len | type) & 0xf; + bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); + bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)); + + return bp_info; +} + +/* + * Encode the length, type, Exact, and Enable bits for a particular breakpoint + * as stored in debug register 7. + */ +unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) +{ + return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN; +} + +/* + * Decode the length and type bits for a particular breakpoint as + * stored in debug register 7. Return the "enabled" status. + */ +int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type) +{ + int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE); + + *len = (bp_info & 0xc) | 0x40; + *type = (bp_info & 0x3) | 0x80; + + return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3; +} + +/* + * Install a perf counter breakpoint. + * + * We seek a free debug address register and use it for this + * breakpoint. Eventually we enable it in the debug control register. + * + * Atomic: we hold the counter->ctx->lock and we only handle variables + * and registers local to this cpu. + */ +int arch_install_hw_breakpoint(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long *dr7; + int i; + + for (i = 0; i < HBP_NUM; i++) { + struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]); + + if (!*slot) { + *slot = bp; + break; + } + } + + if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) + return -EBUSY; + + set_debugreg(info->address, i); + __get_cpu_var(cpu_debugreg[i]) = info->address; + + dr7 = &__get_cpu_var(cpu_dr7); + *dr7 |= encode_dr7(i, info->len, info->type); + + set_debugreg(*dr7, 7); + + return 0; +} + +/* + * Uninstall the breakpoint contained in the given counter. + * + * First we search the debug address register it uses and then we disable + * it. + * + * Atomic: we hold the counter->ctx->lock and we only handle variables + * and registers local to this cpu. + */ +void arch_uninstall_hw_breakpoint(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long *dr7; + int i; + + for (i = 0; i < HBP_NUM; i++) { + struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]); + + if (*slot == bp) { + *slot = NULL; + break; + } + } + + if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) + return; + + dr7 = &__get_cpu_var(cpu_dr7); + *dr7 &= ~__encode_dr7(i, info->len, info->type); + + set_debugreg(*dr7, 7); +} + +static int get_hbp_len(u8 hbp_len) +{ + unsigned int len_in_bytes = 0; + + switch (hbp_len) { + case X86_BREAKPOINT_LEN_1: + len_in_bytes = 1; + break; + case X86_BREAKPOINT_LEN_2: + len_in_bytes = 2; + break; + case X86_BREAKPOINT_LEN_4: + len_in_bytes = 4; + break; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + len_in_bytes = 8; + break; +#endif + } + return len_in_bytes; +} + +/* + * Check for virtual address in user space. + */ +int arch_check_va_in_userspace(unsigned long va, u8 hbp_len) +{ + unsigned int len; + + len = get_hbp_len(hbp_len); + + return (va <= TASK_SIZE - len); +} + +/* + * Check for virtual address in kernel space. + */ +static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) +{ + unsigned int len; + + len = get_hbp_len(hbp_len); + + return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); +} + +/* + * Store a breakpoint's encoded address, length, and type. + */ +static int arch_store_info(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + /* + * For kernel-addresses, either the address or symbol name can be + * specified. + */ + if (info->name) + info->address = (unsigned long) + kallsyms_lookup_name(info->name); + if (info->address) + return 0; + + return -EINVAL; +} + +int arch_bp_generic_fields(int x86_len, int x86_type, + int *gen_len, int *gen_type) +{ + /* Len */ + switch (x86_len) { + case X86_BREAKPOINT_LEN_1: + *gen_len = HW_BREAKPOINT_LEN_1; + break; + case X86_BREAKPOINT_LEN_2: + *gen_len = HW_BREAKPOINT_LEN_2; + break; + case X86_BREAKPOINT_LEN_4: + *gen_len = HW_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + *gen_len = HW_BREAKPOINT_LEN_8; + break; +#endif + default: + return -EINVAL; + } + + /* Type */ + switch (x86_type) { + case X86_BREAKPOINT_EXECUTE: + *gen_type = HW_BREAKPOINT_X; + break; + case X86_BREAKPOINT_WRITE: + *gen_type = HW_BREAKPOINT_W; + break; + case X86_BREAKPOINT_RW: + *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; + break; + default: + return -EINVAL; + } + + return 0; +} + + +static int arch_build_bp_info(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + + info->address = bp->attr.bp_addr; + + /* Len */ + switch (bp->attr.bp_len) { + case HW_BREAKPOINT_LEN_1: + info->len = X86_BREAKPOINT_LEN_1; + break; + case HW_BREAKPOINT_LEN_2: + info->len = X86_BREAKPOINT_LEN_2; + break; + case HW_BREAKPOINT_LEN_4: + info->len = X86_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case HW_BREAKPOINT_LEN_8: + info->len = X86_BREAKPOINT_LEN_8; + break; +#endif + default: + return -EINVAL; + } + + /* Type */ + switch (bp->attr.bp_type) { + case HW_BREAKPOINT_W: + info->type = X86_BREAKPOINT_WRITE; + break; + case HW_BREAKPOINT_W | HW_BREAKPOINT_R: + info->type = X86_BREAKPOINT_RW; + break; + case HW_BREAKPOINT_X: + info->type = X86_BREAKPOINT_EXECUTE; + break; + default: + return -EINVAL; + } + + return 0; +} +/* + * Validate the arch-specific HW Breakpoint register settings + */ +int arch_validate_hwbkpt_settings(struct perf_event *bp, + struct task_struct *tsk) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned int align; + int ret; + + + ret = arch_build_bp_info(bp); + if (ret) + return ret; + + ret = -EINVAL; + + if (info->type == X86_BREAKPOINT_EXECUTE) + /* + * Ptrace-refactoring code + * For now, we'll allow instruction breakpoint only for user-space + * addresses + */ + if ((!arch_check_va_in_userspace(info->address, info->len)) && + info->len != X86_BREAKPOINT_EXECUTE) + return ret; + + switch (info->len) { + case X86_BREAKPOINT_LEN_1: + align = 0; + break; + case X86_BREAKPOINT_LEN_2: + align = 1; + break; + case X86_BREAKPOINT_LEN_4: + align = 3; + break; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + align = 7; + break; +#endif + default: + return ret; + } + + if (bp->callback) + ret = arch_store_info(bp); + + if (ret < 0) + return ret; + /* + * Check that the low-order bits of the address are appropriate + * for the alignment implied by len. + */ + if (info->address & align) + return -EINVAL; + + /* Check that the virtual address is in the proper range */ + if (tsk) { + if (!arch_check_va_in_userspace(info->address, info->len)) + return -EFAULT; + } else { + if (!arch_check_va_in_kernelspace(info->address, info->len)) + return -EFAULT; + } + + return 0; +} + +/* + * Dump the debug register contents to the user. + * We can't dump our per cpu values because it + * may contain cpu wide breakpoint, something that + * doesn't belong to the current task. + * + * TODO: include non-ptrace user breakpoints (perf) + */ +void aout_dump_debugregs(struct user *dump) +{ + int i; + int dr7 = 0; + struct perf_event *bp; + struct arch_hw_breakpoint *info; + struct thread_struct *thread = ¤t->thread; + + for (i = 0; i < HBP_NUM; i++) { + bp = thread->ptrace_bps[i]; + + if (bp && !bp->attr.disabled) { + dump->u_debugreg[i] = bp->attr.bp_addr; + info = counter_arch_bp(bp); + dr7 |= encode_dr7(i, info->len, info->type); + } else { + dump->u_debugreg[i] = 0; + } + } + + dump->u_debugreg[4] = 0; + dump->u_debugreg[5] = 0; + dump->u_debugreg[6] = current->thread.debugreg6; + + dump->u_debugreg[7] = dr7; +} +EXPORT_SYMBOL_GPL(aout_dump_debugregs); + +/* + * Release the user breakpoints used by ptrace + */ +void flush_ptrace_hw_breakpoint(struct task_struct *tsk) +{ + int i; + struct thread_struct *t = &tsk->thread; + + for (i = 0; i < HBP_NUM; i++) { + unregister_hw_breakpoint(t->ptrace_bps[i]); + t->ptrace_bps[i] = NULL; + } +} + +void hw_breakpoint_restore(void) +{ + set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0); + set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1); + set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2); + set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3); + set_debugreg(current->thread.debugreg6, 6); + set_debugreg(__get_cpu_var(cpu_dr7), 7); +} +EXPORT_SYMBOL_GPL(hw_breakpoint_restore); + +/* + * Handle debug exception notifications. + * + * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below. + * + * NOTIFY_DONE returned if one of the following conditions is true. + * i) When the causative address is from user-space and the exception + * is a valid one, i.e. not triggered as a result of lazy debug register + * switching + * ii) When there are more bits than trap<n> set in DR6 register (such + * as BD, BS or BT) indicating that more than one debug condition is + * met and requires some more action in do_debug(). + * + * NOTIFY_STOP returned for all other cases + * + */ +static int __kprobes hw_breakpoint_handler(struct die_args *args) +{ + int i, cpu, rc = NOTIFY_STOP; + struct perf_event *bp; + unsigned long dr7, dr6; + unsigned long *dr6_p; + + /* The DR6 value is pointed by args->err */ + dr6_p = (unsigned long *)ERR_PTR(args->err); + dr6 = *dr6_p; + + /* Do an early return if no trap bits are set in DR6 */ + if ((dr6 & DR_TRAP_BITS) == 0) + return NOTIFY_DONE; + + get_debugreg(dr7, 7); + /* Disable breakpoints during exception handling */ + set_debugreg(0UL, 7); + /* + * Assert that local interrupts are disabled + * Reset the DRn bits in the virtualized register value. + * The ptrace trigger routine will add in whatever is needed. + */ + current->thread.debugreg6 &= ~DR_TRAP_BITS; + cpu = get_cpu(); + + /* Handle all the breakpoints that were triggered */ + for (i = 0; i < HBP_NUM; ++i) { + if (likely(!(dr6 & (DR_TRAP0 << i)))) + continue; + + /* + * The counter may be concurrently released but that can only + * occur from a call_rcu() path. We can then safely fetch + * the breakpoint, use its callback, touch its counter + * while we are in an rcu_read_lock() path. + */ + rcu_read_lock(); + + bp = per_cpu(bp_per_reg[i], cpu); + if (bp) + rc = NOTIFY_DONE; + /* + * Reset the 'i'th TRAP bit in dr6 to denote completion of + * exception handling + */ + (*dr6_p) &= ~(DR_TRAP0 << i); + /* + * bp can be NULL due to lazy debug register switching + * or due to concurrent perf counter removing. + */ + if (!bp) { + rcu_read_unlock(); + break; + } + + (bp->callback)(bp, args->regs); + + rcu_read_unlock(); + } + if (dr6 & (~DR_TRAP_BITS)) + rc = NOTIFY_DONE; + + set_debugreg(dr7, 7); + put_cpu(); + + return rc; +} + +/* + * Handle debug exception notifications. + */ +int __kprobes hw_breakpoint_exceptions_notify( + struct notifier_block *unused, unsigned long val, void *data) +{ + if (val != DIE_DEBUG) + return NOTIFY_DONE; + + return hw_breakpoint_handler(data); +} + +void hw_breakpoint_pmu_read(struct perf_event *bp) +{ + /* TODO */ +} + +void hw_breakpoint_pmu_unthrottle(struct perf_event *bp) +{ + /* TODO */ +} diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 43cec6bdda6..9c3bd4a2050 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -10,6 +10,16 @@ EXPORT_SYMBOL(mcount); #endif +/* + * Note, this is a prototype to get at the symbol for + * the export, but dont use it from C code, it is used + * by assembly code and is not using C calling convention! + */ +#ifndef CONFIG_X86_CMPXCHG64 +extern void cmpxchg8b_emu(void); +EXPORT_SYMBOL(cmpxchg8b_emu); +#endif + /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 5cf36c053ac..23c167925a5 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -19,12 +19,6 @@ DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); -#ifdef CONFIG_X86_32 -static void pit_disable_clocksource(void); -#else -static inline void pit_disable_clocksource(void) { } -#endif - /* * HPET replaces the PIT, when enabled. So we need to know, which of * the two timers is used @@ -57,12 +51,10 @@ static void init_pit_timer(enum clock_event_mode mode, outb_pit(0, PIT_CH0); outb_pit(0, PIT_CH0); } - pit_disable_clocksource(); break; case CLOCK_EVT_MODE_ONESHOT: /* One shot setup */ - pit_disable_clocksource(); outb_pit(0x38, PIT_MODE); break; @@ -200,17 +192,6 @@ static struct clocksource pit_cs = { .shift = 20, }; -static void pit_disable_clocksource(void) -{ - /* - * Use mult to check whether it is registered or not - */ - if (pit_cs.mult) { - clocksource_unregister(&pit_cs); - pit_cs.mult = 0; - } -} - static int __init init_pit_clocksource(void) { /* diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 270ff83efc1..3a54dcb9cd0 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -20,9 +20,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index b0cdde6932f..fee6cc2b207 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -63,10 +63,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); seq_printf(p, " Spurious interrupts\n"); - seq_printf(p, "%*s: ", prec, "CNT"); + seq_printf(p, "%*s: ", prec, "PMI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); - seq_printf(p, " Performance counter interrupts\n"); + seq_printf(p, " Performance monitoring interrupts\n"); seq_printf(p, "%*s: ", prec, "PND"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); @@ -92,19 +92,19 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); seq_printf(p, " TLB shootdowns\n"); #endif -#ifdef CONFIG_X86_MCE +#ifdef CONFIG_X86_THERMAL_VECTOR seq_printf(p, "%*s: ", prec, "TRM"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); seq_printf(p, " Thermal event interrupts\n"); -# ifdef CONFIG_X86_MCE_THRESHOLD +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_printf(p, " Threshold APIC interrupts\n"); -# endif #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); @@ -194,13 +194,13 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_call_count; sum += irq_stats(cpu)->irq_tlb_count; #endif -#ifdef CONFIG_X86_MCE +#ifdef CONFIG_X86_THERMAL_VECTOR sum += irq_stats(cpu)->irq_thermal_count; -# ifdef CONFIG_X86_MCE_THRESHOLD +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD sum += irq_stats(cpu)->irq_threshold_count; -# endif #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE sum += per_cpu(mce_exception_count, cpu); sum += per_cpu(mce_poll_count, cpu); #endif @@ -274,3 +274,93 @@ void smp_generic_interrupt(struct pt_regs *regs) } EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); + +#ifdef CONFIG_HOTPLUG_CPU +/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ +void fixup_irqs(void) +{ + unsigned int irq, vector; + static int warned; + struct irq_desc *desc; + + for_each_irq_desc(irq, desc) { + int break_affinity = 0; + int set_affinity = 1; + const struct cpumask *affinity; + + if (!desc) + continue; + if (irq == 2) + continue; + + /* interrupt's are disabled at this point */ + spin_lock(&desc->lock); + + affinity = desc->affinity; + if (!irq_has_action(irq) || + cpumask_equal(affinity, cpu_online_mask)) { + spin_unlock(&desc->lock); + continue; + } + + /* + * Complete the irq move. This cpu is going down and for + * non intr-remapping case, we can't wait till this interrupt + * arrives at this cpu before completing the irq move. + */ + irq_force_complete_move(irq); + + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { + break_affinity = 1; + affinity = cpu_all_mask; + } + + if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask) + desc->chip->mask(irq); + + if (desc->chip->set_affinity) + desc->chip->set_affinity(irq, affinity); + else if (!(warned++)) + set_affinity = 0; + + if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask) + desc->chip->unmask(irq); + + spin_unlock(&desc->lock); + + if (break_affinity && set_affinity) + printk("Broke affinity for irq %i\n", irq); + else if (!set_affinity) + printk("Cannot set affinity for irq %i\n", irq); + } + + /* + * We can remove mdelay() and then send spuriuous interrupts to + * new cpu targets for all the irqs that were handled previously by + * this cpu. While it works, I have seen spurious interrupt messages + * (nothing wrong but still...). + * + * So for now, retain mdelay(1) and check the IRR and then send those + * interrupts to new targets as this cpu is already offlined... + */ + mdelay(1); + + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + unsigned int irr; + + if (__get_cpu_var(vector_irq)[vector] < 0) + continue; + + irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + if (irr & (1 << (vector % 32))) { + irq = __get_cpu_var(vector_irq)[vector]; + + desc = irq_to_desc(irq); + spin_lock(&desc->lock); + if (desc->chip->retrigger) + desc->chip->retrigger(irq); + spin_unlock(&desc->lock); + } + } +} +#endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 7d35d0fe232..10709f29d16 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -211,48 +211,3 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) return true; } - -#ifdef CONFIG_HOTPLUG_CPU - -/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ -void fixup_irqs(void) -{ - unsigned int irq; - struct irq_desc *desc; - - for_each_irq_desc(irq, desc) { - const struct cpumask *affinity; - - if (!desc) - continue; - if (irq == 2) - continue; - - affinity = desc->affinity; - if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { - printk("Breaking affinity for irq %i\n", irq); - affinity = cpu_all_mask; - } - if (desc->chip->set_affinity) - desc->chip->set_affinity(irq, affinity); - else if (desc->action) - printk_once("Cannot set affinity for irq %i\n", irq); - } - -#if 0 - barrier(); - /* Ingo Molnar says: "after the IO-APIC masks have been redirected - [note the nop - the interrupt-enable boundary on x86 is two - instructions from sti] - to flush out pending hardirqs and - IPIs. After this point nothing is supposed to reach this CPU." */ - __asm__ __volatile__("sti; nop; cli"); - barrier(); -#else - /* That doesn't seem sufficient. Give it 1ms. */ - local_irq_enable(); - mdelay(1); - local_irq_disable(); -#endif -} -#endif - diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 977d8b43a0d..acf8fbf8fbd 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -62,64 +62,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) return true; } -#ifdef CONFIG_HOTPLUG_CPU -/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ -void fixup_irqs(void) -{ - unsigned int irq; - static int warned; - struct irq_desc *desc; - - for_each_irq_desc(irq, desc) { - int break_affinity = 0; - int set_affinity = 1; - const struct cpumask *affinity; - - if (!desc) - continue; - if (irq == 2) - continue; - - /* interrupt's are disabled at this point */ - spin_lock(&desc->lock); - - affinity = desc->affinity; - if (!irq_has_action(irq) || - cpumask_equal(affinity, cpu_online_mask)) { - spin_unlock(&desc->lock); - continue; - } - - if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { - break_affinity = 1; - affinity = cpu_all_mask; - } - - if (desc->chip->mask) - desc->chip->mask(irq); - - if (desc->chip->set_affinity) - desc->chip->set_affinity(irq, affinity); - else if (!(warned++)) - set_affinity = 0; - - if (desc->chip->unmask) - desc->chip->unmask(irq); - - spin_unlock(&desc->lock); - - if (break_affinity && set_affinity) - printk("Broke affinity for irq %i\n", irq); - else if (!set_affinity) - printk("Cannot set affinity for irq %i\n", irq); - } - - /* That doesn't seem sufficient. Give it 1ms. */ - local_irq_enable(); - mdelay(1); - local_irq_disable(); -} -#endif extern void call_softirq(void); diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 92b7703d3d5..40f30773fb2 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -116,7 +116,7 @@ int vector_used_by_percpu_irq(unsigned int vector) return 0; } -static void __init init_ISA_irqs(void) +void __init init_ISA_irqs(void) { int i; @@ -140,8 +140,10 @@ static void __init init_ISA_irqs(void) } } -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); +void __init init_IRQ(void) +{ + x86_init.irqs.intr_init(); +} static void __init smp_intr_init(void) { @@ -190,7 +192,7 @@ static void __init apic_intr_init(void) #ifdef CONFIG_X86_MCE_THRESHOLD alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif -#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC) alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); #endif @@ -206,39 +208,19 @@ static void __init apic_intr_init(void) alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); /* Performance monitoring interrupts: */ -# ifdef CONFIG_PERF_COUNTERS +# ifdef CONFIG_PERF_EVENTS alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); # endif #endif } -/** - * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors - * - * Description: - * Perform any necessary interrupt initialisation prior to setting up - * the "ordinary" interrupt call gates. For legacy reasons, the ISA - * interrupts should be initialised here if the machine emulates a PC - * in any way. - **/ -static void __init x86_quirk_pre_intr_init(void) -{ -#ifdef CONFIG_X86_32 - if (x86_quirks->arch_pre_intr_init) { - if (x86_quirks->arch_pre_intr_init()) - return; - } -#endif - init_ISA_irqs(); -} - void __init native_init_IRQ(void) { int i; /* Execute any quirks before the call gates are initialised: */ - x86_quirk_pre_intr_init(); + x86_init.irqs.pre_vector_init(); apic_intr_init(); @@ -258,12 +240,6 @@ void __init native_init_IRQ(void) #ifdef CONFIG_X86_32 /* - * Call quirks after call gates are initialised (usually add in - * the architecture specific gates): - */ - x86_quirk_intr_init(); - - /* * External FPU? Set up irq13 if so, for * original braindamaged IBM FERR coupling. */ diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 8d82a77a3f3..20a5b368946 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -43,6 +43,7 @@ #include <linux/smp.h> #include <linux/nmi.h> +#include <asm/debugreg.h> #include <asm/apicdef.h> #include <asm/system.h> @@ -88,7 +89,6 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs[GDB_SS] = __KERNEL_DS; gdb_regs[GDB_FS] = 0xFFFF; gdb_regs[GDB_GS] = 0xFFFF; - gdb_regs[GDB_SP] = (int)®s->sp; #else gdb_regs[GDB_R8] = regs->r8; gdb_regs[GDB_R9] = regs->r9; @@ -101,8 +101,8 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs32[GDB_PS] = regs->flags; gdb_regs32[GDB_CS] = regs->cs; gdb_regs32[GDB_SS] = regs->ss; - gdb_regs[GDB_SP] = regs->sp; #endif + gdb_regs[GDB_SP] = kernel_stack_pointer(regs); } /** @@ -434,6 +434,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args) "resuming...\n"); kgdb_arch_handle_exception(args->trapnr, args->signr, args->err, "c", "", regs); + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP; return NOTIFY_STOP; } diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 7b5169d2b00..1f3186ce213 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -48,31 +48,22 @@ #include <linux/preempt.h> #include <linux/module.h> #include <linux/kdebug.h> +#include <linux/kallsyms.h> #include <asm/cacheflush.h> #include <asm/desc.h> #include <asm/pgtable.h> #include <asm/uaccess.h> #include <asm/alternative.h> +#include <asm/insn.h> +#include <asm/debugreg.h> void jprobe_return_end(void); DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); -#ifdef CONFIG_X86_64 -#define stack_addr(regs) ((unsigned long *)regs->sp) -#else -/* - * "®s->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs - * don't save the ss and esp registers if the CPU is already in kernel - * mode when it traps. So for kprobes, regs->sp and regs->ss are not - * the [nonexistent] saved stack pointer and ss register, but rather - * the top 8 bytes of the pre-int3 stack. So ®s->sp happens to - * point to the top of the pre-int3 stack. - */ -#define stack_addr(regs) ((unsigned long *)®s->sp) -#endif +#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs)) #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ @@ -106,50 +97,6 @@ static const u32 twobyte_is_boostable[256 / 32] = { /* ----------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; -static const u32 onebyte_has_modrm[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ----------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */ - W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */ - W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */ - W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */ - W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ - W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */ - W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */ - W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */ - W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ - W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */ - W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */ - W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */ - W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */ - W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ - W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */ - W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */ - /* ----------------------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ -}; -static const u32 twobyte_has_modrm[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ----------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */ - W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */ - W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */ - W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */ - W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */ - W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */ - W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */ - W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */ - W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */ - W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */ - W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */ - W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */ - W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */ - W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */ - W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */ - W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */ - /* ----------------------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ -}; #undef W struct kretprobe_blackpoint kretprobe_blacklist[] = { @@ -244,6 +191,75 @@ retry: } } +/* Recover the probed instruction at addr for further analysis. */ +static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) +{ + struct kprobe *kp; + kp = get_kprobe((void *)addr); + if (!kp) + return -EINVAL; + + /* + * Basically, kp->ainsn.insn has an original instruction. + * However, RIP-relative instruction can not do single-stepping + * at different place, fix_riprel() tweaks the displacement of + * that instruction. In that case, we can't recover the instruction + * from the kp->ainsn.insn. + * + * On the other hand, kp->opcode has a copy of the first byte of + * the probed instruction, which is overwritten by int3. And + * the instruction at kp->addr is not modified by kprobes except + * for the first byte, we can recover the original instruction + * from it and kp->opcode. + */ + memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + buf[0] = kp->opcode; + return 0; +} + +/* Dummy buffers for kallsyms_lookup */ +static char __dummy_buf[KSYM_NAME_LEN]; + +/* Check if paddr is at an instruction boundary */ +static int __kprobes can_probe(unsigned long paddr) +{ + int ret; + unsigned long addr, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + + if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf)) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr) { + kernel_insn_init(&insn, (void *)addr); + insn_get_opcode(&insn); + + /* + * Check if the instruction has been modified by another + * kprobe, in which case we replace the breakpoint by the + * original instruction in our buffer. + */ + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { + ret = recover_probed_instruction(buf, addr); + if (ret) + /* + * Another debugging subsystem might insert + * this breakpoint. In that case, we can't + * recover it. + */ + return 0; + kernel_insn_init(&insn, buf); + } + insn_get_length(&insn); + addr += insn.length; + } + + return (addr == paddr); +} + /* * Returns non-zero if opcode modifies the interrupt flag. */ @@ -277,68 +293,30 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) static void __kprobes fix_riprel(struct kprobe *p) { #ifdef CONFIG_X86_64 - u8 *insn = p->ainsn.insn; - s64 disp; - int need_modrm; - - /* Skip legacy instruction prefixes. */ - while (1) { - switch (*insn) { - case 0x66: - case 0x67: - case 0x2e: - case 0x3e: - case 0x26: - case 0x64: - case 0x65: - case 0x36: - case 0xf0: - case 0xf3: - case 0xf2: - ++insn; - continue; - } - break; - } + struct insn insn; + kernel_insn_init(&insn, p->ainsn.insn); - /* Skip REX instruction prefix. */ - if (is_REX_prefix(insn)) - ++insn; - - if (*insn == 0x0f) { - /* Two-byte opcode. */ - ++insn; - need_modrm = test_bit(*insn, - (unsigned long *)twobyte_has_modrm); - } else - /* One-byte opcode. */ - need_modrm = test_bit(*insn, - (unsigned long *)onebyte_has_modrm); - - if (need_modrm) { - u8 modrm = *++insn; - if ((modrm & 0xc7) == 0x05) { - /* %rip+disp32 addressing mode */ - /* Displacement follows ModRM byte. */ - ++insn; - /* - * The copied instruction uses the %rip-relative - * addressing mode. Adjust the displacement for the - * difference between the original location of this - * instruction and the location of the copy that will - * actually be run. The tricky bit here is making sure - * that the sign extension happens correctly in this - * calculation, since we need a signed 32-bit result to - * be sign-extended to 64 bits when it's added to the - * %rip value and yield the same 64-bit result that the - * sign-extension of the original signed 32-bit - * displacement would have given. - */ - disp = (u8 *) p->addr + *((s32 *) insn) - - (u8 *) p->ainsn.insn; - BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ - *(s32 *)insn = (s32) disp; - } + if (insn_rip_relative(&insn)) { + s64 newdisp; + u8 *disp; + insn_get_displacement(&insn); + /* + * The copied instruction uses the %rip-relative addressing + * mode. Adjust the displacement for the difference between + * the original location of this instruction and the location + * of the copy that will actually be run. The tricky bit here + * is making sure that the sign extension happens correctly in + * this calculation, since we need a signed 32-bit result to + * be sign-extended to 64 bits when it's added to the %rip + * value and yield the same 64-bit result that the sign- + * extension of the original signed 32-bit displacement would + * have given. + */ + newdisp = (u8 *) p->addr + (s64) insn.displacement.value - + (u8 *) p->ainsn.insn; + BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ + disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn); + *(s32 *) disp = (s32) newdisp; } #endif } @@ -359,6 +337,8 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p) int __kprobes arch_prepare_kprobe(struct kprobe *p) { + if (!can_probe((unsigned long)p->addr)) + return -EILSEQ; /* insn: must be on special executable page on x86. */ p->ainsn.insn = get_insn_slot(); if (!p->ainsn.insn) @@ -472,17 +452,6 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, { switch (kcb->kprobe_status) { case KPROBE_HIT_SSDONE: -#ifdef CONFIG_X86_64 - /* TODO: Provide re-entrancy from post_kprobes_handler() and - * avoid exception stack corruption while single-stepping on - * the instruction of the new probe. - */ - arch_disarm_kprobe(p); - regs->ip = (unsigned long)p->addr; - reset_current_kprobe(); - preempt_enable_no_resched(); - break; -#endif case KPROBE_HIT_ACTIVE: save_previous_kprobe(kcb); set_current_kprobe(p, regs, kcb); @@ -491,18 +460,16 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, kcb->kprobe_status = KPROBE_REENTER; break; case KPROBE_HIT_SS: - if (p == kprobe_running()) { - regs->flags &= ~X86_EFLAGS_TF; - regs->flags |= kcb->kprobe_saved_flags; - return 0; - } else { - /* A probe has been hit in the codepath leading up - * to, or just after, single-stepping of a probed - * instruction. This entire codepath should strictly - * reside in .kprobes.text section. Raise a warning - * to highlight this peculiar case. - */ - } + /* A probe has been hit in the codepath leading up to, or just + * after, single-stepping of a probed instruction. This entire + * codepath should strictly reside in .kprobes.text section. + * Raise a BUG or we'll continue in an endless reentering loop + * and eventually a stack overflow. + */ + printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n", + p->addr); + dump_kprobe(p); + BUG(); default: /* impossible cases */ WARN_ON(1); @@ -967,8 +934,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, ret = NOTIFY_STOP; break; case DIE_DEBUG: - if (post_kprobe_handler(args->regs)) + if (post_kprobe_handler(args->regs)) { + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP; ret = NOTIFY_STOP; + } break; case DIE_GPF: /* diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index c664d515f61..63b0ec8d3d4 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -34,7 +34,6 @@ struct kvm_para_state { u8 mmu_queue[MMU_QUEUE_SIZE]; int mmu_queue_len; - enum paravirt_lazy_mode mode; }; static DEFINE_PER_CPU(struct kvm_para_state, para_state); @@ -77,7 +76,7 @@ static void kvm_deferred_mmu_op(void *buffer, int len) { struct kvm_para_state *state = kvm_para_state(); - if (state->mode != PARAVIRT_LAZY_MMU) { + if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) { kvm_mmu_op(buffer, len); return; } @@ -185,10 +184,7 @@ static void kvm_release_pt(unsigned long pfn) static void kvm_enter_lazy_mmu(void) { - struct kvm_para_state *state = kvm_para_state(); - paravirt_enter_lazy_mmu(); - state->mode = paravirt_get_lazy_mode(); } static void kvm_leave_lazy_mmu(void) @@ -197,7 +193,6 @@ static void kvm_leave_lazy_mmu(void) mmu_queue_flush(state); paravirt_leave_lazy_mmu(); - state->mode = paravirt_get_lazy_mode(); } static void __init paravirt_ops_setup(void) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 223af43f152..feaeb0d3aa4 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -22,6 +22,8 @@ #include <asm/msr.h> #include <asm/apic.h> #include <linux/percpu.h> + +#include <asm/x86_init.h> #include <asm/reboot.h> #define KVM_SCALE 22 @@ -50,8 +52,8 @@ static unsigned long kvm_get_wallclock(void) struct timespec ts; int low, high; - low = (int)__pa(&wall_clock); - high = ((u64)__pa(&wall_clock) >> 32); + low = (int)__pa_symbol(&wall_clock); + high = ((u64)__pa_symbol(&wall_clock) >> 32); native_write_msr(MSR_KVM_WALL_CLOCK, low, high); vcpu_time = &get_cpu_var(hv_clock); @@ -182,12 +184,13 @@ void __init kvmclock_init(void) if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { if (kvm_register_clock("boot clock")) return; - pv_time_ops.get_wallclock = kvm_get_wallclock; - pv_time_ops.set_wallclock = kvm_set_wallclock; pv_time_ops.sched_clock = kvm_clock_read; - pv_time_ops.get_tsc_khz = kvm_get_tsc_khz; + x86_platform.calibrate_tsc = kvm_get_tsc_khz; + x86_platform.get_wallclock = kvm_get_wallclock; + x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; + x86_cpuinit.setup_percpu_clockev = + kvm_setup_secondary_clock; #endif #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 71f1d99a635..ec6ef60cbd1 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -67,8 +67,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) #ifdef CONFIG_SMP preempt_disable(); load_LDT(pc); - if (!cpus_equal(current->mm->cpu_vm_mask, - cpumask_of_cpu(smp_processor_id()))) + if (!cpumask_equal(mm_cpumask(current->mm), + cpumask_of(smp_processor_id()))) smp_call_function(flush_ldt, current->mm, 1); preempt_enable(); #else diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index c1c429d0013..c843f8406da 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -25,6 +25,7 @@ #include <asm/desc.h> #include <asm/system.h> #include <asm/cacheflush.h> +#include <asm/debugreg.h> static void set_idt(void *newidt, __u16 limit) { @@ -202,6 +203,7 @@ void machine_kexec(struct kimage *image) /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + hw_breakpoint_disable(); if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 84c3bf209e9..4a8bb82248a 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -18,6 +18,7 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> +#include <asm/debugreg.h> static int init_one_level2_page(struct kimage *image, pgd_t *pgd, unsigned long addr) @@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image) /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + hw_breakpoint_disable(); if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 366baa17991..f4c538b681c 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -317,6 +317,12 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) return UCODE_NFOUND; } + if (*(u32 *)firmware->data != UCODE_MAGIC) { + printk(KERN_ERR "microcode: invalid UCODE_MAGIC (0x%08x)\n", + *(u32 *)firmware->data); + return UCODE_ERROR; + } + ret = generic_load_microcode(cpu, firmware->data, firmware->size); release_firmware(firmware); diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 9371448290a..2bcad3926ed 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -73,7 +73,6 @@ #include <linux/platform_device.h> #include <linux/miscdevice.h> #include <linux/capability.h> -#include <linux/smp_lock.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/mutex.h> @@ -201,7 +200,6 @@ static int do_microcode_update(const void __user *buf, size_t size) static int microcode_open(struct inode *unused1, struct file *unused2) { - cycle_kernel_lock(); return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; } @@ -210,8 +208,8 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, { ssize_t ret = -EINVAL; - if ((len >> PAGE_SHIFT) > num_physpages) { - pr_err("microcode: too much data (max %ld pages)\n", num_physpages); + if ((len >> PAGE_SHIFT) > totalram_pages) { + pr_err("microcode: too much data (max %ld pages)\n", totalram_pages); return ret; } @@ -236,7 +234,7 @@ static const struct file_operations microcode_fops = { static struct miscdevice microcode_dev = { .minor = MICROCODE_MINOR, .name = "microcode", - .devnode = "cpu/microcode", + .nodename = "cpu/microcode", .fops = µcode_fops, }; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index fcd513bf284..5be95ef4ffe 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -45,6 +45,11 @@ static int __init mpf_checksum(unsigned char *mp, int len) return sum & 0xFF; } +int __init default_mpc_apic_id(struct mpc_cpu *m) +{ + return m->apicid; +} + static void __init MP_processor_info(struct mpc_cpu *m) { int apicid; @@ -55,10 +60,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) return; } - if (x86_quirks->mpc_apic_id) - apicid = x86_quirks->mpc_apic_id(m); - else - apicid = m->apicid; + apicid = x86_init.mpparse.mpc_apic_id(m); if (m->cpuflag & CPU_BOOTPROCESSOR) { bootup_cpu = " (Bootup-CPU)"; @@ -70,16 +72,18 @@ static void __init MP_processor_info(struct mpc_cpu *m) } #ifdef CONFIG_X86_IO_APIC -static void __init MP_bus_info(struct mpc_bus *m) +void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str) { - char str[7]; memcpy(str, m->bustype, 6); str[6] = 0; + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); +} - if (x86_quirks->mpc_oem_bus_info) - x86_quirks->mpc_oem_bus_info(m, str); - else - apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); +static void __init MP_bus_info(struct mpc_bus *m) +{ + char str[7]; + + x86_init.mpparse.mpc_oem_bus_info(m, str); #if MAX_MP_BUSSES < 256 if (m->busid >= MAX_MP_BUSSES) { @@ -96,8 +100,8 @@ static void __init MP_bus_info(struct mpc_bus *m) mp_bus_id_to_type[m->busid] = MP_BUS_ISA; #endif } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { - if (x86_quirks->mpc_oem_pci_bus) - x86_quirks->mpc_oem_pci_bus(m); + if (x86_init.mpparse.mpc_oem_pci_bus) + x86_init.mpparse.mpc_oem_pci_bus(m); clear_bit(m->busid, mp_bus_not_pci); #if defined(CONFIG_EISA) || defined(CONFIG_MCA) @@ -291,6 +295,8 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) 1, mpc, mpc->length, 1); } +void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } + static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) { char str[16]; @@ -312,16 +318,13 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) if (early) return 1; - if (mpc->oemptr && x86_quirks->smp_read_mpc_oem) { - struct mpc_oemtable *oem_table = (void *)(long)mpc->oemptr; - x86_quirks->smp_read_mpc_oem(oem_table, mpc->oemsize); - } + if (mpc->oemptr) + x86_init.mpparse.smp_read_mpc_oem(mpc); /* * Now process the configuration blocks. */ - if (x86_quirks->mpc_record) - *x86_quirks->mpc_record = 0; + x86_init.mpparse.mpc_record(0); while (count < mpc->length) { switch (*mpt) { @@ -353,8 +356,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) count = mpc->length; break; } - if (x86_quirks->mpc_record) - (*x86_quirks->mpc_record)++; + x86_init.mpparse.mpc_record(1); } #ifdef CONFIG_X86_BIGSMP @@ -608,7 +610,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) /* * Scan the memory blocks for an SMP configuration block. */ -static void __init __get_smp_config(unsigned int early) +void __init default_get_smp_config(unsigned int early) { struct mpf_intel *mpf = mpf_found; @@ -625,11 +627,6 @@ static void __init __get_smp_config(unsigned int early) if (acpi_lapic && acpi_ioapic) return; - if (x86_quirks->mach_get_smp_config) { - if (x86_quirks->mach_get_smp_config(early)) - return; - } - printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->specification); #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) @@ -670,16 +667,6 @@ static void __init __get_smp_config(unsigned int early) */ } -void __init early_get_smp_config(void) -{ - __get_smp_config(1); -} - -void __init get_smp_config(void) -{ - __get_smp_config(0); -} - static void __init smp_reserve_bootmem(struct mpf_intel *mpf) { unsigned long size = get_mpc_size(mpf->physptr); @@ -745,14 +732,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, return 0; } -static void __init __find_smp_config(unsigned int reserve) +void __init default_find_smp_config(unsigned int reserve) { unsigned int address; - if (x86_quirks->mach_find_smp_config) { - if (x86_quirks->mach_find_smp_config(reserve)) - return; - } /* * FIXME: Linux assumes you have 640K of base ram.. * this continues the error... @@ -787,16 +770,6 @@ static void __init __find_smp_config(unsigned int reserve) smp_scan_config(address, 0x400, reserve); } -void __init early_find_smp_config(void) -{ - __find_smp_config(0); -} - -void __init find_smp_config(void) -{ - __find_smp_config(1); -} - #ifdef CONFIG_X86_IO_APIC static u8 __initdata irq_used[MAX_IRQ_SOURCES]; diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c new file mode 100644 index 00000000000..3b7078abc87 --- /dev/null +++ b/arch/x86/kernel/mrst.c @@ -0,0 +1,24 @@ +/* + * mrst.c: Intel Moorestown platform specific setup code + * + * (C) Copyright 2008 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include <linux/init.h> + +#include <asm/setup.h> + +/* + * Moorestown specific x86_init function overrides and early setup + * calls. + */ +void __init x86_mrst_early_setup(void) +{ + x86_init.resources.probe_roms = x86_init_noop; + x86_init.resources.reserve_resources = x86_init_noop; +} diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index c0061096323..553449951b8 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -237,7 +237,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = { .notifier_call = msr_class_cpu_callback, }; -static char *msr_nodename(struct device *dev) +static char *msr_devnode(struct device *dev, mode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); } @@ -258,7 +258,7 @@ static int __init msr_init(void) err = PTR_ERR(msr_class); goto out_chrdev; } - msr_class->nodename = msr_nodename; + msr_class->devnode = msr_devnode; for_each_online_cpu(i) { err = msr_device_create(i); if (err != 0) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index f5b0b4a01fb..1b1739d1631 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -54,17 +54,12 @@ u64 _paravirt_ident_64(u64 x) return x; } -static void __init default_banner(void) +void __init default_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", pv_info.name); } -char *memory_setup(void) -{ - return pv_init_ops.memory_setup(); -} - /* Simple instruction patching code. */ #define DEF_NATIVE(ops, name, code) \ extern const char start_##ops##_##name[], end_##ops##_##name[]; \ @@ -188,11 +183,6 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len, return insn_len; } -void init_IRQ(void) -{ - pv_irq_ops.init_IRQ(); -} - static void native_flush_tlb(void) { __native_flush_tlb(); @@ -218,13 +208,6 @@ extern void native_irq_enable_sysexit(void); extern void native_usergs_sysret32(void); extern void native_usergs_sysret64(void); -static int __init print_banner(void) -{ - pv_init_ops.banner(); - return 0; -} -core_initcall(print_banner); - static struct resource reserve_ioports = { .start = 0, .end = IO_SPACE_LIMIT, @@ -320,21 +303,13 @@ struct pv_info pv_info = { struct pv_init_ops pv_init_ops = { .patch = native_patch, - .banner = default_banner, - .arch_setup = paravirt_nop, - .memory_setup = machine_specific_memory_setup, }; struct pv_time_ops pv_time_ops = { - .time_init = hpet_time_init, - .get_wallclock = native_get_wallclock, - .set_wallclock = native_set_wallclock, .sched_clock = native_sched_clock, - .get_tsc_khz = native_calibrate_tsc, }; struct pv_irq_ops pv_irq_ops = { - .init_IRQ = native_init_IRQ, .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), @@ -409,8 +384,6 @@ struct pv_cpu_ops pv_cpu_ops = { struct pv_apic_ops pv_apic_ops = { #ifdef CONFIG_X86_LOCAL_APIC - .setup_boot_clock = setup_boot_APIC_clock, - .setup_secondary_clock = setup_secondary_APIC_clock, .startup_ipi_hook = paravirt_nop, #endif }; @@ -424,13 +397,6 @@ struct pv_apic_ops pv_apic_ops = { #endif struct pv_mmu_ops pv_mmu_ops = { -#ifndef CONFIG_X86_64 - .pagetable_setup_start = native_pagetable_setup_start, - .pagetable_setup_done = native_pagetable_setup_done, -#else - .pagetable_setup_start = paravirt_nop, - .pagetable_setup_done = paravirt_nop, -#endif .read_cr2 = native_read_cr2, .write_cr2 = native_write_cr2, diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 971a3bec47a..c563e4c8ff3 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -46,6 +46,7 @@ #include <asm/dma.h> #include <asm/rio.h> #include <asm/bios_ebda.h> +#include <asm/x86_init.h> #ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT int use_calgary __read_mostly = 1; @@ -244,7 +245,7 @@ static unsigned long iommu_range_alloc(struct device *dev, if (panic_on_overflow) panic("Calgary: fix the allocator.\n"); else - return bad_dma_address; + return DMA_ERROR_CODE; } } @@ -260,12 +261,15 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, void *vaddr, unsigned int npages, int direction) { unsigned long entry; - dma_addr_t ret = bad_dma_address; + dma_addr_t ret; entry = iommu_range_alloc(dev, tbl, npages); - if (unlikely(entry == bad_dma_address)) - goto error; + if (unlikely(entry == DMA_ERROR_CODE)) { + printk(KERN_WARNING "Calgary: failed to allocate %u pages in " + "iommu %p\n", npages, tbl); + return DMA_ERROR_CODE; + } /* set the return dma address */ ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); @@ -273,13 +277,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, /* put the TCEs in the HW table */ tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, direction); - return ret; - -error: - printk(KERN_WARNING "Calgary: failed to allocate %u pages in " - "iommu %p\n", npages, tbl); - return bad_dma_address; } static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, @@ -290,8 +288,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned long flags; /* were we called with bad_dma_address? */ - badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); - if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { + badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE); + if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) { WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " "address 0x%Lx\n", dma_addr); return; @@ -318,13 +316,15 @@ static inline struct iommu_table *find_iommu_table(struct device *dev) pdev = to_pci_dev(dev); + /* search up the device tree for an iommu */ pbus = pdev->bus; - - /* is the device behind a bridge? Look for the root bus */ - while (pbus->parent) + do { + tbl = pci_iommu(pbus); + if (tbl && tbl->it_busno == pbus->number) + break; + tbl = NULL; pbus = pbus->parent; - - tbl = pci_iommu(pbus); + } while (pbus); BUG_ON(tbl && (tbl->it_busno != pbus->number)); @@ -373,7 +373,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); entry = iommu_range_alloc(dev, tbl, npages); - if (entry == bad_dma_address) { + if (entry == DMA_ERROR_CODE) { /* makes sure unmap knows to stop */ s->dma_length = 0; goto error; @@ -391,7 +391,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, error: calgary_unmap_sg(dev, sg, nelems, dir, NULL); for_each_sg(sg, s, nelems, i) { - sg->dma_address = bad_dma_address; + sg->dma_address = DMA_ERROR_CODE; sg->dma_length = 0; } return 0; @@ -446,7 +446,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, /* set up tces to cover the allocated range */ mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); - if (mapping == bad_dma_address) + if (mapping == DMA_ERROR_CODE) goto free; *dma_handle = mapping; return ret; @@ -727,7 +727,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev) struct iommu_table *tbl = pci_iommu(dev->bus); /* reserve EMERGENCY_PAGES from bad_dma_address and up */ - iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); + iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES); /* avoid the BIOS/VGA first 640KB-1MB region */ /* for CalIOC2 - avoid the entire first MB */ @@ -1344,6 +1344,23 @@ static void __init get_tce_space_from_tar(void) return; } +static int __init calgary_iommu_init(void) +{ + int ret; + + /* ok, we're trying to use Calgary - let's roll */ + printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); + + ret = calgary_init(); + if (ret) { + printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " + "falling back to no_iommu\n", ret); + return ret; + } + + return 0; +} + void __init detect_calgary(void) { int bus; @@ -1357,7 +1374,7 @@ void __init detect_calgary(void) * if the user specified iommu=off or iommu=soft or we found * another HW IOMMU already, bail out. */ - if (swiotlb || no_iommu || iommu_detected) + if (no_iommu || iommu_detected) return; if (!use_calgary) @@ -1442,9 +1459,7 @@ void __init detect_calgary(void) printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", specified_table_size); - /* swiotlb for devices that aren't behind the Calgary. */ - if (max_pfn > MAX_DMA32_PFN) - swiotlb = 1; + x86_init.iommu.iommu_init = calgary_iommu_init; } return; @@ -1457,35 +1472,6 @@ cleanup: } } -int __init calgary_iommu_init(void) -{ - int ret; - - if (no_iommu || (swiotlb && !calgary_detected)) - return -ENODEV; - - if (!calgary_detected) - return -ENODEV; - - /* ok, we're trying to use Calgary - let's roll */ - printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); - - ret = calgary_init(); - if (ret) { - printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " - "falling back to no_iommu\n", ret); - return ret; - } - - force_iommu = 1; - bad_dma_address = 0x0; - /* dma_ops is set to swiotlb or nommu */ - if (!dma_ops) - dma_ops = &nommu_dma_ops; - - return 0; -} - static int __init calgary_parse_options(char *p) { unsigned int bridge; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index d71c8655905..afcc58b69c7 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -11,10 +11,11 @@ #include <asm/gart.h> #include <asm/calgary.h> #include <asm/amd_iommu.h> +#include <asm/x86_init.h> static int forbid_dac __read_mostly; -struct dma_map_ops *dma_ops; +struct dma_map_ops *dma_ops = &nommu_dma_ops; EXPORT_SYMBOL(dma_ops); static int iommu_sac_force __read_mostly; @@ -35,22 +36,17 @@ int iommu_detected __read_mostly = 0; /* * This variable becomes 1 if iommu=pt is passed on the kernel command line. - * If this variable is 1, IOMMU implementations do no DMA ranslation for + * If this variable is 1, IOMMU implementations do no DMA translation for * devices and allow every device to access to whole physical memory. This is * useful if a user want to use an IOMMU only for KVM device assignment to * guests and not for driver dma translation. */ int iommu_pass_through __read_mostly; -dma_addr_t bad_dma_address __read_mostly = 0; -EXPORT_SYMBOL(bad_dma_address); - -/* Dummy device used for NULL arguments (normally ISA). Better would - be probably a smaller DMA mask, but this is bug-to-bug compatible - to older i386. */ +/* Dummy device used for NULL arguments (normally ISA). */ struct device x86_dma_fallback_dev = { .init_name = "fallback device", - .coherent_dma_mask = DMA_BIT_MASK(32), + .coherent_dma_mask = ISA_DMA_BIT_MASK, .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, }; EXPORT_SYMBOL(x86_dma_fallback_dev); @@ -128,20 +124,17 @@ void __init pci_iommu_alloc(void) /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); #endif + if (pci_swiotlb_init()) + return; - /* - * The order of these functions is important for - * fall-back/fail-over reasons - */ gart_iommu_hole_init(); detect_calgary(); detect_intel_iommu(); + /* needs to be called after gart_iommu_hole_init */ amd_iommu_detect(); - - pci_swiotlb_init(); } void *dma_generic_alloc_coherent(struct device *dev, size_t size, @@ -216,7 +209,7 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "allowdac", 8)) forbid_dac = 0; if (!strncmp(p, "nodac", 5)) - forbid_dac = -1; + forbid_dac = 1; if (!strncmp(p, "usedac", 6)) { forbid_dac = -1; return 1; @@ -225,10 +218,8 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "soft", 4)) swiotlb = 1; #endif - if (!strncmp(p, "pt", 2)) { + if (!strncmp(p, "pt", 2)) iommu_pass_through = 1; - return 1; - } gart_parse_options(p); @@ -293,27 +284,19 @@ static int __init pci_iommu_init(void) #ifdef CONFIG_PCI dma_debug_add_bus(&pci_bus_type); #endif + x86_init.iommu.iommu_init(); - calgary_iommu_init(); + if (swiotlb) { + printk(KERN_INFO "PCI-DMA: " + "Using software bounce buffering for IO (SWIOTLB)\n"); + swiotlb_print_info(); + } else + swiotlb_free(); - intel_iommu_init(); - - amd_iommu_init(); - - gart_iommu_init(); - - no_iommu_init(); return 0; } - -void pci_iommu_shutdown(void) -{ - gart_iommu_shutdown(); - - amd_iommu_shutdown(); -} /* Must execute after PCI subsystem */ -fs_initcall(pci_iommu_init); +rootfs_initcall(pci_iommu_init); #ifdef CONFIG_PCI /* Many VIA bridges seem to corrupt data for DAC. Disable it here */ diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 98a827ee9ed..e6a0d402f17 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -16,6 +16,7 @@ #include <linux/agp_backend.h> #include <linux/init.h> #include <linux/mm.h> +#include <linux/sched.h> #include <linux/string.h> #include <linux/spinlock.h> #include <linux/pci.h> @@ -38,6 +39,7 @@ #include <asm/swiotlb.h> #include <asm/dma.h> #include <asm/k8.h> +#include <asm/x86_init.h> static unsigned long iommu_bus_base; /* GART remapping area (physical) */ static unsigned long iommu_size; /* size of remapping area bytes */ @@ -45,6 +47,8 @@ static unsigned long iommu_pages; /* .. and in pages */ static u32 *iommu_gatt_base; /* Remapping table */ +static dma_addr_t bad_dma_addr; + /* * If this is disabled the IOMMU will use an optimized flushing strategy * of only flushing when an mapping is reused. With it true the GART is @@ -91,7 +95,7 @@ static unsigned long alloc_iommu(struct device *dev, int size, base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), PAGE_SIZE) >> PAGE_SHIFT; - boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1, + boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1, PAGE_SIZE) >> PAGE_SHIFT; spin_lock_irqsave(&iommu_bitmap_lock, flags); @@ -215,7 +219,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, if (panic_on_overflow) panic("dma_map_area overflow %lu bytes\n", size); iommu_full(dev, size, dir); - return bad_dma_address; + return bad_dma_addr; } for (i = 0; i < npages; i++) { @@ -293,7 +297,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, int i; #ifdef CONFIG_IOMMU_DEBUG - printk(KERN_DEBUG "dma_map_sg overflow\n"); + pr_debug("dma_map_sg overflow\n"); #endif for_each_sg(sg, s, nents, i) { @@ -301,7 +305,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, if (nonforced_iommu(dev, addr, s->length)) { addr = dma_map_area(dev, addr, s->length, dir, 0); - if (addr == bad_dma_address) { + if (addr == bad_dma_addr) { if (i > 0) gart_unmap_sg(dev, sg, i, dir, NULL); nents = 0; @@ -388,12 +392,14 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, if (!dev) dev = &x86_dma_fallback_dev; - out = 0; - start = 0; - start_sg = sgmap = sg; - seg_size = 0; - max_seg_size = dma_get_max_seg_size(dev); - ps = NULL; /* shut up gcc */ + out = 0; + start = 0; + start_sg = sg; + sgmap = sg; + seg_size = 0; + max_seg_size = dma_get_max_seg_size(dev); + ps = NULL; /* shut up gcc */ + for_each_sg(sg, s, nents, i) { dma_addr_t addr = sg_phys(s); @@ -416,11 +422,12 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, sgmap, pages, need) < 0) goto error; out++; - seg_size = 0; - sgmap = sg_next(sgmap); - pages = 0; - start = i; - start_sg = s; + + seg_size = 0; + sgmap = sg_next(sgmap); + pages = 0; + start = i; + start_sg = s; } } @@ -454,7 +461,7 @@ error: iommu_full(dev, pages << PAGE_SHIFT, dir); for_each_sg(sg, s, nents, i) - s->dma_address = bad_dma_address; + s->dma_address = bad_dma_addr; return 0; } @@ -478,7 +485,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, DMA_BIDIRECTIONAL, align_mask); flush_gart(); - if (paddr != bad_dma_address) { + if (paddr != bad_dma_addr) { *dma_addr = paddr; return page_address(page); } @@ -498,6 +505,11 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr, free_pages((unsigned long)vaddr, get_order(size)); } +static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return (dma_addr == bad_dma_addr); +} + static int no_agp; static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) @@ -514,7 +526,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; if (iommu_size < 64*1024*1024) { - printk(KERN_WARNING + pr_warning( "PCI-DMA: Warning: Small IOMMU %luMB." " Consider increasing the AGP aperture in BIOS\n", iommu_size >> 20); @@ -569,28 +581,32 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc) aperture_alloc = aper_alloc; } -static int gart_resume(struct sys_device *dev) +static void gart_fixup_northbridges(struct sys_device *dev) { - printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n"); + int i; - if (fix_up_north_bridges) { - int i; + if (!fix_up_north_bridges) + return; - printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n"); + pr_info("PCI-DMA: Restoring GART aperture settings\n"); - for (i = 0; i < num_k8_northbridges; i++) { - struct pci_dev *dev = k8_northbridges[i]; + for (i = 0; i < num_k8_northbridges; i++) { + struct pci_dev *dev = k8_northbridges[i]; - /* - * Don't enable translations just yet. That is the next - * step. Restore the pre-suspend aperture settings. - */ - pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, - aperture_order << 1); - pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, - aperture_alloc >> 25); - } + /* + * Don't enable translations just yet. That is the next + * step. Restore the pre-suspend aperture settings. + */ + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1); + pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); } +} + +static int gart_resume(struct sys_device *dev) +{ + pr_info("PCI-DMA: Resuming GART IOMMU\n"); + + gart_fixup_northbridges(dev); enable_gart_translations(); @@ -603,15 +619,14 @@ static int gart_suspend(struct sys_device *dev, pm_message_t state) } static struct sysdev_class gart_sysdev_class = { - .name = "gart", - .suspend = gart_suspend, - .resume = gart_resume, + .name = "gart", + .suspend = gart_suspend, + .resume = gart_resume, }; static struct sys_device device_gart = { - .id = 0, - .cls = &gart_sysdev_class, + .cls = &gart_sysdev_class, }; /* @@ -626,7 +641,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) void *gatt; int i, error; - printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); + pr_info("PCI-DMA: Disabling AGP.\n"); + aper_size = aper_base = info->aper_size = 0; dev = NULL; for (i = 0; i < num_k8_northbridges; i++) { @@ -644,6 +660,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) } if (!aper_base) goto nommu; + info->aper_base = aper_base; info->aper_size = aper_size >> 20; @@ -666,14 +683,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info) flush_gart(); - printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", + pr_info("PCI-DMA: aperture base @ %x size %u KB\n", aper_base, aper_size>>10); return 0; nommu: /* Should not happen anymore */ - printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" + pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n" "falling back to iommu=soft.\n"); return -1; } @@ -685,14 +702,15 @@ static struct dma_map_ops gart_dma_ops = { .unmap_page = gart_unmap_page, .alloc_coherent = gart_alloc_coherent, .free_coherent = gart_free_coherent, + .mapping_error = gart_mapping_error, }; -void gart_iommu_shutdown(void) +static void gart_iommu_shutdown(void) { struct pci_dev *dev; int i; - if (no_agp && (dma_ops != &gart_dma_ops)) + if (no_agp) return; for (i = 0; i < num_k8_northbridges; i++) { @@ -707,7 +725,7 @@ void gart_iommu_shutdown(void) } } -void __init gart_iommu_init(void) +int __init gart_iommu_init(void) { struct agp_kern_info info; unsigned long iommu_start; @@ -717,7 +735,7 @@ void __init gart_iommu_init(void) long i; if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) - return; + return 0; #ifndef CONFIG_AGP_AMD64 no_agp = 1; @@ -729,35 +747,28 @@ void __init gart_iommu_init(void) (agp_copy_info(agp_bridge, &info) < 0); #endif - if (swiotlb) - return; - - /* Did we detect a different HW IOMMU? */ - if (iommu_detected && !gart_iommu_aperture) - return; - if (no_iommu || (!force_iommu && max_pfn <= MAX_DMA32_PFN) || !gart_iommu_aperture || (no_agp && init_k8_gatt(&info) < 0)) { if (max_pfn > MAX_DMA32_PFN) { - printk(KERN_WARNING "More than 4GB of memory " - "but GART IOMMU not available.\n"); - printk(KERN_WARNING "falling back to iommu=soft.\n"); + pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); + pr_warning("falling back to iommu=soft.\n"); } - return; + return 0; } /* need to map that range */ - aper_size = info.aper_size << 20; - aper_base = info.aper_base; - end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); + aper_size = info.aper_size << 20; + aper_base = info.aper_base; + end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); + if (end_pfn > max_low_pfn_mapped) { start_pfn = (aper_base>>PAGE_SHIFT); init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); } - printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); + pr_info("PCI-DMA: using GART IOMMU.\n"); iommu_size = check_iommu_size(info.aper_base, aper_size); iommu_pages = iommu_size >> PAGE_SHIFT; @@ -772,8 +783,7 @@ void __init gart_iommu_init(void) ret = dma_debug_resize_entries(iommu_pages); if (ret) - printk(KERN_DEBUG - "PCI-DMA: Cannot trace all the entries\n"); + pr_debug("PCI-DMA: Cannot trace all the entries\n"); } #endif @@ -783,15 +793,14 @@ void __init gart_iommu_init(void) */ iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES); - agp_memory_reserved = iommu_size; - printk(KERN_INFO - "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", + pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", iommu_size >> 20); - iommu_start = aper_size - iommu_size; - iommu_bus_base = info.aper_base + iommu_start; - bad_dma_address = iommu_bus_base; - iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); + agp_memory_reserved = iommu_size; + iommu_start = aper_size - iommu_size; + iommu_bus_base = info.aper_base + iommu_start; + bad_dma_addr = iommu_bus_base; + iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); /* * Unmap the IOMMU part of the GART. The alias of the page is @@ -813,7 +822,7 @@ void __init gart_iommu_init(void) * the pages as Not-Present: */ wbinvd(); - + /* * Now all caches are flushed and we can safely enable * GART hardware. Doing it early leaves the possibility @@ -837,6 +846,10 @@ void __init gart_iommu_init(void) flush_gart(); dma_ops = &gart_dma_ops; + x86_platform.iommu_shutdown = gart_iommu_shutdown; + swiotlb = 0; + + return 0; } void __init gart_parse_options(char *p) @@ -855,7 +868,7 @@ void __init gart_parse_options(char *p) #endif if (isdigit(*p) && get_option(&p, &arg)) iommu_size = arg; - if (!strncmp(p, "fullflush", 8)) + if (!strncmp(p, "fullflush", 9)) iommu_fullflush = 1; if (!strncmp(p, "nofullflush", 11)) iommu_fullflush = 0; diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index a3933d4330c..22be12b60a8 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -33,7 +33,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, dma_addr_t bus = page_to_phys(page) + offset; WARN_ON(size == 0); if (!check_addr("map_single", dev, bus, size)) - return bad_dma_address; + return DMA_ERROR_CODE; flush_write_buffers(); return bus; } @@ -103,12 +103,3 @@ struct dma_map_ops nommu_dma_ops = { .sync_sg_for_device = nommu_sync_sg_for_device, .is_phys = 1, }; - -void __init no_iommu_init(void) -{ - if (dma_ops) - return; - - force_iommu = 0; /* no HW IOMMU */ - dma_ops = &nommu_dma_ops; -} diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index e8a35016115..e3c0a66b9e7 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -42,19 +42,28 @@ static struct dma_map_ops swiotlb_dma_ops = { .dma_supported = NULL, }; -void __init pci_swiotlb_init(void) +/* + * pci_swiotlb_init - initialize swiotlb if necessary + * + * This returns non-zero if we are forced to use swiotlb (by the boot + * option). + */ +int __init pci_swiotlb_init(void) { + int use_swiotlb = swiotlb | swiotlb_force; + /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 - if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) || - iommu_pass_through) - swiotlb = 1; + if (!no_iommu && max_pfn > MAX_DMA32_PFN) + swiotlb = 1; #endif if (swiotlb_force) swiotlb = 1; + if (swiotlb) { - printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); - swiotlb_init(); + swiotlb_init(0); dma_ops = &swiotlb_dma_ops; } + + return use_swiotlb; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 071166a4ba8..744508e7cfd 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -9,7 +9,8 @@ #include <linux/pm.h> #include <linux/clockchips.h> #include <linux/random.h> -#include <trace/power.h> +#include <trace/events/power.h> +#include <linux/hw_breakpoint.h> #include <asm/system.h> #include <asm/apic.h> #include <asm/syscalls.h> @@ -17,6 +18,7 @@ #include <asm/uaccess.h> #include <asm/i387.h> #include <asm/ds.h> +#include <asm/debugreg.h> unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -25,9 +27,6 @@ EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; -DEFINE_TRACE(power_start); -DEFINE_TRACE(power_end); - int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; @@ -106,14 +105,7 @@ void flush_thread(void) } #endif - clear_tsk_thread_flag(tsk, TIF_DEBUG); - - tsk->thread.debugreg0 = 0; - tsk->thread.debugreg1 = 0; - tsk->thread.debugreg2 = 0; - tsk->thread.debugreg3 = 0; - tsk->thread.debugreg6 = 0; - tsk->thread.debugreg7 = 0; + flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* * Forget coprocessor state.. @@ -195,16 +187,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); - if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { - set_debugreg(next->debugreg0, 0); - set_debugreg(next->debugreg1, 1); - set_debugreg(next->debugreg2, 2); - set_debugreg(next->debugreg3, 3); - /* no 4 and 5 */ - set_debugreg(next->debugreg6, 6); - set_debugreg(next->debugreg7, 7); - } - if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ test_tsk_thread_flag(next_p, TIF_NOTSC)) { /* prev and next are different */ @@ -299,9 +281,7 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -314,7 +294,6 @@ void default_idle(void) else local_irq_enable(); current_thread_info()->status |= TS_POLLING; - trace_power_end(&it); } else { local_irq_enable(); /* loop is done by the caller */ @@ -372,9 +351,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, (ax>>4)+1); + trace_power_start(POWER_CSTATE, (ax>>4)+1); if (!need_resched()) { if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -384,15 +361,13 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) if (!need_resched()) __mwait(ax, cx); } - trace_power_end(&it); } /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { - struct power_trace it; if (!need_resched()) { - trace_power_start(&it, POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1); if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -402,7 +377,6 @@ static void mwait_idle(void) __sti_mwait(0, 0); else local_irq_enable(); - trace_power_end(&it); } else local_irq_enable(); } @@ -414,13 +388,11 @@ static void mwait_idle(void) */ static void poll_idle(void) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, 0); + trace_power_start(POWER_CSTATE, 0); local_irq_enable(); while (!need_resched()) cpu_relax(); - trace_power_end(&it); + trace_power_end(0); } /* @@ -568,10 +540,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) void __init init_c1e_mask(void) { /* If we're using c1e_idle, we need to allocate c1e_mask. */ - if (pm_idle == c1e_idle) { - alloc_cpumask_var(&c1e_mask, GFP_KERNEL); - cpumask_clear(c1e_mask); - } + if (pm_idle == c1e_idle) + zalloc_cpumask_var(&c1e_mask, GFP_KERNEL); } static int __init idle_setup(char *str) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4cf79567cda..540140284f6 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -58,6 +58,7 @@ #include <asm/idle.h> #include <asm/syscalls.h> #include <asm/ds.h> +#include <asm/debugreg.h> asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -134,7 +135,7 @@ void __show_regs(struct pt_regs *regs, int all) ss = regs->ss & 0xffff; gs = get_user_gs(regs); } else { - sp = (unsigned long) (®s->sp); + sp = kernel_stack_pointer(regs); savesegment(ss, ss); savesegment(gs, gs); } @@ -259,7 +260,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, task_user_gs(p) = get_user_gs(regs); + p->thread.io_bitmap_ptr = NULL; tsk = current; + err = -ENOMEM; + + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); + if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, IO_BITMAP_BYTES, GFP_KERNEL); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ad535b68317..70cf15873f3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,6 +52,7 @@ #include <asm/idle.h> #include <asm/syscalls.h> #include <asm/ds.h> +#include <asm/debugreg.h> asmlinkage extern void ret_from_fork(void); @@ -297,12 +298,16 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.fs = me->thread.fs; p->thread.gs = me->thread.gs; + p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); savesegment(fs, p->thread.fsindex); savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); + err = -ENOMEM; + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { @@ -341,6 +346,7 @@ out: kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } + return err; } @@ -495,6 +501,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ if (preload_fpu) __math_state_restore(); + return prev_p; } @@ -664,3 +671,8 @@ long sys_arch_prctl(int code, unsigned long addr) return do_arch_prctl(current, code, addr); } +unsigned long KSTK_ESP(struct task_struct *task) +{ + return (test_tsk_thread_flag(task, TIF_IA32)) ? + (task_pt_regs(task)->sp) : ((task)->thread.usersp); +} diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 8d7d5c9c1be..04d182a7cfd 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -22,6 +22,8 @@ #include <linux/seccomp.h> #include <linux/signal.h> #include <linux/workqueue.h> +#include <linux/perf_event.h> +#include <linux/hw_breakpoint.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -34,6 +36,7 @@ #include <asm/prctl.h> #include <asm/proto.h> #include <asm/ds.h> +#include <asm/hw_breakpoint.h> #include "tls.h" @@ -49,6 +52,118 @@ enum x86_regset { REGSET_IOPERM32, }; +struct pt_regs_offset { + const char *name; + int offset; +}; + +#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)} +#define REG_OFFSET_END {.name = NULL, .offset = 0} + +static const struct pt_regs_offset regoffset_table[] = { +#ifdef CONFIG_X86_64 + REG_OFFSET_NAME(r15), + REG_OFFSET_NAME(r14), + REG_OFFSET_NAME(r13), + REG_OFFSET_NAME(r12), + REG_OFFSET_NAME(r11), + REG_OFFSET_NAME(r10), + REG_OFFSET_NAME(r9), + REG_OFFSET_NAME(r8), +#endif + REG_OFFSET_NAME(bx), + REG_OFFSET_NAME(cx), + REG_OFFSET_NAME(dx), + REG_OFFSET_NAME(si), + REG_OFFSET_NAME(di), + REG_OFFSET_NAME(bp), + REG_OFFSET_NAME(ax), +#ifdef CONFIG_X86_32 + REG_OFFSET_NAME(ds), + REG_OFFSET_NAME(es), + REG_OFFSET_NAME(fs), + REG_OFFSET_NAME(gs), +#endif + REG_OFFSET_NAME(orig_ax), + REG_OFFSET_NAME(ip), + REG_OFFSET_NAME(cs), + REG_OFFSET_NAME(flags), + REG_OFFSET_NAME(sp), + REG_OFFSET_NAME(ss), + REG_OFFSET_END, +}; + +/** + * regs_query_register_offset() - query register offset from its name + * @name: the name of a register + * + * regs_query_register_offset() returns the offset of a register in struct + * pt_regs from its name. If the name is invalid, this returns -EINVAL; + */ +int regs_query_register_offset(const char *name) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (!strcmp(roff->name, name)) + return roff->offset; + return -EINVAL; +} + +/** + * regs_query_register_name() - query register name from its offset + * @offset: the offset of a register in struct pt_regs. + * + * regs_query_register_name() returns the name of a register from its + * offset in struct pt_regs. If the @offset is invalid, this returns NULL; + */ +const char *regs_query_register_name(unsigned int offset) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (roff->offset == offset) + return roff->name; + return NULL; +} + +static const int arg_offs_table[] = { +#ifdef CONFIG_X86_32 + [0] = offsetof(struct pt_regs, ax), + [1] = offsetof(struct pt_regs, dx), + [2] = offsetof(struct pt_regs, cx) +#else /* CONFIG_X86_64 */ + [0] = offsetof(struct pt_regs, di), + [1] = offsetof(struct pt_regs, si), + [2] = offsetof(struct pt_regs, dx), + [3] = offsetof(struct pt_regs, cx), + [4] = offsetof(struct pt_regs, r8), + [5] = offsetof(struct pt_regs, r9) +#endif +}; + +/** + * regs_get_argument_nth() - get Nth argument at function call + * @regs: pt_regs which contains registers at function entry. + * @n: argument number. + * + * regs_get_argument_nth() returns @n th argument of a function call. + * Since usually the kernel stack will be changed right after function entry, + * you must use this at function entry. If the @n th entry is NOT in the + * kernel stack or pt_regs, this returns 0. + */ +unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n) +{ + if (n < ARRAY_SIZE(arg_offs_table)) + return *(unsigned long *)((char *)regs + arg_offs_table[n]); + else { + /* + * The typical case: arg n is on the stack. + * (Note: stack[0] = return address, so skip it) + */ + n -= ARRAY_SIZE(arg_offs_table); + return regs_get_kernel_stack_nth(regs, 1 + n); + } +} + /* * does not yet catch signals sent when the child dies. * in exit.c or in signal.c. @@ -137,11 +252,6 @@ static int set_segment_reg(struct task_struct *task, return 0; } -static unsigned long debugreg_addr_limit(struct task_struct *task) -{ - return TASK_SIZE - 3; -} - #else /* CONFIG_X86_64 */ #define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) @@ -266,15 +376,6 @@ static int set_segment_reg(struct task_struct *task, return 0; } -static unsigned long debugreg_addr_limit(struct task_struct *task) -{ -#ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(task, TIF_IA32)) - return IA32_PAGE_OFFSET - 3; -#endif - return TASK_SIZE_MAX - 7; -} - #endif /* CONFIG_X86_32 */ static unsigned long get_flags(struct task_struct *task) @@ -325,16 +426,6 @@ static int putreg(struct task_struct *child, return set_flags(child, value); #ifdef CONFIG_X86_64 - /* - * Orig_ax is really just a flag with small positive and - * negative values, so make sure to always sign-extend it - * from 32 bits so that it works correctly regardless of - * whether we come from a 32-bit environment or not. - */ - case offsetof(struct user_regs_struct, orig_ax): - value = (long) (s32) value; - break; - case offsetof(struct user_regs_struct,fs_base): if (value >= TASK_SIZE_OF(child)) return -EIO; @@ -464,99 +555,239 @@ static int genregs_set(struct task_struct *target, return ret; } +static void ptrace_triggered(struct perf_event *bp, void *data) +{ + int i; + struct thread_struct *thread = &(current->thread); + + /* + * Store in the virtual DR6 register the fact that the breakpoint + * was hit so the thread's debugger will see it. + */ + for (i = 0; i < HBP_NUM; i++) { + if (thread->ptrace_bps[i] == bp) + break; + } + + thread->debugreg6 |= (DR_TRAP0 << i); +} + /* - * This function is trivial and will be inlined by the compiler. - * Having it separates the implementation details of debug - * registers from the interface details of ptrace. + * Walk through every ptrace breakpoints for this thread and + * build the dr7 value on top of their attributes. + * */ -static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) +static unsigned long ptrace_get_dr7(struct perf_event *bp[]) { - switch (n) { - case 0: return child->thread.debugreg0; - case 1: return child->thread.debugreg1; - case 2: return child->thread.debugreg2; - case 3: return child->thread.debugreg3; - case 6: return child->thread.debugreg6; - case 7: return child->thread.debugreg7; + int i; + int dr7 = 0; + struct arch_hw_breakpoint *info; + + for (i = 0; i < HBP_NUM; i++) { + if (bp[i] && !bp[i]->attr.disabled) { + info = counter_arch_bp(bp[i]); + dr7 |= encode_dr7(i, info->len, info->type); + } } - return 0; + + return dr7; } -static int ptrace_set_debugreg(struct task_struct *child, - int n, unsigned long data) +static struct perf_event * +ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, + struct task_struct *tsk, int disabled) { - int i; + int err; + int gen_len, gen_type; + DEFINE_BREAKPOINT_ATTR(attr); - if (unlikely(n == 4 || n == 5)) - return -EIO; + /* + * We shoud have at least an inactive breakpoint at this + * slot. It means the user is writing dr7 without having + * written the address register first + */ + if (!bp) + return ERR_PTR(-EINVAL); - if (n < 4 && unlikely(data >= debugreg_addr_limit(child))) - return -EIO; + err = arch_bp_generic_fields(len, type, &gen_len, &gen_type); + if (err) + return ERR_PTR(err); - switch (n) { - case 0: child->thread.debugreg0 = data; break; - case 1: child->thread.debugreg1 = data; break; - case 2: child->thread.debugreg2 = data; break; - case 3: child->thread.debugreg3 = data; break; + attr = bp->attr; + attr.bp_len = gen_len; + attr.bp_type = gen_type; + attr.disabled = disabled; - case 6: - if ((data & ~0xffffffffUL) != 0) - return -EIO; - child->thread.debugreg6 = data; - break; + return modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); +} + +/* + * Handle ptrace writes to debug register 7. + */ +static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) +{ + struct thread_struct *thread = &(tsk->thread); + unsigned long old_dr7; + int i, orig_ret = 0, rc = 0; + int enabled, second_pass = 0; + unsigned len, type; + struct perf_event *bp; + + data &= ~DR_CONTROL_RESERVED; + old_dr7 = ptrace_get_dr7(thread->ptrace_bps); +restore: + /* + * Loop through all the hardware breakpoints, making the + * appropriate changes to each. + */ + for (i = 0; i < HBP_NUM; i++) { + enabled = decode_dr7(data, i, &len, &type); + bp = thread->ptrace_bps[i]; + + if (!enabled) { + if (bp) { + /* + * Don't unregister the breakpoints right-away, + * unless all register_user_hw_breakpoint() + * requests have succeeded. This prevents + * any window of opportunity for debug + * register grabbing by other users. + */ + if (!second_pass) + continue; + + thread->ptrace_bps[i] = NULL; + bp = ptrace_modify_breakpoint(bp, len, type, + tsk, 1); + if (IS_ERR(bp)) { + rc = PTR_ERR(bp); + thread->ptrace_bps[i] = NULL; + break; + } + thread->ptrace_bps[i] = bp; + } + continue; + } - case 7: + bp = ptrace_modify_breakpoint(bp, len, type, tsk, 0); + + /* Incorrect bp, or we have a bug in bp API */ + if (IS_ERR(bp)) { + rc = PTR_ERR(bp); + thread->ptrace_bps[i] = NULL; + break; + } + thread->ptrace_bps[i] = bp; + } + /* + * Make a second pass to free the remaining unused breakpoints + * or to restore the original breakpoints if an error occurred. + */ + if (!second_pass) { + second_pass = 1; + if (rc < 0) { + orig_ret = rc; + data = old_dr7; + } + goto restore; + } + return ((orig_ret < 0) ? orig_ret : rc); +} + +/* + * Handle PTRACE_PEEKUSR calls for the debug register area. + */ +static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) +{ + struct thread_struct *thread = &(tsk->thread); + unsigned long val = 0; + + if (n < HBP_NUM) { + struct perf_event *bp; + bp = thread->ptrace_bps[n]; + if (!bp) + return 0; + val = bp->hw.info.address; + } else if (n == 6) { + val = thread->debugreg6; + } else if (n == 7) { + val = ptrace_get_dr7(thread->ptrace_bps); + } + return val; +} + +static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, + unsigned long addr) +{ + struct perf_event *bp; + struct thread_struct *t = &tsk->thread; + DEFINE_BREAKPOINT_ATTR(attr); + + if (!t->ptrace_bps[nr]) { /* - * Sanity-check data. Take one half-byte at once with - * check = (val >> (16 + 4*i)) & 0xf. It contains the - * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits - * 2 and 3 are LENi. Given a list of invalid values, - * we do mask |= 1 << invalid_value, so that - * (mask >> check) & 1 is a correct test for invalid - * values. - * - * R/Wi contains the type of the breakpoint / - * watchpoint, LENi contains the length of the watched - * data in the watchpoint case. - * - * The invalid values are: - * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit] - * - R/Wi == 0x10 (break on I/O reads or writes), so - * mask |= 0x4444. - * - R/Wi == 0x00 && LENi != 0x00, so we have mask |= - * 0x1110. - * - * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54. - * - * See the Intel Manual "System Programming Guide", - * 15.2.4 - * - * Note that LENi == 0x10 is defined on x86_64 in long - * mode (i.e. even for 32-bit userspace software, but - * 64-bit kernel), so the x86_64 mask value is 0x5454. - * See the AMD manual no. 24593 (AMD64 System Programming) + * Put stub len and type to register (reserve) an inactive but + * correct bp */ -#ifdef CONFIG_X86_32 -#define DR7_MASK 0x5f54 -#else -#define DR7_MASK 0x5554 -#endif - data &= ~DR_CONTROL_RESERVED; - for (i = 0; i < 4; i++) - if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1) - return -EIO; - child->thread.debugreg7 = data; - if (data) - set_tsk_thread_flag(child, TIF_DEBUG); - else - clear_tsk_thread_flag(child, TIF_DEBUG); - break; + attr.bp_addr = addr; + attr.bp_len = HW_BREAKPOINT_LEN_1; + attr.bp_type = HW_BREAKPOINT_W; + attr.disabled = 1; + + bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk); + } else { + bp = t->ptrace_bps[nr]; + t->ptrace_bps[nr] = NULL; + + attr = bp->attr; + attr.bp_addr = addr; + bp = modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); } + /* + * CHECKME: the previous code returned -EIO if the addr wasn't a + * valid task virtual addr. The new one will return -EINVAL in this + * case. + * -EINVAL may be what we want for in-kernel breakpoints users, but + * -EIO looks better for ptrace, since we refuse a register writing + * for the user. And anyway this is the previous behaviour. + */ + if (IS_ERR(bp)) + return PTR_ERR(bp); + + t->ptrace_bps[nr] = bp; return 0; } /* + * Handle PTRACE_POKEUSR calls for the debug register area. + */ +int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) +{ + struct thread_struct *thread = &(tsk->thread); + int rc = 0; + + /* There are no DR4 or DR5 registers */ + if (n == 4 || n == 5) + return -EIO; + + if (n == 6) { + thread->debugreg6 = val; + goto ret_path; + } + if (n < HBP_NUM) { + rc = ptrace_set_breakpoint_addr(tsk, n, val); + if (rc) + return rc; + } + /* All that's left is DR7 */ + if (n == 7) + rc = ptrace_write_dr7(tsk, val); + +ret_path: + return rc; +} + +/* * These access the current or another (stopped) task's io permission * bitmap for debugging or core dump. */ @@ -1126,10 +1357,15 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value) case offsetof(struct user32, regs.orig_eax): /* - * Sign-extend the value so that orig_eax = -1 - * causes (long)orig_ax < 0 tests to fire correctly. + * A 32-bit debugger setting orig_eax means to restore + * the state of the task restarting a 32-bit syscall. + * Make sure we interpret the -ERESTART* codes correctly + * in case the task is not actually still sitting at the + * exit from a 32-bit syscall with TS_COMPAT still set. */ - regs->orig_ax = (long) (s32) value; + regs->orig_ax = value; + if (syscall_get_nr(child, regs) >= 0) + task_thread_info(child)->status |= TS_COMPAT; break; case offsetof(struct user32, regs.eflags): diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index af71d06624b..6c3b2c6fd77 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -508,7 +508,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev) pci_read_config_dword(nb_ht, 0x60, &val); set_dev_node(&dev->dev, val & 7); - pci_dev_put(dev); + pci_dev_put(nb_ht); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index a06e8d10184..2b97fc5b124 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -4,6 +4,8 @@ #include <linux/pm.h> #include <linux/efi.h> #include <linux/dmi.h> +#include <linux/sched.h> +#include <linux/tboot.h> #include <acpi/reboot.h> #include <asm/io.h> #include <asm/apic.h> @@ -21,7 +23,7 @@ # include <linux/ctype.h> # include <linux/mc146818rtc.h> #else -# include <asm/iommu.h> +# include <asm/x86_init.h> #endif /* @@ -434,6 +436,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), }, }, + { /* Handle problems with rebooting on Apple Macmini3,1 */ + .callback = set_pci_reboot, + .ident = "Apple Macmini3,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"), + }, + }, { } }; @@ -508,6 +518,8 @@ static void native_machine_emergency_restart(void) if (reboot_emergency) emergency_vmx_disable_all(); + tboot_shutdown(TB_SHUTDOWN_REBOOT); + /* Tell the BIOS if we want cold or warm reboot */ *((unsigned short *)__va(0x472)) = reboot_mode; @@ -610,7 +622,7 @@ void native_machine_shutdown(void) #endif #ifdef CONFIG_X86_64 - pci_iommu_shutdown(); + x86_platform.iommu_shutdown(); #endif } @@ -634,6 +646,8 @@ static void native_machine_halt(void) /* stop other cpus and apics */ machine_shutdown(); + tboot_shutdown(TB_SHUTDOWN_HALT); + /* stop this cpu */ stop_this_cpu(NULL); } @@ -645,6 +659,8 @@ static void native_machine_power_off(void) machine_shutdown(); pm_power_off(); } + /* a fallback in case there is no PM info available */ + tboot_shutdown(TB_SHUTDOWN_HALT); } struct machine_ops machine_ops = { diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 5d465b207e7..1cfbbfc3ae2 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -8,6 +8,7 @@ #include <linux/pnp.h> #include <asm/vsyscall.h> +#include <asm/x86_init.h> #include <asm/time.h> #ifdef CONFIG_X86_32 @@ -165,33 +166,29 @@ void rtc_cmos_write(unsigned char val, unsigned char addr) } EXPORT_SYMBOL(rtc_cmos_write); -static int set_rtc_mmss(unsigned long nowtime) +int update_persistent_clock(struct timespec now) { unsigned long flags; int retval; spin_lock_irqsave(&rtc_lock, flags); - retval = set_wallclock(nowtime); + retval = x86_platform.set_wallclock(now.tv_sec); spin_unlock_irqrestore(&rtc_lock, flags); return retval; } /* not static: needed by APM */ -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { unsigned long retval, flags; spin_lock_irqsave(&rtc_lock, flags); - retval = get_wallclock(); + retval = x86_platform.get_wallclock(); spin_unlock_irqrestore(&rtc_lock, flags); - return retval; -} - -int update_persistent_clock(struct timespec now) -{ - return set_rtc_mmss(now.tv_sec); + ts->tv_sec = retval; + ts->tv_nsec = 0; } unsigned long long native_read_tsc(void) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 63f32d220ef..82e88cdda9b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -27,6 +27,7 @@ #include <linux/screen_info.h> #include <linux/ioport.h> #include <linux/acpi.h> +#include <linux/sfi.h> #include <linux/apm_bios.h> #include <linux/initrd.h> #include <linux/bootmem.h> @@ -66,6 +67,7 @@ #include <linux/percpu.h> #include <linux/crash_dump.h> +#include <linux/tboot.h> #include <video/edid.h> @@ -107,10 +109,7 @@ #ifdef CONFIG_X86_64 #include <asm/numa_64.h> #endif - -#ifndef ARCH_SETUP -#define ARCH_SETUP -#endif +#include <asm/mce.h> /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. @@ -133,9 +132,9 @@ int default_cpu_present_to_apicid(int mps_cpu) return __default_cpu_present_to_apicid(mps_cpu); } -int default_check_phys_apicid_present(int boot_cpu_physical_apicid) +int default_check_phys_apicid_present(int phys_apicid) { - return __default_check_phys_apicid_present(boot_cpu_physical_apicid); + return __default_check_phys_apicid_present(phys_apicid); } #endif @@ -171,13 +170,6 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 -static struct resource video_ram_resource = { - .name = "Video RAM area", - .start = 0xa0000, - .end = 0xbffff, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - /* cpu data as detected by the assembly code in head.S */ struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1}; /* common cpu data for all cpus */ @@ -256,7 +248,7 @@ EXPORT_SYMBOL(edd); * from boot_params into a safe place. * */ -static inline void copy_edd(void) +static inline void __init copy_edd(void) { memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, sizeof(edd.mbr_signature)); @@ -265,7 +257,7 @@ static inline void copy_edd(void) edd.edd_info_nr = boot_params.eddbuf_entries; } #else -static inline void copy_edd(void) +static inline void __init copy_edd(void) { } #endif @@ -605,7 +597,7 @@ static struct resource standard_io_resources[] = { .flags = IORESOURCE_BUSY | IORESOURCE_IO } }; -static void __init reserve_standard_io_resources(void) +void __init reserve_standard_io_resources(void) { int i; @@ -637,10 +629,6 @@ static int __init setup_elfcorehdr(char *arg) early_param("elfcorehdr", setup_elfcorehdr); #endif -static struct x86_quirks default_x86_quirks __initdata; - -struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; - #ifdef CONFIG_X86_RESERVE_LOW_64K static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { @@ -673,6 +661,13 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { }, }, { + .callback = dmi_low_memory_corruption, + .ident = "Phoenix/MSC BIOS", + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), + }, + }, + { /* * AMI BIOS with low memory corruption was found on Intel DG45ID board. * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will @@ -757,7 +752,7 @@ void __init setup_arch(char **cmdline_p) } #endif - ARCH_SETUP + x86_init.oem.arch_setup(); setup_memory_map(); parse_setup_data(); @@ -796,6 +791,16 @@ void __init setup_arch(char **cmdline_p) strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; +#ifdef CONFIG_X86_64 + /* + * Must call this twice: Once just to detect whether hardware doesn't + * support NX (so that the early EHCI debug console setup can safely + * call set_fixmap(), and then again after parsing early parameters to + * honor the respective command line option. + */ + check_efer(); +#endif + parse_early_param(); #ifdef CONFIG_X86_64 @@ -833,11 +838,9 @@ void __init setup_arch(char **cmdline_p) * VMware detection requires dmi to be available, so this * needs to be done after dmi_scan_machine, for the BP. */ - init_hypervisor(&boot_cpu_data); + init_hypervisor_platform(); -#ifdef CONFIG_X86_32 - probe_roms(); -#endif + x86_init.resources.probe_roms(); /* after parse_early_param, so could debug it */ insert_resource(&iomem_resource, &code_resource); @@ -972,10 +975,11 @@ void __init setup_arch(char **cmdline_p) kvmclock_init(); #endif - paravirt_pagetable_setup_start(swapper_pg_dir); + x86_init.paging.pagetable_setup_start(swapper_pg_dir); paging_init(); - paravirt_pagetable_setup_done(swapper_pg_dir); - paravirt_post_allocator_init(); + x86_init.paging.pagetable_setup_done(swapper_pg_dir); + + tboot_probe(); #ifdef CONFIG_X86_64 map_vsyscall(); @@ -990,13 +994,13 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_init(); -#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) + sfi_init(); + /* * get boot-time SMP configuration: */ if (smp_found_config) get_smp_config(); -#endif prefill_possible_map(); @@ -1015,10 +1019,7 @@ void __init setup_arch(char **cmdline_p) e820_reserve_resources(); e820_mark_nosave_regions(max_low_pfn); -#ifdef CONFIG_X86_32 - request_resource(&iomem_resource, &video_ram_resource); -#endif - reserve_standard_io_resources(); + x86_init.resources.reserve_resources(); e820_setup_gap(); @@ -1030,78 +1031,24 @@ void __init setup_arch(char **cmdline_p) conswitchp = &dummy_con; #endif #endif -} + x86_init.oem.banner(); -#ifdef CONFIG_X86_32 - -/** - * x86_quirk_intr_init - post gate setup interrupt initialisation - * - * Description: - * Fill in any interrupts that may have been left out by the general - * init_IRQ() routine. interrupts having to do with the machine rather - * than the devices on the I/O bus (like APIC interrupts in intel MP - * systems) are started here. - **/ -void __init x86_quirk_intr_init(void) -{ - if (x86_quirks->arch_intr_init) { - if (x86_quirks->arch_intr_init()) - return; - } + mcheck_init(); } -/** - * x86_quirk_trap_init - initialise system specific traps - * - * Description: - * Called as the final act of trap_init(). Used in VISWS to initialise - * the various board specific APIC traps. - **/ -void __init x86_quirk_trap_init(void) -{ - if (x86_quirks->arch_trap_init) { - if (x86_quirks->arch_trap_init()) - return; - } -} +#ifdef CONFIG_X86_32 -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, - .name = "timer" +static struct resource video_ram_resource = { + .name = "Video RAM area", + .start = 0xa0000, + .end = 0xbffff, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM }; -/** - * x86_quirk_pre_time_init - do any specific initialisations before. - * - **/ -void __init x86_quirk_pre_time_init(void) +void __init i386_reserve_resources(void) { - if (x86_quirks->arch_pre_time_init) - x86_quirks->arch_pre_time_init(); + request_resource(&iomem_resource, &video_ram_resource); + reserve_standard_io_resources(); } -/** - * x86_quirk_time_init - do any specific initialisations for the system timer. - * - * Description: - * Must plug the system timer interrupt source at HZ into the IRQ listed - * in irq_vectors.h:TIMER_IRQ - **/ -void __init x86_quirk_time_init(void) -{ - if (x86_quirks->arch_time_init) { - /* - * A nonzero return code does not mean failure, it means - * that the architecture quirk does not want any - * generic (timer) setup to be performed after this: - */ - if (x86_quirks->arch_time_init()) - return; - } - - irq0.mask = cpumask_of_cpu(0); - setup_irq(0, &irq0); -} #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 07d81916f21..d559af913e1 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset); #define PERCPU_FIRST_CHUNK_RESERVE 0 #endif +#ifdef CONFIG_X86_32 /** * pcpu_need_numa - determine percpu allocation needs to consider NUMA * @@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void) #endif return false; } +#endif /** * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu @@ -124,308 +126,35 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, } /* - * Large page remap allocator - * - * This allocator uses PMD page as unit. A PMD page is allocated for - * each cpu and each is remapped into vmalloc area using PMD mapping. - * As PMD page is quite large, only part of it is used for the first - * chunk. Unused part is returned to the bootmem allocator. - * - * So, the PMD pages are mapped twice - once to the physical mapping - * and to the vmalloc area for the first percpu chunk. The double - * mapping does add one more PMD TLB entry pressure but still is much - * better than only using 4k mappings while still being NUMA friendly. + * Helpers for first chunk memory allocation */ -#ifdef CONFIG_NEED_MULTIPLE_NODES -struct pcpul_ent { - unsigned int cpu; - void *ptr; -}; - -static size_t pcpul_size; -static struct pcpul_ent *pcpul_map; -static struct vm_struct pcpul_vm; - -static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) { - size_t off = (size_t)pageno << PAGE_SHIFT; - - if (off >= pcpul_size) - return NULL; - - return virt_to_page(pcpul_map[cpu].ptr + off); + return pcpu_alloc_bootmem(cpu, size, align); } -static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) +static void __init pcpu_fc_free(void *ptr, size_t size) { - size_t map_size, dyn_size; - unsigned int cpu; - int i, j; - ssize_t ret; - - if (!chosen) { - size_t vm_size = VMALLOC_END - VMALLOC_START; - size_t tot_size = nr_cpu_ids * PMD_SIZE; - - /* on non-NUMA, embedding is better */ - if (!pcpu_need_numa()) - return -EINVAL; - - /* don't consume more than 20% of vmalloc area */ - if (tot_size > vm_size / 5) { - pr_info("PERCPU: too large chunk size %zuMB for " - "large page remap\n", tot_size >> 20); - return -EINVAL; - } - } - - /* need PSE */ - if (!cpu_has_pse) { - pr_warning("PERCPU: lpage allocator requires PSE\n"); - return -EINVAL; - } - - /* - * Currently supports only single page. Supporting multiple - * pages won't be too difficult if it ever becomes necessary. - */ - pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + - PERCPU_DYNAMIC_RESERVE); - if (pcpul_size > PMD_SIZE) { - pr_warning("PERCPU: static data is larger than large page, " - "can't use large page\n"); - return -EINVAL; - } - dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; - - /* allocate pointer array and alloc large pages */ - map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0])); - pcpul_map = alloc_bootmem(map_size); - - for_each_possible_cpu(cpu) { - pcpul_map[cpu].cpu = cpu; - pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, - PMD_SIZE); - if (!pcpul_map[cpu].ptr) { - pr_warning("PERCPU: failed to allocate large page " - "for cpu%u\n", cpu); - goto enomem; - } - - /* - * Only use pcpul_size bytes and give back the rest. - * - * Ingo: The 2MB up-rounding bootmem is needed to make - * sure the partial 2MB page is still fully RAM - it's - * not well-specified to have a PAT-incompatible area - * (unmapped RAM, device memory, etc.) in that hole. - */ - free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size), - PMD_SIZE - pcpul_size); - - memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size); - } - - /* allocate address and map */ - pcpul_vm.flags = VM_ALLOC; - pcpul_vm.size = nr_cpu_ids * PMD_SIZE; - vm_area_register_early(&pcpul_vm, PMD_SIZE); - - for_each_possible_cpu(cpu) { - pmd_t *pmd, pmd_v; - - pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr + - cpu * PMD_SIZE); - pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)), - PAGE_KERNEL_LARGE); - set_pmd(pmd, pmd_v); - } - - /* we're ready, commit */ - pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", pcpul_vm.addr, static_size); - - ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - PMD_SIZE, pcpul_vm.addr, NULL); - - /* sort pcpul_map array for pcpu_lpage_remapped() */ - for (i = 0; i < nr_cpu_ids - 1; i++) - for (j = i + 1; j < nr_cpu_ids; j++) - if (pcpul_map[i].ptr > pcpul_map[j].ptr) { - struct pcpul_ent tmp = pcpul_map[i]; - pcpul_map[i] = pcpul_map[j]; - pcpul_map[j] = tmp; - } - - return ret; - -enomem: - for_each_possible_cpu(cpu) - if (pcpul_map[cpu].ptr) - free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); - free_bootmem(__pa(pcpul_map), map_size); - return -ENOMEM; + free_bootmem(__pa(ptr), size); } -/** - * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area - * @kaddr: the kernel address in question - * - * Determine whether @kaddr falls in the pcpul recycled area. This is - * used by pageattr to detect VM aliases and break up the pcpu PMD - * mapping such that the same physical page is not mapped under - * different attributes. - * - * The recycled area is always at the tail of a partially used PMD - * page. - * - * RETURNS: - * Address of corresponding remapped pcpu address if match is found; - * otherwise, NULL. - */ -void *pcpu_lpage_remapped(void *kaddr) +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { - void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); - unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; - int left = 0, right = nr_cpu_ids - 1; - int pos; - - /* pcpul in use at all? */ - if (!pcpul_map) - return NULL; - - /* okay, perform binary search */ - while (left <= right) { - pos = (left + right) / 2; - - if (pcpul_map[pos].ptr < pmd_addr) - left = pos + 1; - else if (pcpul_map[pos].ptr > pmd_addr) - right = pos - 1; - else { - /* it shouldn't be in the area for the first chunk */ - WARN_ON(offset < pcpul_size); - - return pcpul_vm.addr + - pcpul_map[pos].cpu * PMD_SIZE + offset; - } - } - - return NULL; -} +#ifdef CONFIG_NEED_MULTIPLE_NODES + if (early_cpu_to_node(from) == early_cpu_to_node(to)) + return LOCAL_DISTANCE; + else + return REMOTE_DISTANCE; #else -static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) -{ - return -EINVAL; -} + return LOCAL_DISTANCE; #endif - -/* - * Embedding allocator - * - * The first chunk is sized to just contain the static area plus - * module and dynamic reserves and embedded into linear physical - * mapping so that it can use PMD mapping without additional TLB - * pressure. - */ -static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) -{ - size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; - - /* - * If large page isn't supported, there's no benefit in doing - * this. Also, embedding allocation doesn't play well with - * NUMA. - */ - if (!chosen && (!cpu_has_pse || pcpu_need_numa())) - return -EINVAL; - - return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE, -1); } -/* - * 4k page allocator - * - * This is the basic allocator. Static percpu area is allocated - * page-by-page and most of initialization is done by the generic - * setup function. - */ -static struct page **pcpu4k_pages __initdata; -static int pcpu4k_nr_static_pages __initdata; - -static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) -{ - if (pageno < pcpu4k_nr_static_pages) - return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; - return NULL; -} - -static void __init pcpu4k_populate_pte(unsigned long addr) +static void __init pcpup_populate_pte(unsigned long addr) { populate_extra_pte(addr); } -static ssize_t __init setup_pcpu_4k(size_t static_size) -{ - size_t pages_size; - unsigned int cpu; - int i, j; - ssize_t ret; - - pcpu4k_nr_static_pages = PFN_UP(static_size); - - /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids - * sizeof(pcpu4k_pages[0])); - pcpu4k_pages = alloc_bootmem(pages_size); - - /* allocate and copy */ - j = 0; - for_each_possible_cpu(cpu) - for (i = 0; i < pcpu4k_nr_static_pages; i++) { - void *ptr; - - ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); - if (!ptr) { - pr_warning("PERCPU: failed to allocate " - "4k page for cpu%u\n", cpu); - goto enomem; - } - - memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); - pcpu4k_pages[j++] = virt_to_page(ptr); - } - - /* we're ready, commit */ - pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", - pcpu4k_nr_static_pages, static_size); - - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, -1, - -1, NULL, pcpu4k_populate_pte); - goto out_free_ar; - -enomem: - while (--j >= 0) - free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); - ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpu4k_pages), pages_size); - return ret; -} - -/* for explicit first chunk allocator selection */ -static char pcpu_chosen_alloc[16] __initdata; - -static int __init percpu_alloc_setup(char *str) -{ - strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1); - return 0; -} -early_param("percpu_alloc", percpu_alloc_setup); - static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -441,52 +170,49 @@ static inline void setup_percpu_segment(int cpu) void __init setup_per_cpu_areas(void) { - size_t static_size = __per_cpu_end - __per_cpu_start; unsigned int cpu; unsigned long delta; - size_t pcpu_unit_size; - ssize_t ret; + int rc; pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); /* - * Allocate percpu area. If PSE is supported, try to make use - * of large page mappings. Please read comments on top of - * each allocator for details. + * Allocate percpu area. Embedding allocator is our favorite; + * however, on NUMA configurations, it can result in very + * sparse unit mapping and vmalloc area isn't spacious enough + * on 32bit. Use page in that case. */ - ret = -EINVAL; - if (strlen(pcpu_chosen_alloc)) { - if (strcmp(pcpu_chosen_alloc, "4k")) { - if (!strcmp(pcpu_chosen_alloc, "lpage")) - ret = setup_pcpu_lpage(static_size, true); - else if (!strcmp(pcpu_chosen_alloc, "embed")) - ret = setup_pcpu_embed(static_size, true); - else - pr_warning("PERCPU: unknown allocator %s " - "specified\n", pcpu_chosen_alloc); - if (ret < 0) - pr_warning("PERCPU: %s allocator failed (%zd), " - "falling back to 4k\n", - pcpu_chosen_alloc, ret); - } - } else { - ret = setup_pcpu_lpage(static_size, false); - if (ret < 0) - ret = setup_pcpu_embed(static_size, false); +#ifdef CONFIG_X86_32 + if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa()) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif + rc = -EINVAL; + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE; + const size_t dyn_size = PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE; + + rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + dyn_size, atom_size, + pcpu_cpu_distance, + pcpu_fc_alloc, pcpu_fc_free); + if (rc < 0) + pr_warning("PERCPU: %s allocator failed (%d), " + "falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); } - if (ret < 0) - ret = setup_pcpu_4k(static_size); - if (ret < 0) - panic("cannot allocate static percpu area (%zu bytes, err=%zd)", - static_size, ret); - - pcpu_unit_size = ret; + if (rc < 0) + rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpup_populate_pte); + if (rc < 0) + panic("cannot initialize percpu area (err=%d)", rc); /* alrighty, percpu areas up and running */ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { - per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; + per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c new file mode 100644 index 00000000000..34e09938265 --- /dev/null +++ b/arch/x86/kernel/sfi.c @@ -0,0 +1,122 @@ +/* + * sfi.c - x86 architecture SFI support. + * + * Copyright (c) 2009, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#define KMSG_COMPONENT "SFI" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/acpi.h> +#include <linux/init.h> +#include <linux/sfi.h> +#include <linux/io.h> + +#include <asm/io_apic.h> +#include <asm/mpspec.h> +#include <asm/setup.h> +#include <asm/apic.h> + +#ifdef CONFIG_X86_LOCAL_APIC +static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; + +void __init mp_sfi_register_lapic_address(unsigned long address) +{ + mp_lapic_addr = address; + + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); + if (boot_cpu_physical_apicid == -1U) + boot_cpu_physical_apicid = read_apic_id(); + + pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid); +} + +/* All CPUs enumerated by SFI must be present and enabled */ +void __cpuinit mp_sfi_register_lapic(u8 id) +{ + if (MAX_APICS - id <= 0) { + pr_warning("Processor #%d invalid (max %d)\n", + id, MAX_APICS); + return; + } + + pr_info("registering lapic[%d]\n", id); + + generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR))); +} + +static int __init sfi_parse_cpus(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_cpu_table_entry *pentry; + int i; + int cpu_num; + + sb = (struct sfi_table_simple *)table; + cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry); + pentry = (struct sfi_cpu_table_entry *)sb->pentry; + + for (i = 0; i < cpu_num; i++) { + mp_sfi_register_lapic(pentry->apic_id); + pentry++; + } + + smp_found_config = 1; + return 0; +} +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC +static u32 gsi_base; + +static int __init sfi_parse_ioapic(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_apic_table_entry *pentry; + int i, num; + + sb = (struct sfi_table_simple *)table; + num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry); + pentry = (struct sfi_apic_table_entry *)sb->pentry; + + for (i = 0; i < num; i++) { + mp_register_ioapic(i, pentry->phys_addr, gsi_base); + gsi_base += io_apic_get_redir_entries(i); + pentry++; + } + + WARN(pic_mode, KERN_WARNING + "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n"); + pic_mode = 0; + return 0; +} +#endif /* CONFIG_X86_IO_APIC */ + +/* + * sfi_platform_init(): register lapics & io-apics + */ +int __init sfi_platform_init(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + mp_sfi_register_lapic_address(sfi_lapic_addr); + sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus); +#endif +#ifdef CONFIG_X86_IO_APIC + sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic); +#endif + return 0; +} diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 81e58238c4c..fbf3b07c856 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -799,15 +799,6 @@ static void do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - /* - * Re-enable any watchpoints before delivering the - * signal to user space. The processor register will - * have been cleared if the watchpoint triggered - * inside the kernel. - */ - if (current->thread.debugreg7) - set_debugreg(current->thread.debugreg7, 7); - /* Whee! Actually deliver the signal. */ if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { /* @@ -856,7 +847,7 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) mce_notify_process(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c36cc1452cd..324f2a44c22 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -47,6 +47,7 @@ #include <linux/bootmem.h> #include <linux/err.h> #include <linux/nmi.h> +#include <linux/tboot.h> #include <asm/acpi.h> #include <asm/desc.h> @@ -323,7 +324,7 @@ notrace static void __cpuinit start_secondary(void *unused) /* enable local interrupts */ local_irq_enable(); - setup_secondary_clock(); + x86_cpuinit.setup_percpu_clockev(); wmb(); cpu_idle(); @@ -1058,12 +1059,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) #endif current_thread_info()->cpu = 0; /* needed? */ for_each_possible_cpu(i) { - alloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); - alloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); - alloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); - cpumask_clear(per_cpu(cpu_core_map, i)); - cpumask_clear(per_cpu(cpu_sibling_map, i)); - cpumask_clear(cpu_data(i).llc_shared_map); + zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); + zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); } set_cpu_sibling_map(0); @@ -1113,13 +1111,26 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) printk(KERN_INFO "CPU%d: ", 0); print_cpu_info(&cpu_data(0)); - setup_boot_clock(); + x86_init.timers.setup_percpu_clockev(); if (is_uv_system()) uv_system_init(); + + set_mtrr_aps_delayed_init(); out: preempt_enable(); } + +void arch_enable_nonboot_cpus_begin(void) +{ + set_mtrr_aps_delayed_init(); +} + +void arch_enable_nonboot_cpus_end(void) +{ + mtrr_aps_init(); +} + /* * Early setup to make printk work. */ @@ -1141,6 +1152,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) setup_ioapic_dest(); #endif check_nmi_watchdog(); + mtrr_aps_init(); } static int __initdata setup_possible_cpus = -1; @@ -1238,16 +1250,7 @@ static void __ref remove_cpu_from_maps(int cpu) void cpu_disable_common(void) { int cpu = smp_processor_id(); - /* - * HACK: - * Allow any queued timer interrupts to get serviced - * This is only a temporary solution until we cleanup - * fixup_irqs as we do for IA64. - */ - local_irq_enable(); - mdelay(1); - local_irq_disable(); remove_siblinginfo(cpu); /* It's now safe to remove this processor from the online map */ @@ -1318,6 +1321,7 @@ void play_dead_common(void) void native_play_dead(void) { play_dead_common(); + tboot_shutdown(TB_SHUTDOWN_WFS); wbinvd_halt(); } diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index d51321ddafd..0157cd26d7c 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -335,4 +335,4 @@ ENTRY(sys_call_table) .long sys_preadv .long sys_pwritev .long sys_rt_tgsigqueueinfo /* 335 */ - .long sys_perf_counter_open + .long sys_perf_event_open diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c new file mode 100644 index 00000000000..86c9f91b48a --- /dev/null +++ b/arch/x86/kernel/tboot.c @@ -0,0 +1,447 @@ +/* + * tboot.c: main implementation of helper functions used by kernel for + * runtime support of Intel(R) Trusted Execution Technology + * + * Copyright (c) 2006-2009, Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include <linux/dma_remapping.h> +#include <linux/init_task.h> +#include <linux/spinlock.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/dmar.h> +#include <linux/cpu.h> +#include <linux/pfn.h> +#include <linux/mm.h> +#include <linux/tboot.h> + +#include <asm/trampoline.h> +#include <asm/processor.h> +#include <asm/bootparam.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/fixmap.h> +#include <asm/proto.h> +#include <asm/setup.h> +#include <asm/e820.h> +#include <asm/io.h> + +#include "acpi/realmode/wakeup.h" + +/* Global pointer to shared data; NULL means no measured launch. */ +struct tboot *tboot __read_mostly; + +/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ +#define AP_WAIT_TIMEOUT 1 + +#undef pr_fmt +#define pr_fmt(fmt) "tboot: " fmt + +static u8 tboot_uuid[16] __initdata = TBOOT_UUID; + +void __init tboot_probe(void) +{ + /* Look for valid page-aligned address for shared page. */ + if (!boot_params.tboot_addr) + return; + /* + * also verify that it is mapped as we expect it before calling + * set_fixmap(), to reduce chance of garbage value causing crash + */ + if (!e820_any_mapped(boot_params.tboot_addr, + boot_params.tboot_addr, E820_RESERVED)) { + pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n"); + return; + } + + /* only a natively booted kernel should be using TXT */ + if (paravirt_enabled()) { + pr_warning("non-0 tboot_addr but pv_ops is enabled\n"); + return; + } + + /* Map and check for tboot UUID. */ + set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); + tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); + if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { + pr_warning("tboot at 0x%llx is invalid\n", + boot_params.tboot_addr); + tboot = NULL; + return; + } + if (tboot->version < 5) { + pr_warning("tboot version is invalid: %u\n", tboot->version); + tboot = NULL; + return; + } + + pr_info("found shared page at phys addr 0x%llx:\n", + boot_params.tboot_addr); + pr_debug("version: %d\n", tboot->version); + pr_debug("log_addr: 0x%08x\n", tboot->log_addr); + pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry); + pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base); + pr_debug("tboot_size: 0x%x\n", tboot->tboot_size); +} + +static pgd_t *tboot_pg_dir; +static struct mm_struct tboot_mm = { + .mm_rb = RB_ROOT, + .pgd = swapper_pg_dir, + .mm_users = ATOMIC_INIT(2), + .mm_count = ATOMIC_INIT(1), + .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), + .cpu_vm_mask = CPU_MASK_ALL, +}; + +static inline void switch_to_tboot_pt(void) +{ + write_cr3(virt_to_phys(tboot_pg_dir)); +} + +static int map_tboot_page(unsigned long vaddr, unsigned long pfn, + pgprot_t prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset(&tboot_mm, vaddr); + pud = pud_alloc(&tboot_mm, pgd, vaddr); + if (!pud) + return -1; + pmd = pmd_alloc(&tboot_mm, pud, vaddr); + if (!pmd) + return -1; + pte = pte_alloc_map(&tboot_mm, pmd, vaddr); + if (!pte) + return -1; + set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); + pte_unmap(pte); + return 0; +} + +static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn, + unsigned long nr) +{ + /* Reuse the original kernel mapping */ + tboot_pg_dir = pgd_alloc(&tboot_mm); + if (!tboot_pg_dir) + return -1; + + for (; nr > 0; nr--, vaddr += PAGE_SIZE, start_pfn++) { + if (map_tboot_page(vaddr, start_pfn, PAGE_KERNEL_EXEC)) + return -1; + } + + return 0; +} + +static void tboot_create_trampoline(void) +{ + u32 map_base, map_size; + + /* Create identity map for tboot shutdown code. */ + map_base = PFN_DOWN(tboot->tboot_base); + map_size = PFN_UP(tboot->tboot_size); + if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size)) + panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n", + map_base, map_size); +} + +#ifdef CONFIG_ACPI_SLEEP + +static void add_mac_region(phys_addr_t start, unsigned long size) +{ + struct tboot_mac_region *mr; + phys_addr_t end = start + size; + + if (start && size) { + mr = &tboot->mac_regions[tboot->num_mac_regions++]; + mr->start = round_down(start, PAGE_SIZE); + mr->size = round_up(end, PAGE_SIZE) - mr->start; + } +} + +static int tboot_setup_sleep(void) +{ + tboot->num_mac_regions = 0; + + /* S3 resume code */ + add_mac_region(acpi_wakeup_address, WAKEUP_SIZE); + +#ifdef CONFIG_X86_TRAMPOLINE + /* AP trampoline code */ + add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE); +#endif + + /* kernel code + data + bss */ + add_mac_region(virt_to_phys(_text), _end - _text); + + tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; + + return 0; +} + +#else /* no CONFIG_ACPI_SLEEP */ + +static int tboot_setup_sleep(void) +{ + /* S3 shutdown requested, but S3 not supported by the kernel... */ + BUG(); + return -1; +} + +#endif + +void tboot_shutdown(u32 shutdown_type) +{ + void (*shutdown)(void); + + if (!tboot_enabled()) + return; + + /* + * if we're being called before the 1:1 mapping is set up then just + * return and let the normal shutdown happen; this should only be + * due to very early panic() + */ + if (!tboot_pg_dir) + return; + + /* if this is S3 then set regions to MAC */ + if (shutdown_type == TB_SHUTDOWN_S3) + if (tboot_setup_sleep()) + return; + + tboot->shutdown_type = shutdown_type; + + switch_to_tboot_pt(); + + shutdown = (void(*)(void))(unsigned long)tboot->shutdown_entry; + shutdown(); + + /* should not reach here */ + while (1) + halt(); +} + +static void tboot_copy_fadt(const struct acpi_table_fadt *fadt) +{ +#define TB_COPY_GAS(tbg, g) \ + tbg.space_id = g.space_id; \ + tbg.bit_width = g.bit_width; \ + tbg.bit_offset = g.bit_offset; \ + tbg.access_width = g.access_width; \ + tbg.address = g.address; + + TB_COPY_GAS(tboot->acpi_sinfo.pm1a_cnt_blk, fadt->xpm1a_control_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1b_cnt_blk, fadt->xpm1b_control_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1a_evt_blk, fadt->xpm1a_event_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1b_evt_blk, fadt->xpm1b_event_block); + + /* + * We need phys addr of waking vector, but can't use virt_to_phys() on + * &acpi_gbl_FACS because it is ioremap'ed, so calc from FACS phys + * addr. + */ + tboot->acpi_sinfo.wakeup_vector = fadt->facs + + offsetof(struct acpi_table_facs, firmware_waking_vector); +} + +void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) +{ + static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = { + /* S0,1,2: */ -1, -1, -1, + /* S3: */ TB_SHUTDOWN_S3, + /* S4: */ TB_SHUTDOWN_S4, + /* S5: */ TB_SHUTDOWN_S5 }; + + if (!tboot_enabled()) + return; + + tboot_copy_fadt(&acpi_gbl_FADT); + tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control; + tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control; + /* we always use the 32b wakeup vector */ + tboot->acpi_sinfo.vector_width = 32; + + if (sleep_state >= ACPI_S_STATE_COUNT || + acpi_shutdown_map[sleep_state] == -1) { + pr_warning("unsupported sleep state 0x%x\n", sleep_state); + return; + } + + tboot_shutdown(acpi_shutdown_map[sleep_state]); +} + +static atomic_t ap_wfs_count; + +static int tboot_wait_for_aps(int num_aps) +{ + unsigned long timeout; + + timeout = AP_WAIT_TIMEOUT*HZ; + while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps && + timeout) { + mdelay(1); + timeout--; + } + + if (timeout) + pr_warning("tboot wait for APs timeout\n"); + + return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps); +} + +static int __cpuinit tboot_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action) { + case CPU_DYING: + atomic_inc(&ap_wfs_count); + if (num_online_cpus() == 1) + if (tboot_wait_for_aps(atomic_read(&ap_wfs_count))) + return NOTIFY_BAD; + break; + } + return NOTIFY_OK; +} + +static struct notifier_block tboot_cpu_notifier __cpuinitdata = +{ + .notifier_call = tboot_cpu_callback, +}; + +static __init int tboot_late_init(void) +{ + if (!tboot_enabled()) + return 0; + + tboot_create_trampoline(); + + atomic_set(&ap_wfs_count, 0); + register_hotcpu_notifier(&tboot_cpu_notifier); + return 0; +} + +late_initcall(tboot_late_init); + +/* + * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE) + */ + +#define TXT_PUB_CONFIG_REGS_BASE 0xfed30000 +#define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000 + +/* # pages for each config regs space - used by fixmap */ +#define NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \ + TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT) + +/* offsets from pub/priv config space */ +#define TXTCR_HEAP_BASE 0x0300 +#define TXTCR_HEAP_SIZE 0x0308 + +#define SHA1_SIZE 20 + +struct sha1_hash { + u8 hash[SHA1_SIZE]; +}; + +struct sinit_mle_data { + u32 version; /* currently 6 */ + struct sha1_hash bios_acm_id; + u32 edx_senter_flags; + u64 mseg_valid; + struct sha1_hash sinit_hash; + struct sha1_hash mle_hash; + struct sha1_hash stm_hash; + struct sha1_hash lcp_policy_hash; + u32 lcp_policy_control; + u32 rlp_wakeup_addr; + u32 reserved; + u32 num_mdrs; + u32 mdrs_off; + u32 num_vtd_dmars; + u32 vtd_dmars_off; +} __packed; + +struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tbl) +{ + void *heap_base, *heap_ptr, *config; + + if (!tboot_enabled()) + return dmar_tbl; + + /* + * ACPI tables may not be DMA protected by tboot, so use DMAR copy + * SINIT saved in SinitMleData in TXT heap (which is DMA protected) + */ + + /* map config space in order to get heap addr */ + config = ioremap(TXT_PUB_CONFIG_REGS_BASE, NR_TXT_CONFIG_PAGES * + PAGE_SIZE); + if (!config) + return NULL; + + /* now map TXT heap */ + heap_base = ioremap(*(u64 *)(config + TXTCR_HEAP_BASE), + *(u64 *)(config + TXTCR_HEAP_SIZE)); + iounmap(config); + if (!heap_base) + return NULL; + + /* walk heap to SinitMleData */ + /* skip BiosData */ + heap_ptr = heap_base + *(u64 *)heap_base; + /* skip OsMleData */ + heap_ptr += *(u64 *)heap_ptr; + /* skip OsSinitData */ + heap_ptr += *(u64 *)heap_ptr; + /* now points to SinitMleDataSize; set to SinitMleData */ + heap_ptr += sizeof(u64); + /* get addr of DMAR table */ + dmar_tbl = (struct acpi_table_header *)(heap_ptr + + ((struct sinit_mle_data *)heap_ptr)->vtd_dmars_off - + sizeof(u64)); + + /* don't unmap heap because dmar.c needs access to this */ + + return dmar_tbl; +} + +int tboot_force_iommu(void) +{ + if (!tboot_enabled()) + return 0; + + if (no_iommu || swiotlb || dmar_disabled) + pr_warning("Forcing Intel-IOMMU to enabled\n"); + + dmar_disabled = 0; +#ifdef CONFIG_SWIOTLB + swiotlb = 0; +#endif + no_iommu = 0; + + return 1; +} diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c new file mode 100644 index 00000000000..be2573448ed --- /dev/null +++ b/arch/x86/kernel/time.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 1991,1992,1995 Linus Torvalds + * Copyright (c) 1994 Alan Modra + * Copyright (c) 1995 Markus Kuhn + * Copyright (c) 1996 Ingo Molnar + * Copyright (c) 1998 Andrea Arcangeli + * Copyright (c) 2002,2006 Vojtech Pavlik + * Copyright (c) 2003 Andi Kleen + * + */ + +#include <linux/clockchips.h> +#include <linux/interrupt.h> +#include <linux/time.h> +#include <linux/mca.h> + +#include <asm/vsyscall.h> +#include <asm/x86_init.h> +#include <asm/i8259.h> +#include <asm/i8253.h> +#include <asm/timer.h> +#include <asm/hpet.h> +#include <asm/time.h> + +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) +int timer_ack; +#endif + +#ifdef CONFIG_X86_64 +volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; +#endif + +unsigned long profile_pc(struct pt_regs *regs) +{ + unsigned long pc = instruction_pointer(regs); + + if (!user_mode_vm(regs) && in_lock_functions(pc)) { +#ifdef CONFIG_FRAME_POINTER + return *(unsigned long *)(regs->bp + sizeof(long)); +#else + unsigned long *sp = + (unsigned long *)kernel_stack_pointer(regs); + /* + * Return address is either directly at stack pointer + * or above a saved flags. Eflags has bits 22-31 zero, + * kernel addresses don't. + */ + if (sp[0] >> 22) + return sp[0]; + if (sp[1] >> 22) + return sp[1]; +#endif + } + return pc; +} +EXPORT_SYMBOL(profile_pc); + +/* + * Default timer interrupt handler for PIT/HPET + */ +static irqreturn_t timer_interrupt(int irq, void *dev_id) +{ + /* Keep nmi watchdog up to date */ + inc_irq_stat(irq0_irqs); + + /* Optimized out for !IO_APIC and x86_64 */ + if (timer_ack) { + /* + * Subtle, when I/O APICs are used we have to ack timer IRQ + * manually to deassert NMI lines for the watchdog if run + * on an 82489DX-based system. + */ + spin_lock(&i8259A_lock); + outb(0x0c, PIC_MASTER_OCW3); + /* Ack the IRQ; AEOI will end it automatically. */ + inb(PIC_MASTER_POLL); + spin_unlock(&i8259A_lock); + } + + global_clock_event->event_handler(global_clock_event); + + /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ + if (MCA_bus) + outb_p(inb_p(0x61)| 0x80, 0x61); + + return IRQ_HANDLED; +} + +static struct irqaction irq0 = { + .handler = timer_interrupt, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, + .name = "timer" +}; + +void __init setup_default_timer_irq(void) +{ + setup_irq(0, &irq0); +} + +/* Default timer init function */ +void __init hpet_time_init(void) +{ + if (!hpet_enable()) + setup_pit_timer(); + setup_default_timer_irq(); +} + +static __init void x86_late_time_init(void) +{ + x86_init.timers.timer_init(); + tsc_init(); +} + +/* + * Initialize TSC and delay the periodic timer init to + * late x86_late_time_init() so ioremap works. + */ +void __init time_init(void) +{ + late_time_init = x86_late_time_init; +} diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c deleted file mode 100644 index 5c5d87f0b2e..00000000000 --- a/arch/x86/kernel/time_32.c +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (C) 1991, 1992, 1995 Linus Torvalds - * - * This file contains the PC-specific time handling details: - * reading the RTC at bootup, etc.. - * 1994-07-02 Alan Modra - * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime - * 1995-03-26 Markus Kuhn - * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887 - * precision CMOS clock update - * 1996-05-03 Ingo Molnar - * fixed time warps in do_[slow|fast]_gettimeoffset() - * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 - * "A Kernel Model for Precision Timekeeping" by Dave Mills - * 1998-09-05 (Various) - * More robust do_fast_gettimeoffset() algorithm implemented - * (works with APM, Cyrix 6x86MX and Centaur C6), - * monotonic gettimeofday() with fast_get_timeoffset(), - * drift-proof precision TSC calibration on boot - * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D. - * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>; - * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>). - * 1998-12-16 Andrea Arcangeli - * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy - * because was not accounting lost_ticks. - * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli - * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to - * serialize accesses to xtime/lost_ticks). - */ - -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/time.h> -#include <linux/mca.h> - -#include <asm/setup.h> -#include <asm/hpet.h> -#include <asm/time.h> -#include <asm/timer.h> - -#include <asm/do_timer.h> - -int timer_ack; - -unsigned long profile_pc(struct pt_regs *regs) -{ - unsigned long pc = instruction_pointer(regs); - -#ifdef CONFIG_SMP - if (!user_mode_vm(regs) && in_lock_functions(pc)) { -#ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->bp + sizeof(long)); -#else - unsigned long *sp = (unsigned long *)®s->sp; - - /* Return address is either directly at stack pointer - or above a saved flags. Eflags has bits 22-31 zero, - kernel addresses don't. */ - if (sp[0] >> 22) - return sp[0]; - if (sp[1] >> 22) - return sp[1]; -#endif - } -#endif - return pc; -} -EXPORT_SYMBOL(profile_pc); - -/* - * This is the same as the above, except we _also_ save the current - * Time Stamp Counter value at the time of the timer interrupt, so that - * we later on can estimate the time of day more exactly. - */ -irqreturn_t timer_interrupt(int irq, void *dev_id) -{ - /* Keep nmi watchdog up to date */ - inc_irq_stat(irq0_irqs); - -#ifdef CONFIG_X86_IO_APIC - if (timer_ack) { - /* - * Subtle, when I/O APICs are used we have to ack timer IRQ - * manually to deassert NMI lines for the watchdog if run - * on an 82489DX-based system. - */ - spin_lock(&i8259A_lock); - outb(0x0c, PIC_MASTER_OCW3); - /* Ack the IRQ; AEOI will end it automatically. */ - inb(PIC_MASTER_POLL); - spin_unlock(&i8259A_lock); - } -#endif - - do_timer_interrupt_hook(); - -#ifdef CONFIG_MCA - if (MCA_bus) { - /* The PS/2 uses level-triggered interrupts. You can't - turn them off, nor would you want to (any attempt to - enable edge-triggered interrupts usually gets intercepted by a - special hardware circuit). Hence we have to acknowledge - the timer interrupt. Through some incredibly stupid - design idea, the reset for IRQ 0 is done by setting the - high bit of the PPI port B (0x61). Note that some PS/2s, - notably the 55SX, work fine if this is removed. */ - - u8 irq_v = inb_p(0x61); /* read the current state */ - outb_p(irq_v | 0x80, 0x61); /* reset the IRQ */ - } -#endif - - return IRQ_HANDLED; -} - -/* Duplicate of time_init() below, with hpet_enable part added */ -void __init hpet_time_init(void) -{ - if (!hpet_enable()) - setup_pit_timer(); - x86_quirk_time_init(); -} - -/* - * This is called directly from init code; we must delay timer setup in the - * HPET case as we can't make the decision to turn on HPET this early in the - * boot process. - * - * The chosen time_init function will usually be hpet_time_init, above, but - * in the case of virtual hardware, an alternative function may be substituted. - */ -void __init time_init(void) -{ - x86_quirk_pre_time_init(); - tsc_init(); - late_time_init = choose_time_init(); -} diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c deleted file mode 100644 index 5ba343e6184..00000000000 --- a/arch/x86/kernel/time_64.c +++ /dev/null @@ -1,135 +0,0 @@ -/* - * "High Precision Event Timer" based timekeeping. - * - * Copyright (c) 1991,1992,1995 Linus Torvalds - * Copyright (c) 1994 Alan Modra - * Copyright (c) 1995 Markus Kuhn - * Copyright (c) 1996 Ingo Molnar - * Copyright (c) 1998 Andrea Arcangeli - * Copyright (c) 2002,2006 Vojtech Pavlik - * Copyright (c) 2003 Andi Kleen - * RTC support code taken from arch/i386/kernel/timers/time_hpet.c - */ - -#include <linux/clockchips.h> -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/module.h> -#include <linux/time.h> -#include <linux/mca.h> -#include <linux/nmi.h> - -#include <asm/i8253.h> -#include <asm/hpet.h> -#include <asm/vgtod.h> -#include <asm/time.h> -#include <asm/timer.h> - -volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; - -unsigned long profile_pc(struct pt_regs *regs) -{ - unsigned long pc = instruction_pointer(regs); - - /* Assume the lock function has either no stack frame or a copy - of flags from PUSHF - Eflags always has bits 22 and up cleared unlike kernel addresses. */ - if (!user_mode_vm(regs) && in_lock_functions(pc)) { -#ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->bp + sizeof(long)); -#else - unsigned long *sp = (unsigned long *)regs->sp; - if (sp[0] >> 22) - return sp[0]; - if (sp[1] >> 22) - return sp[1]; -#endif - } - return pc; -} -EXPORT_SYMBOL(profile_pc); - -static irqreturn_t timer_interrupt(int irq, void *dev_id) -{ - inc_irq_stat(irq0_irqs); - - global_clock_event->event_handler(global_clock_event); - -#ifdef CONFIG_MCA - if (MCA_bus) { - u8 irq_v = inb_p(0x61); /* read the current state */ - outb_p(irq_v|0x80, 0x61); /* reset the IRQ */ - } -#endif - - return IRQ_HANDLED; -} - -/* calibrate_cpu is used on systems with fixed rate TSCs to determine - * processor frequency */ -#define TICK_COUNT 100000000 -unsigned long __init calibrate_cpu(void) -{ - int tsc_start, tsc_now; - int i, no_ctr_free; - unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; - unsigned long flags; - - for (i = 0; i < 4; i++) - if (avail_to_resrv_perfctr_nmi_bit(i)) - break; - no_ctr_free = (i == 4); - if (no_ctr_free) { - WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... " - "cpu_khz value may be incorrect.\n"); - i = 3; - rdmsrl(MSR_K7_EVNTSEL3, evntsel3); - wrmsrl(MSR_K7_EVNTSEL3, 0); - rdmsrl(MSR_K7_PERFCTR3, pmc3); - } else { - reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); - reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - local_irq_save(flags); - /* start measuring cycles, incrementing from 0 */ - wrmsrl(MSR_K7_PERFCTR0 + i, 0); - wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); - rdtscl(tsc_start); - do { - rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); - tsc_now = get_cycles(); - } while ((tsc_now - tsc_start) < TICK_COUNT); - - local_irq_restore(flags); - if (no_ctr_free) { - wrmsrl(MSR_K7_EVNTSEL3, 0); - wrmsrl(MSR_K7_PERFCTR3, pmc3); - wrmsrl(MSR_K7_EVNTSEL3, evntsel3); - } else { - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - - return pmc_now * tsc_khz / (tsc_now - tsc_start); -} - -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | IRQF_TIMER, - .name = "timer" -}; - -void __init hpet_time_init(void) -{ - if (!hpet_enable()) - setup_pit_timer(); - - setup_irq(0, &irq0); -} - -void __init time_init(void) -{ - tsc_init(); - - late_time_init = choose_time_init(); -} diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 503c1f2e883..1740c85e24b 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -23,8 +23,6 @@ static struct bau_control **uv_bau_table_bases __read_mostly; static int uv_bau_retry_limit __read_mostly; -/* position of pnode (which is nasid>>1): */ -static int uv_nshift __read_mostly; /* base pnode in this partition */ static int uv_partition_base_pnode __read_mostly; @@ -723,7 +721,7 @@ uv_activation_descriptor_init(int node, int pnode) BUG_ON(!adp); pa = uv_gpa(adp); /* need the real nasid*/ - n = pa >> uv_nshift; + n = uv_gpa_to_pnode(pa); m = pa & uv_mmask; uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, @@ -778,7 +776,7 @@ uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp) * need the pnode of where the memory was really allocated */ pa = uv_gpa(pqp); - pn = pa >> uv_nshift; + pn = uv_gpa_to_pnode(pa); uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | @@ -843,8 +841,7 @@ static int __init uv_bau_init(void) GFP_KERNEL, cpu_to_node(cur_cpu)); uv_bau_retry_limit = 1; - uv_nshift = uv_hub_info->n_val; - uv_mmask = (1UL << uv_hub_info->n_val) - 1; + uv_mmask = (1UL << uv_hub_info->m_val) - 1; nblades = uv_num_possible_blades(); uv_bau_table_bases = (struct bau_control **) diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index 808031a5ba1..cd022121cab 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -3,8 +3,16 @@ #include <asm/trampoline.h> #include <asm/e820.h> +#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) +#define __trampinit +#define __trampinitdata +#else +#define __trampinit __cpuinit +#define __trampinitdata __cpuinitdata +#endif + /* ready for x86_64 and x86 */ -unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); +unsigned char *__trampinitdata trampoline_base = __va(TRAMPOLINE_BASE); void __init reserve_trampoline_memory(void) { @@ -26,7 +34,7 @@ void __init reserve_trampoline_memory(void) * bootstrap into the page concerned. The caller * has made sure it's suitably aligned. */ -unsigned long setup_trampoline(void) +unsigned long __trampinit setup_trampoline(void) { memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); return virt_to_phys(trampoline_base); diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S index 66d874e5404..8508237e8e4 100644 --- a/arch/x86/kernel/trampoline_32.S +++ b/arch/x86/kernel/trampoline_32.S @@ -28,16 +28,12 @@ */ #include <linux/linkage.h> +#include <linux/init.h> #include <asm/segment.h> #include <asm/page_types.h> /* We can free up trampoline after bootup if cpu hotplug is not supported. */ -#ifndef CONFIG_HOTPLUG_CPU -.section ".cpuinit.data","aw",@progbits -#else -.section .rodata,"a",@progbits -#endif - +__CPUINITRODATA .code16 ENTRY(trampoline_data) diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index cddfb8d386b..3af2dff58b2 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S @@ -25,14 +25,19 @@ */ #include <linux/linkage.h> +#include <linux/init.h> #include <asm/pgtable_types.h> #include <asm/page_types.h> #include <asm/msr.h> #include <asm/segment.h> #include <asm/processor-flags.h> +#ifdef CONFIG_ACPI_SLEEP .section .rodata, "a", @progbits - +#else +/* We can free up the trampoline after bootup if cpu hotplug is not supported. */ +__CPUINITRODATA +#endif .code16 ENTRY(trampoline_data) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 83264922a87..33399176512 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -14,7 +14,6 @@ #include <linux/spinlock.h> #include <linux/kprobes.h> #include <linux/uaccess.h> -#include <linux/utsname.h> #include <linux/kdebug.h> #include <linux/kernel.h> #include <linux/module.h> @@ -59,12 +58,12 @@ #include <asm/mach_traps.h> #ifdef CONFIG_X86_64 +#include <asm/x86_init.h> #include <asm/pgalloc.h> #include <asm/proto.h> #else #include <asm/processor-flags.h> #include <asm/setup.h> -#include <asm/traps.h> asmlinkage int system_call(void); @@ -73,11 +72,9 @@ char ignore_fpu_irq; /* * The IDT has to be page-aligned to simplify the Pentium - * F0 0F bug workaround.. We have a special link segment - * for this. + * F0 0F bug workaround. */ -gate_desc idt_table[NR_VECTORS] - __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; +gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, }; #endif DECLARE_BITMAP(used_vectors, NR_VECTORS); @@ -532,77 +529,56 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; - unsigned long condition; + unsigned long dr6; int si_code; - get_debugreg(condition, 6); + get_debugreg(dr6, 6); /* Catch kmemcheck conditions first of all! */ - if (condition & DR_STEP && kmemcheck_trap(regs)) + if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) return; + /* DR6 may or may not be cleared by the CPU */ + set_debugreg(0, 6); /* * The processor cleared BTF, so don't mark that we need it set. */ clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); tsk->thread.debugctlmsr = 0; - if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, - SIGTRAP) == NOTIFY_STOP) + /* Store the virtualized DR6 value */ + tsk->thread.debugreg6 = dr6; + + if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code, + SIGTRAP) == NOTIFY_STOP) return; /* It's safe to allow irq's after DR6 has been saved */ preempt_conditional_sti(regs); - /* Mask out spurious debug traps due to lazy DR7 setting */ - if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg7) - goto clear_dr7; + if (regs->flags & X86_VM_MASK) { + handle_vm86_trap((struct kernel_vm86_regs *) regs, + error_code, 1); + return; } -#ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) - goto debug_vm86; -#endif - - /* Save debug status register where ptrace can see it */ - tsk->thread.debugreg6 = condition; - /* - * Single-stepping through TF: make sure we ignore any events in - * kernel space (but re-enable TF when returning to user mode). + * Single-stepping through system calls: ignore any exceptions in + * kernel space, but re-enable TF when returning to user mode. + * + * We already checked v86 mode above, so we can check for kernel mode + * by just checking the CPL of CS. */ - if (condition & DR_STEP) { - if (!user_mode(regs)) - goto clear_TF_reenable; + if ((dr6 & DR_STEP) && !user_mode(regs)) { + tsk->thread.debugreg6 &= ~DR_STEP; + set_tsk_thread_flag(tsk, TIF_SINGLESTEP); + regs->flags &= ~X86_EFLAGS_TF; } - - si_code = get_si_code(condition); - /* Ok, finally something we can handle */ - send_sigtrap(tsk, regs, error_code, si_code); - - /* - * Disable additional traps. They'll be re-enabled when - * the signal is delivered. - */ -clear_dr7: - set_debugreg(0, 7); + si_code = get_si_code(tsk->thread.debugreg6); + if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS)) + send_sigtrap(tsk, regs, error_code, si_code); preempt_conditional_cli(regs); - return; -#ifdef CONFIG_X86_32 -debug_vm86: - /* reenable preemption: handle_vm86_trap() might sleep */ - dec_preempt_count(); - handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); - conditional_cli(regs); - return; -#endif - -clear_TF_reenable: - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->flags &= ~X86_EFLAGS_TF; - preempt_conditional_cli(regs); return; } @@ -972,7 +948,5 @@ void __init trap_init(void) */ cpu_init(); -#ifdef CONFIG_X86_32 - x86_quirk_trap_init(); -#endif + x86_init.irqs.trap_init(); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 71f4368b357..cd982f48e23 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -17,6 +17,8 @@ #include <asm/time.h> #include <asm/delay.h> #include <asm/hypervisor.h> +#include <asm/nmi.h> +#include <asm/x86_init.h> unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -400,15 +402,9 @@ unsigned long native_calibrate_tsc(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz; + unsigned long flags, latch, ms, fast_calibrate; int hpet = is_hpet_enabled(), i, loopmin; - hv_tsc_khz = get_hypervisor_tsc_freq(); - if (hv_tsc_khz) { - printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); - return hv_tsc_khz; - } - local_irq_save(flags); fast_calibrate = quick_pit_calibrate(); local_irq_restore(flags); @@ -566,7 +562,7 @@ int recalibrate_cpu_khz(void) unsigned long cpu_khz_old = cpu_khz; if (cpu_has_tsc) { - tsc_khz = calibrate_tsc(); + tsc_khz = x86_platform.calibrate_tsc(); cpu_khz = tsc_khz; cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, @@ -670,7 +666,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || (val == CPUFREQ_RESUMECHANGE)) { - *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); + *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) @@ -744,10 +740,16 @@ static cycle_t __vsyscall_fn vread_tsc(void) } #endif +static void resume_tsc(void) +{ + clocksource_tsc.cycle_last = 0; +} + static struct clocksource clocksource_tsc = { .name = "tsc", .rating = 300, .read = read_tsc, + .resume = resume_tsc, .mask = CLOCKSOURCE_MASK(64), .shift = 22, .flags = CLOCK_SOURCE_IS_CONTINUOUS | @@ -761,12 +763,14 @@ void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; - printk("Marking TSC unstable due to %s\n", reason); + printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) - clocksource_change_rating(&clocksource_tsc, 0); - else + clocksource_mark_unstable(&clocksource_tsc); + else { + clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE; clocksource_tsc.rating = 0; + } } } @@ -852,15 +856,71 @@ static void __init init_tsc_clocksource(void) clocksource_register(&clocksource_tsc); } +#ifdef CONFIG_X86_64 +/* + * calibrate_cpu is used on systems with fixed rate TSCs to determine + * processor frequency + */ +#define TICK_COUNT 100000000 +static unsigned long __init calibrate_cpu(void) +{ + int tsc_start, tsc_now; + int i, no_ctr_free; + unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; + unsigned long flags; + + for (i = 0; i < 4; i++) + if (avail_to_resrv_perfctr_nmi_bit(i)) + break; + no_ctr_free = (i == 4); + if (no_ctr_free) { + WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... " + "cpu_khz value may be incorrect.\n"); + i = 3; + rdmsrl(MSR_K7_EVNTSEL3, evntsel3); + wrmsrl(MSR_K7_EVNTSEL3, 0); + rdmsrl(MSR_K7_PERFCTR3, pmc3); + } else { + reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); + reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + local_irq_save(flags); + /* start measuring cycles, incrementing from 0 */ + wrmsrl(MSR_K7_PERFCTR0 + i, 0); + wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); + rdtscl(tsc_start); + do { + rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); + tsc_now = get_cycles(); + } while ((tsc_now - tsc_start) < TICK_COUNT); + + local_irq_restore(flags); + if (no_ctr_free) { + wrmsrl(MSR_K7_EVNTSEL3, 0); + wrmsrl(MSR_K7_PERFCTR3, pmc3); + wrmsrl(MSR_K7_EVNTSEL3, evntsel3); + } else { + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + + return pmc_now * tsc_khz / (tsc_now - tsc_start); +} +#else +static inline unsigned long calibrate_cpu(void) { return cpu_khz; } +#endif + void __init tsc_init(void) { u64 lpj; int cpu; + x86_init.timers.tsc_pre_init(); + if (!cpu_has_tsc) return; - tsc_khz = calibrate_tsc(); + tsc_khz = x86_platform.calibrate_tsc(); cpu_khz = tsc_khz; if (!tsc_khz) { @@ -868,11 +928,9 @@ void __init tsc_init(void) return; } -#ifdef CONFIG_X86_64 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) cpu_khz = calibrate_cpu(); -#endif printk("Detected %lu.%03lu MHz processor.\n", (unsigned long)cpu_khz / 1000, diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 027b5b49899..f37930954d1 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -114,7 +114,7 @@ void __cpuinit check_tsc_sync_source(int cpu) return; if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { - pr_info("Skipping synchronization checks as TSC is reliable.\n"); + printk_once(KERN_INFO "Skipping synchronization checks as TSC is reliable.\n"); return; } diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index aeef529917e..61d805df4c9 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -9,10 +9,25 @@ */ #include <linux/module.h> +#include <linux/rbtree.h> #include <linux/irq.h> #include <asm/apic.h> #include <asm/uv/uv_irq.h> +#include <asm/uv/uv_hub.h> + +/* MMR offset and pnode of hub sourcing interrupts for a given irq */ +struct uv_irq_2_mmr_pnode{ + struct rb_node list; + unsigned long offset; + int pnode; + int irq; +}; + +static spinlock_t uv_irq_lock; +static struct rb_root uv_irq_root; + +static int uv_set_irq_affinity(unsigned int, const struct cpumask *); static void uv_noop(unsigned int irq) { @@ -39,25 +54,214 @@ struct irq_chip uv_irq_chip = { .unmask = uv_noop, .eoi = uv_ack_apic, .end = uv_noop, + .set_affinity = uv_set_irq_affinity, }; /* + * Add offset and pnode information of the hub sourcing interrupts to the + * rb tree for a specific irq. + */ +static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade) +{ + struct rb_node **link = &uv_irq_root.rb_node; + struct rb_node *parent = NULL; + struct uv_irq_2_mmr_pnode *n; + struct uv_irq_2_mmr_pnode *e; + unsigned long irqflags; + + n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL, + uv_blade_to_memory_nid(blade)); + if (!n) + return -ENOMEM; + + n->irq = irq; + n->offset = offset; + n->pnode = uv_blade_to_pnode(blade); + spin_lock_irqsave(&uv_irq_lock, irqflags); + /* Find the right place in the rbtree: */ + while (*link) { + parent = *link; + e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list); + + if (unlikely(irq == e->irq)) { + /* irq entry exists */ + e->pnode = uv_blade_to_pnode(blade); + e->offset = offset; + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + kfree(n); + return 0; + } + + if (irq < e->irq) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + /* Insert the node into the rbtree. */ + rb_link_node(&n->list, parent, link); + rb_insert_color(&n->list, &uv_irq_root); + + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + return 0; +} + +/* Retrieve offset and pnode information from the rb tree for a specific irq */ +int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode) +{ + struct uv_irq_2_mmr_pnode *e; + struct rb_node *n; + unsigned long irqflags; + + spin_lock_irqsave(&uv_irq_lock, irqflags); + n = uv_irq_root.rb_node; + while (n) { + e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); + + if (e->irq == irq) { + *offset = e->offset; + *pnode = e->pnode; + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + return 0; + } + + if (irq < e->irq) + n = n->rb_left; + else + n = n->rb_right; + } + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + return -1; +} + +/* + * Re-target the irq to the specified CPU and enable the specified MMR located + * on the specified blade to allow the sending of MSIs to the specified CPU. + */ +static int +arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, + unsigned long mmr_offset, int restrict) +{ + const struct cpumask *eligible_cpu = cpumask_of(cpu); + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg; + int mmr_pnode; + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + int err; + + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != + sizeof(unsigned long)); + + cfg = irq_cfg(irq); + + err = assign_irq_vector(irq, cfg, eligible_cpu); + if (err != 0) + return err; + + if (restrict == UV_AFFINITY_CPU) + desc->status |= IRQ_NO_BALANCING; + else + desc->status |= IRQ_MOVE_PCNTXT; + + set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, + irq_name); + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); + + mmr_pnode = uv_blade_to_pnode(mmr_blade); + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + + return irq; +} + +/* + * Disable the specified MMR located on the specified blade so that MSIs are + * longer allowed to be sent. + */ +static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset) +{ + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != + sizeof(unsigned long)); + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + entry->mask = 1; + + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); +} + +static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; + unsigned int dest; + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + unsigned long mmr_offset; + unsigned mmr_pnode; + + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) + return -1; + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = dest; + + /* Get previously stored MMR and pnode of hub sourcing interrupts */ + if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode)) + return -1; + + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + + return 0; +} + +/* * Set up a mapping of an available irq and vector, and enable the specified * MMR that defines the MSI that is to be sent to the specified CPU when an * interrupt is raised. */ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, - unsigned long mmr_offset) + unsigned long mmr_offset, int restrict) { - int irq; - int ret; + int irq, ret; + + irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade)); - irq = create_irq(); if (irq <= 0) return -EBUSY; - ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset); - if (ret != irq) + ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, + restrict); + if (ret == irq) + uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade); + else destroy_irq(irq); return ret; @@ -71,9 +275,28 @@ EXPORT_SYMBOL_GPL(uv_setup_irq); * * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq(). */ -void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset) +void uv_teardown_irq(unsigned int irq) { - arch_disable_uv_irq(mmr_blade, mmr_offset); + struct uv_irq_2_mmr_pnode *e; + struct rb_node *n; + unsigned long irqflags; + + spin_lock_irqsave(&uv_irq_lock, irqflags); + n = uv_irq_root.rb_node; + while (n) { + e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); + if (e->irq == irq) { + arch_disable_uv_irq(e->pnode, e->offset); + rb_erase(n, &uv_irq_root); + kfree(e); + break; + } + if (irq < e->irq) + n = n->rb_left; + else + n = n->rb_right; + } + spin_unlock_irqrestore(&uv_irq_lock, irqflags); destroy_irq(irq); } EXPORT_SYMBOL_GPL(uv_teardown_irq); diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 31ffc24eec4..abda6f53e71 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -30,6 +30,7 @@ #include <asm/setup.h> #include <asm/apic.h> #include <asm/e820.h> +#include <asm/time.h> #include <asm/io.h> #include <linux/kernel_stat.h> @@ -53,7 +54,7 @@ int is_visws_box(void) return visws_board_type >= 0; } -static int __init visws_time_init(void) +static void __init visws_time_init(void) { printk(KERN_INFO "Starting Cobalt Timer system clock\n"); @@ -66,21 +67,13 @@ static int __init visws_time_init(void) /* Enable (unmask) the timer interrupt */ co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); - /* - * Zero return means the generic timer setup code will set up - * the standard vector: - */ - return 0; + setup_default_timer_irq(); } -static int __init visws_pre_intr_init(void) +/* Replaces the default init_ISA_irqs in the generic setup */ +static void __init visws_pre_intr_init(void) { init_VISWS_APIC_irqs(); - - /* - * We dont want ISA irqs to be set up by the generic code: - */ - return 1; } /* Quirk for machine specific memory setup. */ @@ -156,12 +149,8 @@ static void visws_machine_power_off(void) outl(PIIX_SPECIAL_STOP, 0xCFC); } -static int __init visws_get_smp_config(unsigned int early) +static void __init visws_get_smp_config(unsigned int early) { - /* - * Prevent MP-table parsing by the generic code: - */ - return 1; } /* @@ -194,7 +183,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) return; } - apic_cpus = apic->apicid_to_cpu_present(m->apicid); + apic->apicid_to_cpu_present(m->apicid, &apic_cpus); physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); /* * Validate version @@ -208,7 +197,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) apic_version[m->apicid] = ver; } -static int __init visws_find_smp_config(unsigned int reserve) +static void __init visws_find_smp_config(unsigned int reserve) { struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); @@ -230,21 +219,9 @@ static int __init visws_find_smp_config(unsigned int reserve) MP_processor_info(mp++); mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - return 1; } -static int visws_trap_init(void); - -static struct x86_quirks visws_x86_quirks __initdata = { - .arch_time_init = visws_time_init, - .arch_pre_intr_init = visws_pre_intr_init, - .arch_memory_setup = visws_memory_setup, - .arch_intr_init = NULL, - .arch_trap_init = visws_trap_init, - .mach_get_smp_config = visws_get_smp_config, - .mach_find_smp_config = visws_find_smp_config, -}; +static void visws_trap_init(void); void __init visws_early_detect(void) { @@ -257,11 +234,14 @@ void __init visws_early_detect(void) return; /* - * Install special quirks for timer, interrupt and memory setup: - * Fall back to generic behavior for traps: - * Override generic MP-table parsing: + * Override the default platform setup functions */ - x86_quirks = &visws_x86_quirks; + x86_init.resources.memory_setup = visws_memory_setup; + x86_init.mpparse.get_smp_config = visws_get_smp_config; + x86_init.mpparse.find_smp_config = visws_find_smp_config; + x86_init.irqs.pre_vector_init = visws_pre_intr_init; + x86_init.irqs.trap_init = visws_trap_init; + x86_init.timers.timer_init = visws_time_init; /* * Install reboot quirks: @@ -400,12 +380,10 @@ static __init void cobalt_init(void) co_apic_read(CO_APIC_ID)); } -static int __init visws_trap_init(void) +static void __init visws_trap_init(void) { lithium_init(); cobalt_init(); - - return 1; } /* @@ -508,7 +486,7 @@ static void end_cobalt_irq(unsigned int irq) } static struct irq_chip cobalt_irq_type = { - .typename = "Cobalt-APIC", + .name = "Cobalt-APIC", .startup = startup_cobalt_irq, .shutdown = disable_cobalt_irq, .enable = enable_cobalt_irq, @@ -545,7 +523,7 @@ static void end_piix4_master_irq(unsigned int irq) } static struct irq_chip piix4_master_irq_type = { - .typename = "PIIX4-master", + .name = "PIIX4-master", .startup = startup_piix4_master_irq, .ack = ack_cobalt_irq, .end = end_piix4_master_irq, @@ -553,7 +531,7 @@ static struct irq_chip piix4_master_irq_type = { static struct irq_chip piix4_virtual_irq_type = { - .typename = "PIIX4-virtual", + .name = "PIIX4-virtual", .shutdown = disable_8259A_irq, .enable = enable_8259A_irq, .disable = disable_8259A_irq, diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 95a7289e4b0..d430e4c3019 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -648,7 +648,7 @@ static inline int __init activate_vmi(void) pv_info.paravirt_enabled = 1; pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; - pv_info.name = "vmi"; + pv_info.name = "vmi [deprecated]"; pv_init_ops.patch = vmi_patch; @@ -817,15 +817,15 @@ static inline int __init activate_vmi(void) vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); vmi_timer_ops.cancel_alarm = vmi_get_function(VMI_CALL_CancelAlarm); - pv_time_ops.time_init = vmi_time_init; - pv_time_ops.get_wallclock = vmi_get_wallclock; - pv_time_ops.set_wallclock = vmi_set_wallclock; + x86_init.timers.timer_init = vmi_time_init; #ifdef CONFIG_X86_LOCAL_APIC - pv_apic_ops.setup_boot_clock = vmi_time_bsp_init; - pv_apic_ops.setup_secondary_clock = vmi_time_ap_init; + x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init; + x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init; #endif pv_time_ops.sched_clock = vmi_sched_clock; - pv_time_ops.get_tsc_khz = vmi_tsc_khz; + x86_platform.calibrate_tsc = vmi_tsc_khz; + x86_platform.get_wallclock = vmi_get_wallclock; + x86_platform.set_wallclock = vmi_set_wallclock; /* We have true wallclock functions; disable CMOS clock sync */ no_sync_cmos_clock = 1; diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 2b3eb82efee..611b9e2360d 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -68,7 +68,7 @@ unsigned long long vmi_sched_clock(void) return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); } -/* paravirt_ops.get_tsc_khz = vmi_tsc_khz */ +/* x86_platform.calibrate_tsc = vmi_tsc_khz */ unsigned long vmi_tsc_khz(void) { unsigned long long khz; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 9fc178255c0..3c68fe2d46c 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -45,9 +45,9 @@ PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(7); /* RWE */ #ifdef CONFIG_X86_64 - user PT_LOAD FLAGS(7); /* RWE */ + user PT_LOAD FLAGS(5); /* R_E */ #ifdef CONFIG_SMP - percpu PT_LOAD FLAGS(7); /* RWE */ + percpu PT_LOAD FLAGS(6); /* RW_ */ #endif init PT_LOAD FLAGS(7); /* RWE */ #endif @@ -65,17 +65,11 @@ SECTIONS #endif /* Text and read-only data */ - - /* bootstrapping code */ - .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { - _text = .; - *(.text.head) - } :text = 0x9090 - - /* The rest of the text */ .text : AT(ADDR(.text) - LOAD_OFFSET) { + _text = .; + /* bootstrapping code */ + HEAD_TEXT #ifdef CONFIG_X86_32 - /* not really needed, already page aligned */ . = ALIGN(PAGE_SIZE); *(.text.page_aligned) #endif @@ -94,13 +88,7 @@ SECTIONS NOTES :text :note - /* Exception table */ - . = ALIGN(16); - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 + EXCEPTION_TABLE(16) :text = 0x9090 RO_DATA(PAGE_SIZE) @@ -118,7 +106,6 @@ SECTIONS #endif PAGE_ALIGNED_DATA(PAGE_SIZE) - *(.data.idt) CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) @@ -135,24 +122,21 @@ SECTIONS #ifdef CONFIG_X86_64 #define VSYSCALL_ADDR (-10*1024*1024) -#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data) + SIZEOF(.data) + \ - PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)) -#define VSYSCALL_VIRT_ADDR ((ADDR(.data) + SIZEOF(.data) + \ - PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)) -#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) +#define VLOAD_OFFSET (VSYSCALL_ADDR - __vsyscall_0 + LOAD_OFFSET) #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) -#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) +#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) + . = ALIGN(4096); + __vsyscall_0 = .; + . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { + .vsyscall_0 : AT(VLOAD(.vsyscall_0)) { *(.vsyscall_0) } :user - __vsyscall_0 = VSYSCALL_VIRT_ADDR; - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) @@ -192,11 +176,9 @@ SECTIONS *(.vsyscall_3) } - . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; + . = __vsyscall_0 + PAGE_SIZE; #undef VSYSCALL_ADDR -#undef VSYSCALL_PHYS_ADDR -#undef VSYSCALL_VIRT_ADDR #undef VLOAD_OFFSET #undef VLOAD #undef VVIRT_OFFSET @@ -219,36 +201,12 @@ SECTIONS PERCPU_VADDR(0, :percpu) #endif - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - _sinittext = .; - INIT_TEXT - _einittext = .; - } + INIT_TEXT_SECTION(PAGE_SIZE) #ifdef CONFIG_X86_64 :init #endif - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - INIT_DATA - } - - . = ALIGN(16); - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } + INIT_DATA_SECTION(16) .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { __x86_cpu_dev_start = .; @@ -256,8 +214,6 @@ SECTIONS __x86_cpu_dev_end = .; } - SECURITY_INIT - . = ALIGN(8); .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { __parainstructions = .; @@ -288,15 +244,6 @@ SECTIONS EXIT_DATA } -#ifdef CONFIG_BLK_DEV_INITRD - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } -#endif - #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) PERCPU(PAGE_SIZE) #endif @@ -348,19 +295,19 @@ SECTIONS _end = .; } - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.eh_frame) - *(.discard) - } - STABS_DEBUG DWARF_DEBUG + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { *(.eh_frame) } } #ifdef CONFIG_X86_32 +/* + * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: + */ . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), "kernel image bigger than KERNEL_IMAGE_SIZE"); #else diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 25ee06a80aa..8cb4974ff59 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -87,6 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; + vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } @@ -227,19 +228,11 @@ static long __vsyscall(3) venosys_1(void) } #ifdef CONFIG_SYSCTL - -static int -vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return proc_dointvec(ctl, write, filp, buffer, lenp, ppos); -} - static ctl_table kernel_table2[] = { { .procname = "vsyscall64", .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = vsyscall_sysctl_change }, + .proc_handler = proc_dointvec }, {} }; diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 3909e3ba5ce..a1029769b6f 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -30,9 +30,8 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(copy_user_generic); EXPORT_SYMBOL(__copy_user_nocache); -EXPORT_SYMBOL(copy_from_user); -EXPORT_SYMBOL(copy_to_user); -EXPORT_SYMBOL(__copy_from_user_inatomic); +EXPORT_SYMBOL(_copy_from_user); +EXPORT_SYMBOL(_copy_to_user); EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c new file mode 100644 index 00000000000..d11c5ff7c65 --- /dev/null +++ b/arch/x86/kernel/x86_init.c @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2009 Thomas Gleixner <tglx@linutronix.de> + * + * For licencing details see kernel-base/COPYING + */ +#include <linux/init.h> + +#include <asm/bios_ebda.h> +#include <asm/paravirt.h> +#include <asm/mpspec.h> +#include <asm/setup.h> +#include <asm/apic.h> +#include <asm/e820.h> +#include <asm/time.h> +#include <asm/irq.h> +#include <asm/tsc.h> +#include <asm/iommu.h> + +void __cpuinit x86_init_noop(void) { } +void __init x86_init_uint_noop(unsigned int unused) { } +void __init x86_init_pgd_noop(pgd_t *unused) { } +int __init iommu_init_noop(void) { return 0; } +void iommu_shutdown_noop(void) { } + +/* + * The platform setup functions are preset with the default functions + * for standard PC hardware. + */ +struct x86_init_ops x86_init __initdata = { + + .resources = { + .probe_roms = x86_init_noop, + .reserve_resources = reserve_standard_io_resources, + .memory_setup = default_machine_specific_memory_setup, + }, + + .mpparse = { + .mpc_record = x86_init_uint_noop, + .setup_ioapic_ids = x86_init_noop, + .mpc_apic_id = default_mpc_apic_id, + .smp_read_mpc_oem = default_smp_read_mpc_oem, + .mpc_oem_bus_info = default_mpc_oem_bus_info, + .find_smp_config = default_find_smp_config, + .get_smp_config = default_get_smp_config, + }, + + .irqs = { + .pre_vector_init = init_ISA_irqs, + .intr_init = native_init_IRQ, + .trap_init = x86_init_noop, + }, + + .oem = { + .arch_setup = x86_init_noop, + .banner = default_banner, + }, + + .paging = { + .pagetable_setup_start = native_pagetable_setup_start, + .pagetable_setup_done = native_pagetable_setup_done, + }, + + .timers = { + .setup_percpu_clockev = setup_boot_APIC_clock, + .tsc_pre_init = x86_init_noop, + .timer_init = hpet_time_init, + }, + + .iommu = { + .iommu_init = iommu_init_noop, + }, +}; + +struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { + .setup_percpu_clockev = setup_secondary_APIC_clock, +}; + +struct x86_platform_ops x86_platform = { + .calibrate_tsc = native_calibrate_tsc, + .get_wallclock = mach_get_cmos_time, + .set_wallclock = mach_set_rtc_mmss, + .iommu_shutdown = iommu_shutdown_noop, +}; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 8600a09e0c6..b84e571f417 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -1,12 +1,8 @@ # # KVM configuration # -config HAVE_KVM - bool -config HAVE_KVM_IRQCHIP - bool - default y +source "virt/kvm/Kconfig" menuconfig VIRTUALIZATION bool "Virtualization" @@ -29,6 +25,9 @@ config KVM select PREEMPT_NOTIFIERS select MMU_NOTIFIER select ANON_INODES + select HAVE_KVM_IRQCHIP + select HAVE_KVM_EVENTFD + select KVM_APIC_ARCHITECTURE ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent @@ -63,18 +62,6 @@ config KVM_AMD To compile this as a module, choose M here: the module will be called kvm-amd. -config KVM_TRACE - bool "KVM trace support" - depends on KVM && SYSFS - select MARKERS - select RELAY - select DEBUG_FS - default n - ---help--- - This option allows reading a trace of kvm-related events through - relayfs. Note the ABI is not considered stable and will be - modified in future updates. - # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/lguest/Kconfig diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index b43c4efafe8..0e7fe78d0f7 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -1,22 +1,19 @@ -# -# Makefile for Kernel-based Virtual Machine module -# - -common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ - coalesced_mmio.o irq_comm.o) -ifeq ($(CONFIG_KVM_TRACE),y) -common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) -endif -ifeq ($(CONFIG_IOMMU_API),y) -common-objs += $(addprefix ../../../virt/kvm/, iommu.o) -endif EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm -kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ - i8254.o timer.o -obj-$(CONFIG_KVM) += kvm.o -kvm-intel-objs = vmx.o -obj-$(CONFIG_KVM_INTEL) += kvm-intel.o -kvm-amd-objs = svm.o -obj-$(CONFIG_KVM_AMD) += kvm-amd.o +CFLAGS_x86.o := -I. +CFLAGS_svm.o := -I. +CFLAGS_vmx.o := -I. + +kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ + coalesced_mmio.o irq_comm.o eventfd.o) +kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) + +kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ + i8254.o timer.o +kvm-intel-y += vmx.o +kvm-amd-y += svm.o + +obj-$(CONFIG_KVM) += kvm.o +obj-$(CONFIG_KVM_INTEL) += kvm-intel.o +obj-$(CONFIG_KVM_AMD) += kvm-amd.o diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/emulate.c index 616de4628d6..1be5cd640e9 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1,5 +1,5 @@ /****************************************************************************** - * x86_emulate.c + * emulate.c * * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. * @@ -30,7 +30,9 @@ #define DPRINTF(x...) do {} while (0) #endif #include <linux/module.h> -#include <asm/kvm_x86_emulate.h> +#include <asm/kvm_emulate.h> + +#include "mmu.h" /* for is_long_mode() */ /* * Opcode effective-address decode tables. @@ -60,6 +62,7 @@ #define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ #define SrcOne (7<<4) /* Implied '1' */ #define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ +#define SrcImmU (9<<4) /* Immediate operand, unsigned */ #define SrcMask (0xf<<4) /* Generic ModRM decode. */ #define ModRM (1<<8) @@ -97,11 +100,11 @@ static u32 opcode_table[256] = { /* 0x10 - 0x17 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, /* 0x18 - 0x1F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, /* 0x20 - 0x27 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, @@ -195,7 +198,7 @@ static u32 opcode_table[256] = { ByteOp | SrcImmUByte, SrcImmUByte, /* 0xE8 - 0xEF */ SrcImm | Stack, SrcImm | ImplicitOps, - SrcImm | Src2Imm16, SrcImmByte | ImplicitOps, + SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xF0 - 0xF7 */ @@ -208,7 +211,7 @@ static u32 opcode_table[256] = { static u32 twobyte_table[256] = { /* 0x00 - 0x0F */ - 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0, + 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, @@ -216,7 +219,9 @@ static u32 twobyte_table[256] = { ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 - 0x3F */ - ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ImplicitOps, 0, ImplicitOps, 0, + ImplicitOps, ImplicitOps, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 - 0x47 */ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, @@ -319,8 +324,11 @@ static u32 group2_table[] = { }; /* EFLAGS bit definitions. */ +#define EFLG_VM (1<<17) +#define EFLG_RF (1<<16) #define EFLG_OF (1<<11) #define EFLG_DF (1<<10) +#define EFLG_IF (1<<9) #define EFLG_SF (1<<7) #define EFLG_ZF (1<<6) #define EFLG_AF (1<<4) @@ -1027,6 +1035,7 @@ done_prefixes: c->src.type = OP_MEM; break; case SrcImm: + case SrcImmU: c->src.type = OP_IMM; c->src.ptr = (unsigned long *)c->eip; c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; @@ -1044,6 +1053,19 @@ done_prefixes: c->src.val = insn_fetch(s32, 4, c->eip); break; } + if ((c->d & SrcMask) == SrcImmU) { + switch (c->src.bytes) { + case 1: + c->src.val &= 0xff; + break; + case 2: + c->src.val &= 0xffff; + break; + case 4: + c->src.val &= 0xffffffff; + break; + } + } break; case SrcImmByte: case SrcImmUByte: @@ -1375,6 +1397,217 @@ static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) ctxt->interruptibility = mask; } +static inline void +setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, + struct kvm_segment *cs, struct kvm_segment *ss) +{ + memset(cs, 0, sizeof(struct kvm_segment)); + kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS); + memset(ss, 0, sizeof(struct kvm_segment)); + + cs->l = 0; /* will be adjusted later */ + cs->base = 0; /* flat segment */ + cs->g = 1; /* 4kb granularity */ + cs->limit = 0xffffffff; /* 4GB limit */ + cs->type = 0x0b; /* Read, Execute, Accessed */ + cs->s = 1; + cs->dpl = 0; /* will be adjusted later */ + cs->present = 1; + cs->db = 1; + + ss->unusable = 0; + ss->base = 0; /* flat segment */ + ss->limit = 0xffffffff; /* 4GB limit */ + ss->g = 1; /* 4kb granularity */ + ss->s = 1; + ss->type = 0x03; /* Read/Write, Accessed */ + ss->db = 1; /* 32bit stack segment */ + ss->dpl = 0; + ss->present = 1; +} + +static int +emulate_syscall(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + struct kvm_segment cs, ss; + u64 msr_data; + + /* syscall is not available in real mode */ + if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL + || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) + return -1; + + setup_syscalls_segments(ctxt, &cs, &ss); + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + msr_data >>= 32; + cs.selector = (u16)(msr_data & 0xfffc); + ss.selector = (u16)(msr_data + 8); + + if (is_long_mode(ctxt->vcpu)) { + cs.db = 0; + cs.l = 1; + } + kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); + kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + + c->regs[VCPU_REGS_RCX] = c->eip; + if (is_long_mode(ctxt->vcpu)) { +#ifdef CONFIG_X86_64 + c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; + + kvm_x86_ops->get_msr(ctxt->vcpu, + ctxt->mode == X86EMUL_MODE_PROT64 ? + MSR_LSTAR : MSR_CSTAR, &msr_data); + c->eip = msr_data; + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); + ctxt->eflags &= ~(msr_data | EFLG_RF); +#endif + } else { + /* legacy mode */ + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + c->eip = (u32)msr_data; + + ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); + } + + return 0; +} + +static int +emulate_sysenter(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + struct kvm_segment cs, ss; + u64 msr_data; + + /* inject #UD if LOCK prefix is used */ + if (c->lock_prefix) + return -1; + + /* inject #GP if in real mode or paging is disabled */ + if (ctxt->mode == X86EMUL_MODE_REAL || + !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + + /* XXX sysenter/sysexit have not been tested in 64bit mode. + * Therefore, we inject an #UD. + */ + if (ctxt->mode == X86EMUL_MODE_PROT64) + return -1; + + setup_syscalls_segments(ctxt, &cs, &ss); + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + switch (ctxt->mode) { + case X86EMUL_MODE_PROT32: + if ((msr_data & 0xfffc) == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + break; + case X86EMUL_MODE_PROT64: + if (msr_data == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + break; + } + + ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); + cs.selector = (u16)msr_data; + cs.selector &= ~SELECTOR_RPL_MASK; + ss.selector = cs.selector + 8; + ss.selector &= ~SELECTOR_RPL_MASK; + if (ctxt->mode == X86EMUL_MODE_PROT64 + || is_long_mode(ctxt->vcpu)) { + cs.db = 0; + cs.l = 1; + } + + kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); + kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); + c->eip = msr_data; + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); + c->regs[VCPU_REGS_RSP] = msr_data; + + return 0; +} + +static int +emulate_sysexit(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + struct kvm_segment cs, ss; + u64 msr_data; + int usermode; + + /* inject #UD if LOCK prefix is used */ + if (c->lock_prefix) + return -1; + + /* inject #GP if in real mode or paging is disabled */ + if (ctxt->mode == X86EMUL_MODE_REAL + || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + + /* sysexit must be called from CPL 0 */ + if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + + setup_syscalls_segments(ctxt, &cs, &ss); + + if ((c->rex_prefix & 0x8) != 0x0) + usermode = X86EMUL_MODE_PROT64; + else + usermode = X86EMUL_MODE_PROT32; + + cs.dpl = 3; + ss.dpl = 3; + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + switch (usermode) { + case X86EMUL_MODE_PROT32: + cs.selector = (u16)(msr_data + 16); + if ((msr_data & 0xfffc) == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + ss.selector = (u16)(msr_data + 24); + break; + case X86EMUL_MODE_PROT64: + cs.selector = (u16)(msr_data + 32); + if (msr_data == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + ss.selector = cs.selector + 8; + cs.db = 0; + cs.l = 1; + break; + } + cs.selector |= SELECTOR_RPL_MASK; + ss.selector |= SELECTOR_RPL_MASK; + + kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); + kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + + c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; + c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; + + return 0; +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1970,6 +2203,12 @@ twobyte_insn: goto cannot_emulate; } break; + case 0x05: /* syscall */ + if (emulate_syscall(ctxt) == -1) + goto cannot_emulate; + else + goto writeback; + break; case 0x06: emulate_clts(ctxt->vcpu); c->dst.type = OP_NONE; @@ -2036,6 +2275,18 @@ twobyte_insn: rc = X86EMUL_CONTINUE; c->dst.type = OP_NONE; break; + case 0x34: /* sysenter */ + if (emulate_sysenter(ctxt) == -1) + goto cannot_emulate; + else + goto writeback; + break; + case 0x35: /* sysexit */ + if (emulate_sysexit(ctxt) == -1) + goto cannot_emulate; + else + goto writeback; + break; case 0x40 ... 0x4f: /* cmov */ c->dst.val = c->dst.orig_val = c->src.val; if (!test_cc(c->b, ctxt->eflags)) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 21f68e00524..144e7f60b5e 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -116,7 +116,7 @@ static s64 __kpit_elapsed(struct kvm *kvm) * itself with the initial count and continues counting * from there. */ - remaining = hrtimer_expires_remaining(&ps->pit_timer.timer); + remaining = hrtimer_get_remaining(&ps->pit_timer.timer); elapsed = ps->pit_timer.period - ktime_to_ns(remaining); elapsed = mod_64(elapsed, ps->pit_timer.period); @@ -231,7 +231,7 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu) { struct kvm_pit *pit = vcpu->kvm->arch.vpit; - if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack) + if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack) return atomic_read(&pit->pit_state.pit_timer.pending); return 0; } @@ -252,7 +252,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) struct kvm_pit *pit = vcpu->kvm->arch.vpit; struct hrtimer *timer; - if (vcpu->vcpu_id != 0 || !pit) + if (!kvm_vcpu_is_bsp(vcpu) || !pit) return; timer = &pit->pit_state.pit_timer.timer; @@ -294,7 +294,7 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) pt->timer.function = kvm_timer_fn; pt->t_ops = &kpit_ops; pt->kvm = ps->pit->kvm; - pt->vcpu_id = 0; + pt->vcpu = pt->kvm->bsp_vcpu; atomic_set(&pt->pending, 0); ps->irq_ack = 1; @@ -332,33 +332,62 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) case 1: /* FIXME: enhance mode 4 precision */ case 4: - create_pit_timer(ps, val, 0); + if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) { + create_pit_timer(ps, val, 0); + } break; case 2: case 3: - create_pit_timer(ps, val, 1); + if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){ + create_pit_timer(ps, val, 1); + } break; default: destroy_pit_timer(&ps->pit_timer); } } -void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val) +void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start) +{ + u8 saved_mode; + if (hpet_legacy_start) { + /* save existing mode for later reenablement */ + saved_mode = kvm->arch.vpit->pit_state.channels[0].mode; + kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */ + pit_load_count(kvm, channel, val); + kvm->arch.vpit->pit_state.channels[0].mode = saved_mode; + } else { + pit_load_count(kvm, channel, val); + } +} + +static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_pit, dev); +} + +static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev) { - mutex_lock(&kvm->arch.vpit->pit_state.lock); - pit_load_count(kvm, channel, val); - mutex_unlock(&kvm->arch.vpit->pit_state.lock); + return container_of(dev, struct kvm_pit, speaker_dev); } -static void pit_ioport_write(struct kvm_io_device *this, - gpa_t addr, int len, const void *data) +static inline int pit_in_range(gpa_t addr) { - struct kvm_pit *pit = (struct kvm_pit *)this->private; + return ((addr >= KVM_PIT_BASE_ADDRESS) && + (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); +} + +static int pit_ioport_write(struct kvm_io_device *this, + gpa_t addr, int len, const void *data) +{ + struct kvm_pit *pit = dev_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; struct kvm *kvm = pit->kvm; int channel, access; struct kvm_kpit_channel_state *s; u32 val = *(u32 *) data; + if (!pit_in_range(addr)) + return -EOPNOTSUPP; val &= 0xff; addr &= KVM_PIT_CHANNEL_MASK; @@ -421,16 +450,19 @@ static void pit_ioport_write(struct kvm_io_device *this, } mutex_unlock(&pit_state->lock); + return 0; } -static void pit_ioport_read(struct kvm_io_device *this, - gpa_t addr, int len, void *data) +static int pit_ioport_read(struct kvm_io_device *this, + gpa_t addr, int len, void *data) { - struct kvm_pit *pit = (struct kvm_pit *)this->private; + struct kvm_pit *pit = dev_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; struct kvm *kvm = pit->kvm; int ret, count; struct kvm_kpit_channel_state *s; + if (!pit_in_range(addr)) + return -EOPNOTSUPP; addr &= KVM_PIT_CHANNEL_MASK; s = &pit_state->channels[addr]; @@ -485,37 +517,36 @@ static void pit_ioport_read(struct kvm_io_device *this, memcpy(data, (char *)&ret, len); mutex_unlock(&pit_state->lock); + return 0; } -static int pit_in_range(struct kvm_io_device *this, gpa_t addr, - int len, int is_write) -{ - return ((addr >= KVM_PIT_BASE_ADDRESS) && - (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); -} - -static void speaker_ioport_write(struct kvm_io_device *this, - gpa_t addr, int len, const void *data) +static int speaker_ioport_write(struct kvm_io_device *this, + gpa_t addr, int len, const void *data) { - struct kvm_pit *pit = (struct kvm_pit *)this->private; + struct kvm_pit *pit = speaker_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; struct kvm *kvm = pit->kvm; u32 val = *(u32 *) data; + if (addr != KVM_SPEAKER_BASE_ADDRESS) + return -EOPNOTSUPP; mutex_lock(&pit_state->lock); pit_state->speaker_data_on = (val >> 1) & 1; pit_set_gate(kvm, 2, val & 1); mutex_unlock(&pit_state->lock); + return 0; } -static void speaker_ioport_read(struct kvm_io_device *this, - gpa_t addr, int len, void *data) +static int speaker_ioport_read(struct kvm_io_device *this, + gpa_t addr, int len, void *data) { - struct kvm_pit *pit = (struct kvm_pit *)this->private; + struct kvm_pit *pit = speaker_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; struct kvm *kvm = pit->kvm; unsigned int refresh_clock; int ret; + if (addr != KVM_SPEAKER_BASE_ADDRESS) + return -EOPNOTSUPP; /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */ refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; @@ -527,12 +558,7 @@ static void speaker_ioport_read(struct kvm_io_device *this, len = sizeof(ret); memcpy(data, (char *)&ret, len); mutex_unlock(&pit_state->lock); -} - -static int speaker_in_range(struct kvm_io_device *this, gpa_t addr, - int len, int is_write) -{ - return (addr == KVM_SPEAKER_BASE_ADDRESS); + return 0; } void kvm_pit_reset(struct kvm_pit *pit) @@ -541,6 +567,7 @@ void kvm_pit_reset(struct kvm_pit *pit) struct kvm_kpit_channel_state *c; mutex_lock(&pit->pit_state.lock); + pit->pit_state.flags = 0; for (i = 0; i < 3; i++) { c = &pit->pit_state.channels[i]; c->mode = 0xff; @@ -563,10 +590,22 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) } } -struct kvm_pit *kvm_create_pit(struct kvm *kvm) +static const struct kvm_io_device_ops pit_dev_ops = { + .read = pit_ioport_read, + .write = pit_ioport_write, +}; + +static const struct kvm_io_device_ops speaker_dev_ops = { + .read = speaker_ioport_read, + .write = speaker_ioport_write, +}; + +/* Caller must have writers lock on slots_lock */ +struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) { struct kvm_pit *pit; struct kvm_kpit_state *pit_state; + int ret; pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); if (!pit) @@ -582,19 +621,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) mutex_lock(&pit->pit_state.lock); spin_lock_init(&pit->pit_state.inject_lock); - /* Initialize PIO device */ - pit->dev.read = pit_ioport_read; - pit->dev.write = pit_ioport_write; - pit->dev.in_range = pit_in_range; - pit->dev.private = pit; - kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); - - pit->speaker_dev.read = speaker_ioport_read; - pit->speaker_dev.write = speaker_ioport_write; - pit->speaker_dev.in_range = speaker_in_range; - pit->speaker_dev.private = pit; - kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev); - kvm->arch.vpit = pit; pit->kvm = kvm; @@ -613,7 +639,30 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) pit->mask_notifier.func = pit_mask_notifer; kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + kvm_iodevice_init(&pit->dev, &pit_dev_ops); + ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); + if (ret < 0) + goto fail; + + if (flags & KVM_PIT_SPEAKER_DUMMY) { + kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); + ret = __kvm_io_bus_register_dev(&kvm->pio_bus, + &pit->speaker_dev); + if (ret < 0) + goto fail_unregister; + } + return pit; + +fail_unregister: + __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev); + +fail: + if (pit->irq_source_id >= 0) + kvm_free_irq_source_id(kvm, pit->irq_source_id); + + kfree(pit); + return NULL; } void kvm_free_pit(struct kvm *kvm) @@ -623,6 +672,8 @@ void kvm_free_pit(struct kvm *kvm) if (kvm->arch.vpit) { kvm_unregister_irq_mask_notifier(kvm, 0, &kvm->arch.vpit->mask_notifier); + kvm_unregister_irq_ack_notifier(kvm, + &kvm->arch.vpit->pit_state.irq_ack_notifier); mutex_lock(&kvm->arch.vpit->pit_state.lock); timer = &kvm->arch.vpit->pit_state.pit_timer.timer; hrtimer_cancel(timer); @@ -637,10 +688,10 @@ static void __inject_pit_timer_intr(struct kvm *kvm) struct kvm_vcpu *vcpu; int i; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->irq_lock); kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->irq_lock); /* * Provides NMI watchdog support via Virtual Wire mode. @@ -652,11 +703,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm) * VCPU0, and only if its LVT0 is in EXTINT mode. */ if (kvm->arch.vapics_in_nmi_mode > 0) - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = kvm->vcpus[i]; - if (vcpu) - kvm_apic_nmi_wd_deliver(vcpu); - } + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_apic_nmi_wd_deliver(vcpu); } void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) @@ -665,7 +713,7 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) struct kvm *kvm = vcpu->kvm; struct kvm_kpit_state *ps; - if (vcpu && pit) { + if (pit) { int inject = 0; ps = &pit->pit_state; diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index bbd863ff60b..d4c1c7ffdc0 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -21,6 +21,7 @@ struct kvm_kpit_channel_state { struct kvm_kpit_state { struct kvm_kpit_channel_state channels[3]; + u32 flags; struct kvm_timer pit_timer; bool is_periodic; u32 speaker_data_on; @@ -49,8 +50,8 @@ struct kvm_pit { #define KVM_PIT_CHANNEL_MASK 0x3 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); -void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); -struct kvm_pit *kvm_create_pit(struct kvm *kvm); +void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start); +struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); void kvm_free_pit(struct kvm *kvm); void kvm_pit_reset(struct kvm_pit *pit); diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 1ccb50c74f1..01f15168280 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -30,50 +30,24 @@ #include "irq.h" #include <linux/kvm_host.h> - -static void pic_lock(struct kvm_pic *s) - __acquires(&s->lock) -{ - spin_lock(&s->lock); -} - -static void pic_unlock(struct kvm_pic *s) - __releases(&s->lock) -{ - struct kvm *kvm = s->kvm; - unsigned acks = s->pending_acks; - bool wakeup = s->wakeup_needed; - struct kvm_vcpu *vcpu; - - s->pending_acks = 0; - s->wakeup_needed = false; - - spin_unlock(&s->lock); - - while (acks) { - kvm_notify_acked_irq(kvm, SELECT_PIC(__ffs(acks)), - __ffs(acks)); - acks &= acks - 1; - } - - if (wakeup) { - vcpu = s->kvm->vcpus[0]; - if (vcpu) - kvm_vcpu_kick(vcpu); - } -} +#include "trace.h" static void pic_clear_isr(struct kvm_kpic_state *s, int irq) { s->isr &= ~(1 << irq); s->isr_ack |= (1 << irq); + if (s != &s->pics_state->pics[0]) + irq += 8; + kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); } void kvm_pic_clear_isr_ack(struct kvm *kvm) { struct kvm_pic *s = pic_irqchip(kvm); + spin_lock(&s->lock); s->pics[0].isr_ack = 0xff; s->pics[1].isr_ack = 0xff; + spin_unlock(&s->lock); } /* @@ -174,9 +148,9 @@ static void pic_update_irq(struct kvm_pic *s) void kvm_pic_update_irq(struct kvm_pic *s) { - pic_lock(s); + spin_lock(&s->lock); pic_update_irq(s); - pic_unlock(s); + spin_unlock(&s->lock); } int kvm_pic_set_irq(void *opaque, int irq, int level) @@ -184,12 +158,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) struct kvm_pic *s = opaque; int ret = -1; - pic_lock(s); + spin_lock(&s->lock); if (irq >= 0 && irq < PIC_NUM_PINS) { ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); pic_update_irq(s); + trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, + s->pics[irq >> 3].imr, ret == 0); } - pic_unlock(s); + spin_unlock(&s->lock); return ret; } @@ -217,7 +193,7 @@ int kvm_pic_read_irq(struct kvm *kvm) int irq, irq2, intno; struct kvm_pic *s = pic_irqchip(kvm); - pic_lock(s); + spin_lock(&s->lock); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { pic_intack(&s->pics[0], irq); @@ -242,8 +218,7 @@ int kvm_pic_read_irq(struct kvm *kvm) intno = s->pics[0].irq_base + irq; } pic_update_irq(s); - pic_unlock(s); - kvm_notify_acked_irq(kvm, SELECT_PIC(irq), irq); + spin_unlock(&s->lock); return intno; } @@ -252,7 +227,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s) { int irq, irqbase, n; struct kvm *kvm = s->pics_state->irq_request_opaque; - struct kvm_vcpu *vcpu0 = kvm->vcpus[0]; + struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu; if (s == &s->pics_state->pics[0]) irqbase = 0; @@ -263,7 +238,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s) if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) if (s->irr & (1 << irq) || s->isr & (1 << irq)) { n = irq + irqbase; - s->pics_state->pending_acks |= 1 << n; + kvm_notify_acked_irq(kvm, SELECT_PIC(n), n); } } s->last_irr = 0; @@ -428,8 +403,7 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1) return s->elcr; } -static int picdev_in_range(struct kvm_io_device *this, gpa_t addr, - int len, int is_write) +static int picdev_in_range(gpa_t addr) { switch (addr) { case 0x20: @@ -444,18 +418,25 @@ static int picdev_in_range(struct kvm_io_device *this, gpa_t addr, } } -static void picdev_write(struct kvm_io_device *this, +static inline struct kvm_pic *to_pic(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_pic, dev); +} + +static int picdev_write(struct kvm_io_device *this, gpa_t addr, int len, const void *val) { - struct kvm_pic *s = this->private; + struct kvm_pic *s = to_pic(this); unsigned char data = *(unsigned char *)val; + if (!picdev_in_range(addr)) + return -EOPNOTSUPP; if (len != 1) { if (printk_ratelimit()) printk(KERN_ERR "PIC: non byte write\n"); - return; + return 0; } - pic_lock(s); + spin_lock(&s->lock); switch (addr) { case 0x20: case 0x21: @@ -468,21 +449,24 @@ static void picdev_write(struct kvm_io_device *this, elcr_ioport_write(&s->pics[addr & 1], addr, data); break; } - pic_unlock(s); + spin_unlock(&s->lock); + return 0; } -static void picdev_read(struct kvm_io_device *this, - gpa_t addr, int len, void *val) +static int picdev_read(struct kvm_io_device *this, + gpa_t addr, int len, void *val) { - struct kvm_pic *s = this->private; + struct kvm_pic *s = to_pic(this); unsigned char data = 0; + if (!picdev_in_range(addr)) + return -EOPNOTSUPP; if (len != 1) { if (printk_ratelimit()) printk(KERN_ERR "PIC: non byte read\n"); - return; + return 0; } - pic_lock(s); + spin_lock(&s->lock); switch (addr) { case 0x20: case 0x21: @@ -496,7 +480,8 @@ static void picdev_read(struct kvm_io_device *this, break; } *(unsigned char *)val = data; - pic_unlock(s); + spin_unlock(&s->lock); + return 0; } /* @@ -505,20 +490,27 @@ static void picdev_read(struct kvm_io_device *this, static void pic_irq_request(void *opaque, int level) { struct kvm *kvm = opaque; - struct kvm_vcpu *vcpu = kvm->vcpus[0]; + struct kvm_vcpu *vcpu = kvm->bsp_vcpu; struct kvm_pic *s = pic_irqchip(kvm); int irq = pic_get_irq(&s->pics[0]); s->output = level; if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { s->pics[0].isr_ack &= ~(1 << irq); - s->wakeup_needed = true; + kvm_vcpu_kick(vcpu); } } +static const struct kvm_io_device_ops picdev_ops = { + .read = picdev_read, + .write = picdev_write, +}; + struct kvm_pic *kvm_create_pic(struct kvm *kvm) { struct kvm_pic *s; + int ret; + s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) return NULL; @@ -534,10 +526,12 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) /* * Initialize PIO device */ - s->dev.read = picdev_read; - s->dev.write = picdev_write; - s->dev.in_range = picdev_in_range; - s->dev.private = s; - kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev); + kvm_iodevice_init(&s->dev, &picdev_ops); + ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev); + if (ret < 0) { + kfree(s); + return NULL; + } + return s; } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 9f593188129..7d6058a2fd3 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -63,7 +63,6 @@ struct kvm_kpic_state { struct kvm_pic { spinlock_t lock; - bool wakeup_needed; unsigned pending_acks; struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 1ff819dce7d..7bcc5b6a440 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -29,4 +29,13 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) kvm_register_write(vcpu, VCPU_REGS_RIP, val); } +static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) +{ + if (!test_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail)) + kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); + + return vcpu->arch.pdptrs[index]; +} + #endif diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h deleted file mode 100644 index ed66e4c078d..00000000000 --- a/arch/x86/kvm/kvm_svm.h +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef __KVM_SVM_H -#define __KVM_SVM_H - -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/list.h> -#include <linux/kvm_host.h> -#include <asm/msr.h> - -#include <asm/svm.h> - -static const u32 host_save_user_msrs[] = { -#ifdef CONFIG_X86_64 - MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, - MSR_FS_BASE, -#endif - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, -}; - -#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) - -struct kvm_vcpu; - -struct vcpu_svm { - struct kvm_vcpu vcpu; - struct vmcb *vmcb; - unsigned long vmcb_pa; - struct svm_cpu_data *svm_data; - uint64_t asid_generation; - - u64 next_rip; - - u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; - u64 host_gs_base; - unsigned long host_cr2; - - u32 *msrpm; - struct vmcb *hsave; - u64 hsave_msr; - - u64 nested_vmcb; - - /* These are the merged vectors */ - u32 *nested_msrpm; - - /* gpa pointers to the real vectors */ - u64 nested_vmcb_msrpm; -}; - -#endif - diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h index 26bd6ba74e1..55c7524dda5 100644 --- a/arch/x86/kvm/kvm_timer.h +++ b/arch/x86/kvm/kvm_timer.h @@ -6,7 +6,7 @@ struct kvm_timer { bool reinject; struct kvm_timer_ops *t_ops; struct kvm *kvm; - int vcpu_id; + struct kvm_vcpu *vcpu; }; struct kvm_timer_ops { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ae99d83f81a..23c217692ea 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -32,8 +32,11 @@ #include <asm/current.h> #include <asm/apicdef.h> #include <asm/atomic.h> +#include <asm/apicdef.h> #include "kvm_cache_regs.h" #include "irq.h" +#include "trace.h" +#include "x86.h" #ifndef CONFIG_X86_64 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) @@ -141,6 +144,26 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val) return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; } +void kvm_apic_set_version(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_cpuid_entry2 *feat; + u32 v = APIC_VERSION; + + if (!irqchip_in_kernel(vcpu->kvm)) + return; + + feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); + if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31)))) + v |= APIC_LVR_DIRECTED_EOI; + apic_set_reg(apic, APIC_LVR, v); +} + +static inline int apic_x2apic_mode(struct kvm_lapic *apic) +{ + return apic->vcpu->arch.apic_base & X2APIC_ENABLE; +} + static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ @@ -165,36 +188,52 @@ static int find_highest_vector(void *bitmap) static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) { + apic->irr_pending = true; return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); } -static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) +static inline int apic_search_irr(struct kvm_lapic *apic) { - apic_clear_vector(vec, apic->regs + APIC_IRR); + return find_highest_vector(apic->regs + APIC_IRR); } static inline int apic_find_highest_irr(struct kvm_lapic *apic) { int result; - result = find_highest_vector(apic->regs + APIC_IRR); + if (!apic->irr_pending) + return -1; + + result = apic_search_irr(apic); ASSERT(result == -1 || result >= 16); return result; } +static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) +{ + apic->irr_pending = false; + apic_clear_vector(vec, apic->regs + APIC_IRR); + if (apic_search_irr(apic) != -1) + apic->irr_pending = true; +} + int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; int highest_irr; + /* This may race with setting of irr in __apic_accept_irq() and + * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq + * will cause vmexit immediately and the value will be recalculated + * on the next vmentry. + */ if (!apic) return 0; highest_irr = apic_find_highest_irr(apic); return highest_irr; } -EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode); @@ -251,7 +290,12 @@ int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) { int result = 0; - u8 logical_id; + u32 logical_id; + + if (apic_x2apic_mode(apic)) { + logical_id = apic_get_reg(apic, APIC_LDR); + return logical_id & mda; + } logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR)); @@ -331,6 +375,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; result = !apic_test_and_set_irr(vector, apic); + trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, + trig_mode, vector, !result); if (!result) { if (trig_mode) apic_debug("level trig mode repeatedly for " @@ -425,7 +471,11 @@ static void apic_set_eoi(struct kvm_lapic *apic) trigger_mode = IOAPIC_LEVEL_TRIG; else trigger_mode = IOAPIC_EDGE_TRIG; - kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); + if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) { + mutex_lock(&apic->vcpu->kvm->irq_lock); + kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); + mutex_unlock(&apic->vcpu->kvm->irq_lock); + } } static void apic_send_ipi(struct kvm_lapic *apic) @@ -440,7 +490,12 @@ static void apic_send_ipi(struct kvm_lapic *apic) irq.level = icr_low & APIC_INT_ASSERT; irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; irq.shorthand = icr_low & APIC_SHORT_MASK; - irq.dest_id = GET_APIC_DEST_FIELD(icr_high); + if (apic_x2apic_mode(apic)) + irq.dest_id = icr_high; + else + irq.dest_id = GET_APIC_DEST_FIELD(icr_high); + + trace_kvm_apic_ipi(icr_low, irq.dest_id); apic_debug("icr_high 0x%x, icr_low 0x%x, " "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " @@ -449,7 +504,9 @@ static void apic_send_ipi(struct kvm_lapic *apic) irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, irq.vector); + mutex_lock(&apic->vcpu->kvm->irq_lock); kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); + mutex_unlock(&apic->vcpu->kvm->irq_lock); } static u32 apic_get_tmcct(struct kvm_lapic *apic) @@ -464,7 +521,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) if (apic_get_reg(apic, APIC_TMICT) == 0) return 0; - remaining = hrtimer_expires_remaining(&apic->lapic_timer.timer); + remaining = hrtimer_get_remaining(&apic->lapic_timer.timer); if (ktime_to_ns(remaining) < 0) remaining = ktime_set(0, 0); @@ -495,12 +552,16 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) { u32 val = 0; - KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler); - if (offset >= LAPIC_MMIO_LENGTH) return 0; switch (offset) { + case APIC_ID: + if (apic_x2apic_mode(apic)) + val = kvm_apic_id(apic); + else + val = kvm_apic_id(apic) << 24; + break; case APIC_ARBPRI: printk(KERN_WARNING "Access APIC ARBPRI register " "which is for P6\n"); @@ -522,21 +583,35 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) return val; } -static void apic_mmio_read(struct kvm_io_device *this, - gpa_t address, int len, void *data) +static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_lapic, dev); +} + +static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len, + void *data) { - struct kvm_lapic *apic = (struct kvm_lapic *)this->private; - unsigned int offset = address - apic->base_address; unsigned char alignment = offset & 0xf; u32 result; + /* this bitmask has a bit cleared for each reserver register */ + static const u64 rmask = 0x43ff01ffffffe70cULL; if ((alignment + len) > 4) { - printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d", - (unsigned long)address, len); - return; + apic_debug("KVM_APIC_READ: alignment error %x %d\n", + offset, len); + return 1; } + + if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) { + apic_debug("KVM_APIC_READ: read reserved register %x\n", + offset); + return 1; + } + result = __apic_read(apic, offset & ~0xf); + trace_kvm_apic_read(offset, result); + switch (len) { case 1: case 2: @@ -548,6 +623,28 @@ static void apic_mmio_read(struct kvm_io_device *this, "should be 1,2, or 4 instead\n", len); break; } + return 0; +} + +static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) +{ + return apic_hw_enabled(apic) && + addr >= apic->base_address && + addr < apic->base_address + LAPIC_MMIO_LENGTH; +} + +static int apic_mmio_read(struct kvm_io_device *this, + gpa_t address, int len, void *data) +{ + struct kvm_lapic *apic = to_lapic(this); + u32 offset = address - apic->base_address; + + if (!apic_mmio_in_range(apic, address)) + return -EOPNOTSUPP; + + apic_reg_read(apic, offset, len, data); + + return 0; } static void update_divide_count(struct kvm_lapic *apic) @@ -567,12 +664,21 @@ static void start_apic_timer(struct kvm_lapic *apic) { ktime_t now = apic->lapic_timer.timer.base->get_time(); - apic->lapic_timer.period = apic_get_reg(apic, APIC_TMICT) * + apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) * APIC_BUS_CYCLE_NS * apic->divide_count; atomic_set(&apic->lapic_timer.pending, 0); if (!apic->lapic_timer.period) return; + /* + * Do not allow the guest to program periodic timers with small + * interval, since the hrtimers are not throttled by the host + * scheduler. + */ + if (apic_lvtt_period(apic)) { + if (apic->lapic_timer.period < NSEC_PER_MSEC/2) + apic->lapic_timer.period = NSEC_PER_MSEC/2; + } hrtimer_start(&apic->lapic_timer.timer, ktime_add_ns(now, apic->lapic_timer.period), @@ -603,40 +709,18 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) apic->vcpu->kvm->arch.vapics_in_nmi_mode--; } -static void apic_mmio_write(struct kvm_io_device *this, - gpa_t address, int len, const void *data) +static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) { - struct kvm_lapic *apic = (struct kvm_lapic *)this->private; - unsigned int offset = address - apic->base_address; - unsigned char alignment = offset & 0xf; - u32 val; - - /* - * APIC register must be aligned on 128-bits boundary. - * 32/64/128 bits registers must be accessed thru 32 bits. - * Refer SDM 8.4.1 - */ - if (len != 4 || alignment) { - /* Don't shout loud, $infamous_os would cause only noise. */ - apic_debug("apic write: bad size=%d %lx\n", - len, (long)address); - return; - } - - val = *(u32 *) data; - - /* too common printing */ - if (offset != APIC_EOI) - apic_debug("%s: offset 0x%x with length 0x%x, and value is " - "0x%x\n", __func__, offset, len, val); - - offset &= 0xff0; + int ret = 0; - KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler); + trace_kvm_apic_write(reg, val); - switch (offset) { + switch (reg) { case APIC_ID: /* Local APIC ID */ - apic_set_reg(apic, APIC_ID, val); + if (!apic_x2apic_mode(apic)) + apic_set_reg(apic, APIC_ID, val); + else + ret = 1; break; case APIC_TASKPRI: @@ -649,15 +733,24 @@ static void apic_mmio_write(struct kvm_io_device *this, break; case APIC_LDR: - apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); + if (!apic_x2apic_mode(apic)) + apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); + else + ret = 1; break; case APIC_DFR: - apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); + if (!apic_x2apic_mode(apic)) + apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); + else + ret = 1; break; - case APIC_SPIV: - apic_set_reg(apic, APIC_SPIV, val & 0x3ff); + case APIC_SPIV: { + u32 mask = 0x3ff; + if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) + mask |= APIC_SPIV_DIRECTED_EOI; + apic_set_reg(apic, APIC_SPIV, val & mask); if (!(val & APIC_SPIV_APIC_ENABLED)) { int i; u32 lvt_val; @@ -672,7 +765,7 @@ static void apic_mmio_write(struct kvm_io_device *this, } break; - + } case APIC_ICR: /* No delay here, so we always clear the pending bit */ apic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); @@ -680,7 +773,9 @@ static void apic_mmio_write(struct kvm_io_device *this, break; case APIC_ICR2: - apic_set_reg(apic, APIC_ICR2, val & 0xff000000); + if (!apic_x2apic_mode(apic)) + val &= 0xff000000; + apic_set_reg(apic, APIC_ICR2, val); break; case APIC_LVT0: @@ -694,8 +789,8 @@ static void apic_mmio_write(struct kvm_io_device *this, if (!apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; - val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4]; - apic_set_reg(apic, offset, val); + val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; + apic_set_reg(apic, reg, val); break; @@ -703,7 +798,7 @@ static void apic_mmio_write(struct kvm_io_device *this, hrtimer_cancel(&apic->lapic_timer.timer); apic_set_reg(apic, APIC_TMICT, val); start_apic_timer(apic); - return; + break; case APIC_TDCR: if (val & 4) @@ -712,27 +807,59 @@ static void apic_mmio_write(struct kvm_io_device *this, update_divide_count(apic); break; + case APIC_ESR: + if (apic_x2apic_mode(apic) && val != 0) { + printk(KERN_ERR "KVM_WRITE:ESR not zero %x\n", val); + ret = 1; + } + break; + + case APIC_SELF_IPI: + if (apic_x2apic_mode(apic)) { + apic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff)); + } else + ret = 1; + break; default: - apic_debug("Local APIC Write to read-only register %x\n", - offset); + ret = 1; break; } - + if (ret) + apic_debug("Local APIC Write to read-only register %x\n", reg); + return ret; } -static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr, - int len, int size) +static int apic_mmio_write(struct kvm_io_device *this, + gpa_t address, int len, const void *data) { - struct kvm_lapic *apic = (struct kvm_lapic *)this->private; - int ret = 0; + struct kvm_lapic *apic = to_lapic(this); + unsigned int offset = address - apic->base_address; + u32 val; + if (!apic_mmio_in_range(apic, address)) + return -EOPNOTSUPP; - if (apic_hw_enabled(apic) && - (addr >= apic->base_address) && - (addr < (apic->base_address + LAPIC_MMIO_LENGTH))) - ret = 1; + /* + * APIC register must be aligned on 128-bits boundary. + * 32/64/128 bits registers must be accessed thru 32 bits. + * Refer SDM 8.4.1 + */ + if (len != 4 || (offset & 0xf)) { + /* Don't shout loud, $infamous_os would cause only noise. */ + apic_debug("apic write: bad size=%d %lx\n", len, (long)address); + return 0; + } - return ret; + val = *(u32*)data; + + /* too common printing */ + if (offset != APIC_EOI) + apic_debug("%s: offset 0x%x with length 0x%x, and value is " + "0x%x\n", __func__, offset, len, val); + + apic_reg_write(apic, offset & 0xff0, val); + + return 0; } void kvm_free_lapic(struct kvm_vcpu *vcpu) @@ -763,7 +890,6 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) apic_set_tpr(apic, ((cr8 & 0x0f) << 4) | (apic_get_reg(apic, APIC_TASKPRI) & 4)); } -EXPORT_SYMBOL_GPL(kvm_lapic_set_tpr); u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) { @@ -776,7 +902,6 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) return (tpr & 0xf0) >> 4; } -EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) { @@ -787,10 +912,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) vcpu->arch.apic_base = value; return; } - if (apic->vcpu->vcpu_id) + + if (!kvm_vcpu_is_bsp(apic->vcpu)) value &= ~MSR_IA32_APICBASE_BSP; vcpu->arch.apic_base = value; + if (apic_x2apic_mode(apic)) { + u32 id = kvm_apic_id(apic); + u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf)); + apic_set_reg(apic, APIC_LDR, ldr); + } apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; @@ -800,12 +931,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) } -u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.apic_base; -} -EXPORT_SYMBOL_GPL(kvm_lapic_get_base); - void kvm_lapic_reset(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic; @@ -821,7 +946,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) hrtimer_cancel(&apic->lapic_timer.timer); apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); - apic_set_reg(apic, APIC_LVR, APIC_VERSION); + kvm_apic_set_version(apic->vcpu); for (i = 0; i < APIC_LVT_NUM; i++) apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); @@ -842,9 +967,10 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } + apic->irr_pending = false; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); - if (vcpu->vcpu_id == 0) + if (kvm_vcpu_is_bsp(vcpu)) vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; apic_update_ppr(apic); @@ -855,7 +981,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) vcpu, kvm_apic_id(apic), vcpu->arch.apic_base, apic->base_address); } -EXPORT_SYMBOL_GPL(kvm_lapic_reset); bool kvm_apic_present(struct kvm_vcpu *vcpu) { @@ -866,7 +991,6 @@ int kvm_lapic_enabled(struct kvm_vcpu *vcpu) { return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic); } -EXPORT_SYMBOL_GPL(kvm_lapic_enabled); /* *---------------------------------------------------------------------- @@ -917,6 +1041,11 @@ static struct kvm_timer_ops lapic_timer_ops = { .is_periodic = lapic_is_periodic, }; +static const struct kvm_io_device_ops apic_mmio_ops = { + .read = apic_mmio_read, + .write = apic_mmio_write, +}; + int kvm_create_lapic(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic; @@ -945,16 +1074,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) apic->lapic_timer.timer.function = kvm_timer_fn; apic->lapic_timer.t_ops = &lapic_timer_ops; apic->lapic_timer.kvm = vcpu->kvm; - apic->lapic_timer.vcpu_id = vcpu->vcpu_id; + apic->lapic_timer.vcpu = vcpu; apic->base_address = APIC_DEFAULT_PHYS_BASE; vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; kvm_lapic_reset(vcpu); - apic->dev.read = apic_mmio_read; - apic->dev.write = apic_mmio_write; - apic->dev.in_range = apic_mmio_range; - apic->dev.private = apic; + kvm_iodevice_init(&apic->dev, &apic_mmio_ops); return 0; nomem_free_apic: @@ -962,7 +1088,6 @@ nomem_free_apic: nomem: return -ENOMEM; } -EXPORT_SYMBOL_GPL(kvm_create_lapic); int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) { @@ -985,7 +1110,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); int r = 0; - if (vcpu->vcpu_id == 0) { + if (kvm_vcpu_is_bsp(vcpu)) { if (!apic_hw_enabled(vcpu->arch.apic)) r = 1; if ((lvt0 & APIC_LVT_MASKED) == 0 && @@ -1025,7 +1150,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) apic->base_address = vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; - apic_set_reg(apic, APIC_LVR, APIC_VERSION); + kvm_apic_set_version(vcpu); + apic_update_ppr(apic); hrtimer_cancel(&apic->lapic_timer.timer); update_divide_count(apic); @@ -1092,3 +1218,35 @@ void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) vcpu->arch.apic->vapic_addr = vapic_addr; } + +int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 reg = (msr - APIC_BASE_MSR) << 4; + + if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) + return 1; + + /* if this is ICR write vector before command */ + if (msr == 0x830) + apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); + return apic_reg_write(apic, reg, (u32)data); +} + +int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0; + + if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) + return 1; + + if (apic_reg_read(apic, reg, 4, &low)) + return 1; + if (msr == 0x830) + apic_reg_read(apic, APIC_ICR2, 4, &high); + + *data = (((u64)high) << 32) | low; + + return 0; +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index a587f8349c4..40010b09c4a 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -12,6 +12,7 @@ struct kvm_lapic { struct kvm_timer lapic_timer; u32 divide_count; struct kvm_vcpu *vcpu; + bool irr_pending; struct page *regs_page; void *regs; gpa_t vapic_addr; @@ -28,6 +29,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); +void kvm_apic_set_version(struct kvm_vcpu *vcpu); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); @@ -44,4 +46,6 @@ void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); +int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0ef5bb2b404..818b92ad82c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -18,6 +18,7 @@ */ #include "mmu.h" +#include "kvm_cache_regs.h" #include <linux/kvm_host.h> #include <linux/types.h> @@ -107,6 +108,9 @@ module_param(oos_shadow, bool, 0644); #define PT32_LEVEL_MASK(level) \ (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) +#define PT32_LVL_OFFSET_MASK(level) \ + (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT32_LEVEL_BITS))) - 1)) #define PT32_INDEX(address, level)\ (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) @@ -115,10 +119,19 @@ module_param(oos_shadow, bool, 0644); #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) #define PT64_DIR_BASE_ADDR_MASK \ (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) +#define PT64_LVL_ADDR_MASK(level) \ + (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT64_LEVEL_BITS))) - 1)) +#define PT64_LVL_OFFSET_MASK(level) \ + (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT64_LEVEL_BITS))) - 1)) #define PT32_BASE_ADDR_MASK PAGE_MASK #define PT32_DIR_BASE_ADDR_MASK \ (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) +#define PT32_LVL_ADDR_MASK(level) \ + (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT32_LEVEL_BITS))) - 1)) #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | PT64_NX_MASK) @@ -129,6 +142,7 @@ module_param(oos_shadow, bool, 0644); #define PFERR_RSVD_MASK (1U << 3) #define PFERR_FETCH_MASK (1U << 4) +#define PT_PDPE_LEVEL 3 #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 @@ -139,10 +153,15 @@ module_param(oos_shadow, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) +#define CREATE_TRACE_POINTS +#include "mmutrace.h" + +#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) + #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) struct kvm_rmap_desc { - u64 *shadow_ptes[RMAP_EXT]; + u64 *sptes[RMAP_EXT]; struct kvm_rmap_desc *more; }; @@ -239,16 +258,25 @@ static int is_writeble_pte(unsigned long pte) return pte & PT_WRITABLE_MASK; } -static int is_dirty_pte(unsigned long pte) +static int is_dirty_gpte(unsigned long pte) { - return pte & shadow_dirty_mask; + return pte & PT_DIRTY_MASK; } -static int is_rmap_pte(u64 pte) +static int is_rmap_spte(u64 pte) { return is_shadow_present_pte(pte); } +static int is_last_spte(u64 pte, int level) +{ + if (level == PT_PAGE_TABLE_LEVEL) + return 1; + if (is_large_pte(pte)) + return 1; + return 0; +} + static pfn_t spte_to_pfn(u64 pte) { return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -261,7 +289,7 @@ static gfn_t pse36_gfn_delta(u32 gpte) return (gpte & PT32_DIR_PSE36_MASK) << shift; } -static void set_shadow_pte(u64 *sptep, u64 spte) +static void __set_spte(u64 *sptep, u64 spte) { #ifdef CONFIG_X86_64 set_64bit((unsigned long *)sptep, spte); @@ -380,37 +408,52 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) * Return the pointer to the largepage write count for a given * gfn, handling slots that are not large page aligned. */ -static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot) +static int *slot_largepage_idx(gfn_t gfn, + struct kvm_memory_slot *slot, + int level) { unsigned long idx; - idx = (gfn / KVM_PAGES_PER_HPAGE) - - (slot->base_gfn / KVM_PAGES_PER_HPAGE); - return &slot->lpage_info[idx].write_count; + idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - + (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); + return &slot->lpage_info[level - 2][idx].write_count; } static void account_shadowed(struct kvm *kvm, gfn_t gfn) { + struct kvm_memory_slot *slot; int *write_count; + int i; gfn = unalias_gfn(kvm, gfn); - write_count = slot_largepage_idx(gfn, - gfn_to_memslot_unaliased(kvm, gfn)); - *write_count += 1; + + slot = gfn_to_memslot_unaliased(kvm, gfn); + for (i = PT_DIRECTORY_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + write_count = slot_largepage_idx(gfn, slot, i); + *write_count += 1; + } } static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) { + struct kvm_memory_slot *slot; int *write_count; + int i; gfn = unalias_gfn(kvm, gfn); - write_count = slot_largepage_idx(gfn, - gfn_to_memslot_unaliased(kvm, gfn)); - *write_count -= 1; - WARN_ON(*write_count < 0); + for (i = PT_DIRECTORY_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + slot = gfn_to_memslot_unaliased(kvm, gfn); + write_count = slot_largepage_idx(gfn, slot, i); + *write_count -= 1; + WARN_ON(*write_count < 0); + } } -static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) +static int has_wrprotected_page(struct kvm *kvm, + gfn_t gfn, + int level) { struct kvm_memory_slot *slot; int *largepage_idx; @@ -418,47 +461,67 @@ static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) gfn = unalias_gfn(kvm, gfn); slot = gfn_to_memslot_unaliased(kvm, gfn); if (slot) { - largepage_idx = slot_largepage_idx(gfn, slot); + largepage_idx = slot_largepage_idx(gfn, slot, level); return *largepage_idx; } return 1; } -static int host_largepage_backed(struct kvm *kvm, gfn_t gfn) +static int host_mapping_level(struct kvm *kvm, gfn_t gfn) { + unsigned long page_size = PAGE_SIZE; struct vm_area_struct *vma; unsigned long addr; - int ret = 0; + int i, ret = 0; addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) - return ret; + return page_size; down_read(¤t->mm->mmap_sem); vma = find_vma(current->mm, addr); - if (vma && is_vm_hugetlb_page(vma)) - ret = 1; + if (!vma) + goto out; + + page_size = vma_kernel_pagesize(vma); + +out: up_read(¤t->mm->mmap_sem); + for (i = PT_PAGE_TABLE_LEVEL; + i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { + if (page_size >= KVM_HPAGE_SIZE(i)) + ret = i; + else + break; + } + return ret; } -static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) +static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) { struct kvm_memory_slot *slot; - - if (has_wrprotected_page(vcpu->kvm, large_gfn)) - return 0; - - if (!host_largepage_backed(vcpu->kvm, large_gfn)) - return 0; + int host_level; + int level = PT_PAGE_TABLE_LEVEL; slot = gfn_to_memslot(vcpu->kvm, large_gfn); if (slot && slot->dirty_bitmap) - return 0; + return PT_PAGE_TABLE_LEVEL; - return 1; + host_level = host_mapping_level(vcpu->kvm, large_gfn); + + if (host_level == PT_PAGE_TABLE_LEVEL) + return host_level; + + for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) { + + if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) + break; + } + + return level - 1; } /* @@ -466,19 +529,19 @@ static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) * Note: gfn must be unaliased before this function get called */ -static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) +static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) { struct kvm_memory_slot *slot; unsigned long idx; slot = gfn_to_memslot(kvm, gfn); - if (!lpage) + if (likely(level == PT_PAGE_TABLE_LEVEL)) return &slot->rmap[gfn - slot->base_gfn]; - idx = (gfn / KVM_PAGES_PER_HPAGE) - - (slot->base_gfn / KVM_PAGES_PER_HPAGE); + idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - + (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); - return &slot->lpage_info[idx].rmap_pde; + return &slot->lpage_info[level - 2][idx].rmap_pde; } /* @@ -494,42 +557,42 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) * the spte was not added. * */ -static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) +static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { struct kvm_mmu_page *sp; struct kvm_rmap_desc *desc; unsigned long *rmapp; int i, count = 0; - if (!is_rmap_pte(*spte)) + if (!is_rmap_spte(*spte)) return count; gfn = unalias_gfn(vcpu->kvm, gfn); sp = page_header(__pa(spte)); sp->gfns[spte - sp->spt] = gfn; - rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); if (!*rmapp) { rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); *rmapp = (unsigned long)spte; } else if (!(*rmapp & 1)) { rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); desc = mmu_alloc_rmap_desc(vcpu); - desc->shadow_ptes[0] = (u64 *)*rmapp; - desc->shadow_ptes[1] = spte; + desc->sptes[0] = (u64 *)*rmapp; + desc->sptes[1] = spte; *rmapp = (unsigned long)desc | 1; } else { rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) { + while (desc->sptes[RMAP_EXT-1] && desc->more) { desc = desc->more; count += RMAP_EXT; } - if (desc->shadow_ptes[RMAP_EXT-1]) { + if (desc->sptes[RMAP_EXT-1]) { desc->more = mmu_alloc_rmap_desc(vcpu); desc = desc->more; } - for (i = 0; desc->shadow_ptes[i]; ++i) + for (i = 0; desc->sptes[i]; ++i) ; - desc->shadow_ptes[i] = spte; + desc->sptes[i] = spte; } return count; } @@ -541,14 +604,14 @@ static void rmap_desc_remove_entry(unsigned long *rmapp, { int j; - for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) + for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j) ; - desc->shadow_ptes[i] = desc->shadow_ptes[j]; - desc->shadow_ptes[j] = NULL; + desc->sptes[i] = desc->sptes[j]; + desc->sptes[j] = NULL; if (j != 0) return; if (!prev_desc && !desc->more) - *rmapp = (unsigned long)desc->shadow_ptes[0]; + *rmapp = (unsigned long)desc->sptes[0]; else if (prev_desc) prev_desc->more = desc->more; @@ -566,17 +629,15 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) unsigned long *rmapp; int i; - if (!is_rmap_pte(*spte)) + if (!is_rmap_spte(*spte)) return; sp = page_header(__pa(spte)); pfn = spte_to_pfn(*spte); if (*spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); if (is_writeble_pte(*spte)) - kvm_release_pfn_dirty(pfn); - else - kvm_release_pfn_clean(pfn); - rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte)); + kvm_set_pfn_dirty(pfn); + rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); if (!*rmapp) { printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); BUG(); @@ -593,8 +654,8 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); prev_desc = NULL; while (desc) { - for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) - if (desc->shadow_ptes[i] == spte) { + for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) + if (desc->sptes[i] == spte) { rmap_desc_remove_entry(rmapp, desc, i, prev_desc); @@ -625,10 +686,10 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) prev_desc = NULL; prev_spte = NULL; while (desc) { - for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) { + for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { if (prev_spte == spte) - return desc->shadow_ptes[i]; - prev_spte = desc->shadow_ptes[i]; + return desc->sptes[i]; + prev_spte = desc->sptes[i]; } desc = desc->more; } @@ -639,10 +700,10 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) { unsigned long *rmapp; u64 *spte; - int write_protected = 0; + int i, write_protected = 0; gfn = unalias_gfn(kvm, gfn); - rmapp = gfn_to_rmap(kvm, gfn, 0); + rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); spte = rmap_next(kvm, rmapp, NULL); while (spte) { @@ -650,7 +711,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); if (is_writeble_pte(*spte)) { - set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); + __set_spte(spte, *spte & ~PT_WRITABLE_MASK); write_protected = 1; } spte = rmap_next(kvm, rmapp, spte); @@ -664,27 +725,31 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) } /* check for huge page mappings */ - rmapp = gfn_to_rmap(kvm, gfn, 1); - spte = rmap_next(kvm, rmapp, NULL); - while (spte) { - BUG_ON(!spte); - BUG_ON(!(*spte & PT_PRESENT_MASK)); - BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); - pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); - if (is_writeble_pte(*spte)) { - rmap_remove(kvm, spte); - --kvm->stat.lpages; - set_shadow_pte(spte, shadow_trap_nonpresent_pte); - spte = NULL; - write_protected = 1; + for (i = PT_DIRECTORY_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + rmapp = gfn_to_rmap(kvm, gfn, i); + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + BUG_ON(!spte); + BUG_ON(!(*spte & PT_PRESENT_MASK)); + BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); + pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); + if (is_writeble_pte(*spte)) { + rmap_remove(kvm, spte); + --kvm->stat.lpages; + __set_spte(spte, shadow_trap_nonpresent_pte); + spte = NULL; + write_protected = 1; + } + spte = rmap_next(kvm, rmapp, spte); } - spte = rmap_next(kvm, rmapp, spte); } return write_protected; } -static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long data) { u64 *spte; int need_tlb_flush = 0; @@ -693,16 +758,55 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); rmap_remove(kvm, spte); - set_shadow_pte(spte, shadow_trap_nonpresent_pte); + __set_spte(spte, shadow_trap_nonpresent_pte); need_tlb_flush = 1; } return need_tlb_flush; } +static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long data) +{ + int need_flush = 0; + u64 *spte, new_spte; + pte_t *ptep = (pte_t *)data; + pfn_t new_pfn; + + WARN_ON(pte_huge(*ptep)); + new_pfn = pte_pfn(*ptep); + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + BUG_ON(!is_shadow_present_pte(*spte)); + rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); + need_flush = 1; + if (pte_write(*ptep)) { + rmap_remove(kvm, spte); + __set_spte(spte, shadow_trap_nonpresent_pte); + spte = rmap_next(kvm, rmapp, NULL); + } else { + new_spte = *spte &~ (PT64_BASE_ADDR_MASK); + new_spte |= (u64)new_pfn << PAGE_SHIFT; + + new_spte &= ~PT_WRITABLE_MASK; + new_spte &= ~SPTE_HOST_WRITEABLE; + if (is_writeble_pte(*spte)) + kvm_set_pfn_dirty(spte_to_pfn(*spte)); + __set_spte(spte, new_spte); + spte = rmap_next(kvm, rmapp, spte); + } + } + if (need_flush) + kvm_flush_remote_tlbs(kvm); + + return 0; +} + static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - int (*handler)(struct kvm *kvm, unsigned long *rmapp)) + unsigned long data, + int (*handler)(struct kvm *kvm, unsigned long *rmapp, + unsigned long data)) { - int i; + int i, j; int retval = 0; /* @@ -721,11 +825,17 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, end = start + (memslot->npages << PAGE_SHIFT); if (hva >= start && hva < end) { gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; - retval |= handler(kvm, &memslot->rmap[gfn_offset]); - retval |= handler(kvm, - &memslot->lpage_info[ - gfn_offset / - KVM_PAGES_PER_HPAGE].rmap_pde); + + retval |= handler(kvm, &memslot->rmap[gfn_offset], + data); + + for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { + int idx = gfn_offset; + idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); + retval |= handler(kvm, + &memslot->lpage_info[j][idx].rmap_pde, + data); + } } } @@ -734,10 +844,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) { - return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); + return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); } -static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); +} + +static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long data) { u64 *spte; int young = 0; @@ -763,20 +879,23 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) #define RMAP_RECYCLE_THRESHOLD 1000 -static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage) +static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { unsigned long *rmapp; + struct kvm_mmu_page *sp; + + sp = page_header(__pa(spte)); gfn = unalias_gfn(vcpu->kvm, gfn); - rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); - kvm_unmap_rmapp(vcpu->kvm, rmapp); + kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); kvm_flush_remote_tlbs(vcpu->kvm); } int kvm_age_hva(struct kvm *kvm, unsigned long hva) { - return kvm_handle_hva(kvm, hva, kvm_age_rmapp); + return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); } #ifdef MMU_DEBUG @@ -1109,6 +1228,7 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return 1; } + trace_kvm_mmu_sync_page(sp); if (rmap_write_protect(vcpu->kvm, sp->gfn)) kvm_flush_remote_tlbs(vcpu->kvm); kvm_unlink_unsync_page(vcpu->kvm, sp); @@ -1231,8 +1351,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - pgprintk("%s: looking gfn %lx role %x\n", __func__, - gfn, role.word); index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) @@ -1249,14 +1367,13 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); kvm_mmu_mark_parents_unsync(vcpu, sp); } - pgprintk("%s: found\n", __func__); + trace_kvm_mmu_get_page(sp, false); return sp; } ++vcpu->kvm->stat.mmu_cache_miss; sp = kvm_mmu_alloc_page(vcpu, parent_pte); if (!sp) return sp; - pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word); sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, bucket); @@ -1269,6 +1386,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, vcpu->arch.mmu.prefetch_page(vcpu, sp); else nonpaging_prefetch_page(vcpu, sp); + trace_kvm_mmu_get_page(sp, true); return sp; } @@ -1292,6 +1410,11 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) { if (iterator->level < PT_PAGE_TABLE_LEVEL) return false; + + if (iterator->level == PT_PAGE_TABLE_LEVEL) + if (is_large_pte(*iterator->sptep)) + return false; + iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; return true; @@ -1312,25 +1435,17 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, pt = sp->spt; - if (sp->role.level == PT_PAGE_TABLE_LEVEL) { - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (is_shadow_present_pte(pt[i])) - rmap_remove(kvm, &pt[i]); - pt[i] = shadow_trap_nonpresent_pte; - } - return; - } - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { ent = pt[i]; if (is_shadow_present_pte(ent)) { - if (!is_large_pte(ent)) { + if (!is_last_spte(ent, sp->role.level)) { ent &= PT64_BASE_ADDR_MASK; mmu_page_remove_parent_pte(page_header(ent), &pt[i]); } else { - --kvm->stat.lpages; + if (is_large_pte(ent)) + --kvm->stat.lpages; rmap_remove(kvm, &pt[i]); } } @@ -1346,10 +1461,10 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) { int i; + struct kvm_vcpu *vcpu; - for (i = 0; i < KVM_MAX_VCPUS; ++i) - if (kvm->vcpus[i]) - kvm->vcpus[i]->arch.last_pte_updated = NULL; + kvm_for_each_vcpu(i, vcpu, kvm) + vcpu->arch.last_pte_updated = NULL; } static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -1368,7 +1483,7 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) } BUG_ON(!parent_pte); kvm_mmu_put_page(sp, parent_pte); - set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); + __set_spte(parent_pte, shadow_trap_nonpresent_pte); } } @@ -1400,6 +1515,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm, static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) { int ret; + + trace_kvm_mmu_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; ret = mmu_zap_unsync_children(kvm, sp); kvm_mmu_page_unlink_children(kvm, sp); @@ -1516,7 +1633,7 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp) for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { if (pt[i] == shadow_notrap_nonpresent_pte) - set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte); + __set_spte(&pt[i], shadow_trap_nonpresent_pte); } } @@ -1646,6 +1763,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) struct kvm_mmu_page *s; struct hlist_node *node, *n; + trace_kvm_mmu_unsync_page(sp); index = kvm_page_table_hashfn(sp->gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; /* don't unsync if pagetable is shadowed with multiple roles */ @@ -1682,11 +1800,11 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, return 0; } -static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, +static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int user_fault, - int write_fault, int dirty, int largepage, + int write_fault, int dirty, int level, gfn_t gfn, pfn_t pfn, bool speculative, - bool can_unsync) + bool can_unsync, bool reset_host_protection) { u64 spte; int ret = 0; @@ -1707,18 +1825,22 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, spte |= shadow_nx_mask; if (pte_access & ACC_USER_MASK) spte |= shadow_user_mask; - if (largepage) + if (level > PT_PAGE_TABLE_LEVEL) spte |= PT_PAGE_SIZE_MASK; if (tdp_enabled) spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, kvm_is_mmio_pfn(pfn)); + if (reset_host_protection) + spte |= SPTE_HOST_WRITEABLE; + spte |= (u64)pfn << PAGE_SHIFT; if ((pte_access & ACC_WRITE_MASK) || (write_fault && !is_write_protection(vcpu) && !user_fault)) { - if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) { + if (level > PT_PAGE_TABLE_LEVEL && + has_wrprotected_page(vcpu->kvm, gfn, level)) { ret = 1; spte = shadow_trap_nonpresent_pte; goto set_pte; @@ -1732,7 +1854,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, * is responsibility of mmu_get_page / kvm_sync_page. * Same reasoning can be applied to dirty page accounting. */ - if (!can_unsync && is_writeble_pte(*shadow_pte)) + if (!can_unsync && is_writeble_pte(*sptep)) goto set_pte; if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { @@ -1749,65 +1871,68 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, mark_page_dirty(vcpu->kvm, gfn); set_pte: - set_shadow_pte(shadow_pte, spte); + __set_spte(sptep, spte); return ret; } -static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, +static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pt_access, unsigned pte_access, int user_fault, int write_fault, int dirty, - int *ptwrite, int largepage, gfn_t gfn, - pfn_t pfn, bool speculative) + int *ptwrite, int level, gfn_t gfn, + pfn_t pfn, bool speculative, + bool reset_host_protection) { int was_rmapped = 0; - int was_writeble = is_writeble_pte(*shadow_pte); + int was_writeble = is_writeble_pte(*sptep); int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" " user_fault %d gfn %lx\n", - __func__, *shadow_pte, pt_access, + __func__, *sptep, pt_access, write_fault, user_fault, gfn); - if (is_rmap_pte(*shadow_pte)) { + if (is_rmap_spte(*sptep)) { /* * If we overwrite a PTE page pointer with a 2MB PMD, unlink * the parent of the now unreachable PTE. */ - if (largepage && !is_large_pte(*shadow_pte)) { + if (level > PT_PAGE_TABLE_LEVEL && + !is_large_pte(*sptep)) { struct kvm_mmu_page *child; - u64 pte = *shadow_pte; + u64 pte = *sptep; child = page_header(pte & PT64_BASE_ADDR_MASK); - mmu_page_remove_parent_pte(child, shadow_pte); - } else if (pfn != spte_to_pfn(*shadow_pte)) { + mmu_page_remove_parent_pte(child, sptep); + } else if (pfn != spte_to_pfn(*sptep)) { pgprintk("hfn old %lx new %lx\n", - spte_to_pfn(*shadow_pte), pfn); - rmap_remove(vcpu->kvm, shadow_pte); + spte_to_pfn(*sptep), pfn); + rmap_remove(vcpu->kvm, sptep); } else was_rmapped = 1; } - if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, - dirty, largepage, gfn, pfn, speculative, true)) { + + if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, + dirty, level, gfn, pfn, speculative, true, + reset_host_protection)) { if (write_fault) *ptwrite = 1; kvm_x86_ops->tlb_flush(vcpu); } - pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte); + pgprintk("%s: setting spte %llx\n", __func__, *sptep); pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", - is_large_pte(*shadow_pte)? "2MB" : "4kB", - is_present_pte(*shadow_pte)?"RW":"R", gfn, - *shadow_pte, shadow_pte); - if (!was_rmapped && is_large_pte(*shadow_pte)) + is_large_pte(*sptep)? "2MB" : "4kB", + *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, + *sptep, sptep); + if (!was_rmapped && is_large_pte(*sptep)) ++vcpu->kvm->stat.lpages; - page_header_update_slot(vcpu->kvm, shadow_pte, gfn); + page_header_update_slot(vcpu->kvm, sptep, gfn); if (!was_rmapped) { - rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage); - if (!is_rmap_pte(*shadow_pte)) - kvm_release_pfn_clean(pfn); + rmap_count = rmap_add(vcpu, sptep, gfn); + kvm_release_pfn_clean(pfn); if (rmap_count > RMAP_RECYCLE_THRESHOLD) - rmap_recycle(vcpu, gfn, largepage); + rmap_recycle(vcpu, sptep, gfn); } else { if (was_writeble) kvm_release_pfn_dirty(pfn); @@ -1815,7 +1940,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, kvm_release_pfn_clean(pfn); } if (speculative) { - vcpu->arch.last_pte_updated = shadow_pte; + vcpu->arch.last_pte_updated = sptep; vcpu->arch.last_pte_gfn = gfn; } } @@ -1825,7 +1950,7 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) } static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int largepage, gfn_t gfn, pfn_t pfn) + int level, gfn_t gfn, pfn_t pfn) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; @@ -1833,11 +1958,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, gfn_t pseudo_gfn; for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { - if (iterator.level == PT_PAGE_TABLE_LEVEL - || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) { + if (iterator.level == level) { mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 0, write, 1, &pt_write, - largepage, gfn, pfn, false); + level, gfn, pfn, false, true); ++vcpu->stat.pf_fixed; break; } @@ -1853,10 +1977,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, return -ENOMEM; } - set_shadow_pte(iterator.sptep, - __pa(sp->spt) - | PT_PRESENT_MASK | PT_WRITABLE_MASK - | shadow_user_mask | shadow_x_mask); + __set_spte(iterator.sptep, + __pa(sp->spt) + | PT_PRESENT_MASK | PT_WRITABLE_MASK + | shadow_user_mask | shadow_x_mask); } } return pt_write; @@ -1865,14 +1989,20 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) { int r; - int largepage = 0; + int level; pfn_t pfn; unsigned long mmu_seq; - if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { - gfn &= ~(KVM_PAGES_PER_HPAGE-1); - largepage = 1; - } + level = mapping_level(vcpu, gfn); + + /* + * This path builds a PAE pagetable - so we can map 2mb pages at + * maximum. Therefore check if the level is larger than that. + */ + if (level > PT_DIRECTORY_LEVEL) + level = PT_DIRECTORY_LEVEL; + + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -1888,7 +2018,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); - r = __direct_map(vcpu, v, write, largepage, gfn, pfn); + r = __direct_map(vcpu, v, write, level, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); @@ -1954,6 +2084,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) gfn_t root_gfn; struct kvm_mmu_page *sp; int direct = 0; + u64 pdptr; root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; @@ -1981,11 +2112,12 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { - if (!is_present_pte(vcpu->arch.pdptrs[i])) { + pdptr = kvm_pdptr_read(vcpu, i); + if (!is_present_gpte(pdptr)) { vcpu->arch.mmu.pae_root[i] = 0; continue; } - root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; + root_gfn = pdptr >> PAGE_SHIFT; } else if (vcpu->arch.mmu.root_level == 0) root_gfn = 0; if (mmu_check_root(vcpu, root_gfn)) @@ -2062,7 +2194,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, { pfn_t pfn; int r; - int largepage = 0; + int level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; @@ -2073,10 +2205,10 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, if (r) return r; - if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { - gfn &= ~(KVM_PAGES_PER_HPAGE-1); - largepage = 1; - } + level = mapping_level(vcpu, gfn); + + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); @@ -2089,7 +2221,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, goto out_unlock; kvm_mmu_free_some_pages(vcpu); r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, - largepage, gfn, pfn); + level, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); return r; @@ -2206,7 +2338,9 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) context->rsvd_bits_mask[0][0] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; - context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2]; + context->rsvd_bits_mask[1][2] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | + rsvd_bits(13, 29); context->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20); /* large page */ @@ -2357,8 +2491,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->kvm->mmu_lock); if (r) goto out; + /* set_cr3() should ensure TLB has been flushed */ kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); - kvm_mmu_flush_tlb(vcpu); out: return r; } @@ -2378,15 +2512,14 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, pte = *spte; if (is_shadow_present_pte(pte)) { - if (sp->role.level == PT_PAGE_TABLE_LEVEL || - is_large_pte(pte)) + if (is_last_spte(pte, sp->role.level)) rmap_remove(vcpu->kvm, spte); else { child = page_header(pte & PT64_BASE_ADDR_MASK); mmu_page_remove_parent_pte(child, spte); } } - set_shadow_pte(spte, shadow_trap_nonpresent_pte); + __set_spte(spte, shadow_trap_nonpresent_pte); if (is_large_pte(pte)) --vcpu->kvm->stat.lpages; } @@ -2397,11 +2530,8 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, const void *new) { if (sp->role.level != PT_PAGE_TABLE_LEVEL) { - if (!vcpu->arch.update_pte.largepage || - sp->role.glevels == PT32_ROOT_LEVEL) { - ++vcpu->kvm->stat.mmu_pde_zapped; - return; - } + ++vcpu->kvm->stat.mmu_pde_zapped; + return; } ++vcpu->kvm->stat.mmu_pte_updated; @@ -2447,8 +2577,6 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, u64 gpte = 0; pfn_t pfn; - vcpu->arch.update_pte.largepage = 0; - if (bytes != 4 && bytes != 8) return; @@ -2472,14 +2600,10 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if ((bytes == 4) && (gpa % 4 == 0)) memcpy((void *)&gpte, new, 4); } - if (!is_present_pte(gpte)) + if (!is_present_gpte(gpte)) return; gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; - if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { - gfn &= ~(KVM_PAGES_PER_HPAGE-1); - vcpu->arch.update_pte.largepage = 1; - } vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); @@ -2622,6 +2746,9 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) gpa_t gpa; int r; + if (tdp_enabled) + return 0; + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); spin_lock(&vcpu->kvm->mmu_lock); @@ -2633,7 +2760,8 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) { + while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES && + !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *sp; sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, @@ -2670,8 +2798,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) ++vcpu->stat.mmio_exits; return 0; case EMULATE_FAIL: - kvm_report_emulation_failure(vcpu, "pagetable"); - return 1; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + return 0; default: BUG(); } @@ -2712,12 +2841,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) ASSERT(vcpu); - if (vcpu->kvm->arch.n_requested_mmu_pages) - vcpu->kvm->arch.n_free_mmu_pages = - vcpu->kvm->arch.n_requested_mmu_pages; - else - vcpu->kvm->arch.n_free_mmu_pages = - vcpu->kvm->arch.n_alloc_mmu_pages; /* * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. * Therefore we need to allocate shadow page tables in the first @@ -3029,6 +3152,24 @@ out: return r; } +int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) +{ + struct kvm_shadow_walk_iterator iterator; + int nr_sptes = 0; + + spin_lock(&vcpu->kvm->mmu_lock); + for_each_shadow_entry(vcpu, addr, iterator) { + sptes[iterator.level-1] = *iterator.sptep; + nr_sptes++; + if (!is_shadow_present_pte(*iterator.sptep)) + break; + } + spin_unlock(&vcpu->kvm->mmu_lock); + + return nr_sptes; +} +EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); + #ifdef AUDIT static const char *audit_msg; @@ -3041,6 +3182,54 @@ static gva_t canonicalize(gva_t gva) return gva; } + +typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp, + u64 *sptep); + +static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, + inspect_spte_fn fn) +{ + int i; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + u64 ent = sp->spt[i]; + + if (is_shadow_present_pte(ent)) { + if (!is_last_spte(ent, sp->role.level)) { + struct kvm_mmu_page *child; + child = page_header(ent & PT64_BASE_ADDR_MASK); + __mmu_spte_walk(kvm, child, fn); + } else + fn(kvm, sp, &sp->spt[i]); + } + } +} + +static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) +{ + int i; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + sp = page_header(root); + __mmu_spte_walk(vcpu->kvm, sp, fn); + return; + } + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root && VALID_PAGE(root)) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + __mmu_spte_walk(vcpu->kvm, sp, fn); + } + } + return; +} + static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, gva_t va, int level) { @@ -3055,20 +3244,19 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, continue; va = canonicalize(va); - if (level > 1) { - if (ent == shadow_notrap_nonpresent_pte) - printk(KERN_ERR "audit: (%s) nontrapping pte" - " in nonleaf level: levels %d gva %lx" - " level %d pte %llx\n", audit_msg, - vcpu->arch.mmu.root_level, va, level, ent); - else - audit_mappings_page(vcpu, ent, va, level - 1); - } else { + if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) + audit_mappings_page(vcpu, ent, va, level - 1); + else { gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); gfn_t gfn = gpa >> PAGE_SHIFT; pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + continue; + } + if (is_shadow_present_pte(ent) && (ent & PT64_BASE_ADDR_MASK) != hpa) printk(KERN_ERR "xx audit error: (%s) levels %d" @@ -3122,7 +3310,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu) d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); while (d) { for (k = 0; k < RMAP_EXT; ++k) - if (d->shadow_ptes[k]) + if (d->sptes[k]) ++nmaps; else break; @@ -3133,9 +3321,48 @@ static int count_rmaps(struct kvm_vcpu *vcpu) return nmaps; } -static int count_writable_mappings(struct kvm_vcpu *vcpu) +void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep) +{ + unsigned long *rmapp; + struct kvm_mmu_page *rev_sp; + gfn_t gfn; + + if (*sptep & PT_WRITABLE_MASK) { + rev_sp = page_header(__pa(sptep)); + gfn = rev_sp->gfns[sptep - rev_sp->spt]; + + if (!gfn_to_memslot(kvm, gfn)) { + if (!printk_ratelimit()) + return; + printk(KERN_ERR "%s: no memslot for gfn %ld\n", + audit_msg, gfn); + printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", + audit_msg, sptep - rev_sp->spt, + rev_sp->gfn); + dump_stack(); + return; + } + + rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], + is_large_pte(*sptep)); + if (!*rmapp) { + if (!printk_ratelimit()) + return; + printk(KERN_ERR "%s: no rmap for writable spte %llx\n", + audit_msg, *sptep); + dump_stack(); + } + } + +} + +void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu) +{ + mmu_spte_walk(vcpu, inspect_spte_has_rmap); +} + +static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) { - int nmaps = 0; struct kvm_mmu_page *sp; int i; @@ -3152,20 +3379,16 @@ static int count_writable_mappings(struct kvm_vcpu *vcpu) continue; if (!(ent & PT_WRITABLE_MASK)) continue; - ++nmaps; + inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]); } } - return nmaps; + return; } static void audit_rmap(struct kvm_vcpu *vcpu) { - int n_rmap = count_rmaps(vcpu); - int n_actual = count_writable_mappings(vcpu); - - if (n_rmap != n_actual) - printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", - __func__, audit_msg, n_rmap, n_actual); + check_writable_mappings_rmap(vcpu); + count_rmaps(vcpu); } static void audit_write_protection(struct kvm_vcpu *vcpu) @@ -3173,20 +3396,28 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) struct kvm_mmu_page *sp; struct kvm_memory_slot *slot; unsigned long *rmapp; + u64 *spte; gfn_t gfn; list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { if (sp->role.direct) continue; + if (sp->unsync) + continue; gfn = unalias_gfn(vcpu->kvm, sp->gfn); slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); rmapp = &slot->rmap[gfn - slot->base_gfn]; - if (*rmapp) - printk(KERN_ERR "%s: (%s) shadow page has writable" - " mappings: gfn %lx role %x\n", + + spte = rmap_next(vcpu->kvm, rmapp, NULL); + while (spte) { + if (*spte & PT_WRITABLE_MASK) + printk(KERN_ERR "%s: (%s) shadow page has " + "writable mappings: gfn %lx role %x\n", __func__, audit_msg, sp->gfn, sp->role.word); + spte = rmap_next(vcpu->kvm, rmapp, spte); + } } } @@ -3198,7 +3429,9 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) audit_msg = msg; audit_rmap(vcpu); audit_write_protection(vcpu); - audit_mappings(vcpu); + if (strcmp("pre pte write", audit_msg) != 0) + audit_mappings(vcpu); + audit_writable_sptes_have_rmaps(vcpu); dbg = olddbg; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 3494a2fb136..61a1b3884b4 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -37,6 +37,8 @@ #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 +int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); + static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) @@ -75,7 +77,7 @@ static inline int is_paging(struct kvm_vcpu *vcpu) return vcpu->arch.cr0 & X86_CR0_PG; } -static inline int is_present_pte(unsigned long pte) +static inline int is_present_gpte(unsigned long pte) { return pte & PT_PRESENT_MASK; } diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h new file mode 100644 index 00000000000..3e4a5c6ca2a --- /dev/null +++ b/arch/x86/kvm/mmutrace.h @@ -0,0 +1,220 @@ +#if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVMMMU_H + +#include <linux/tracepoint.h> +#include <linux/ftrace_event.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvmmmu +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE mmutrace + +#define KVM_MMU_PAGE_FIELDS \ + __field(__u64, gfn) \ + __field(__u32, role) \ + __field(__u32, root_count) \ + __field(__u32, unsync) + +#define KVM_MMU_PAGE_ASSIGN(sp) \ + __entry->gfn = sp->gfn; \ + __entry->role = sp->role.word; \ + __entry->root_count = sp->root_count; \ + __entry->unsync = sp->unsync; + +#define KVM_MMU_PAGE_PRINTK() ({ \ + const char *ret = p->buffer + p->len; \ + static const char *access_str[] = { \ + "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \ + }; \ + union kvm_mmu_page_role role; \ + \ + role.word = __entry->role; \ + \ + trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge" \ + " %snxe root %u %s%c", \ + __entry->gfn, role.level, role.glevels, \ + role.quadrant, \ + role.direct ? " direct" : "", \ + access_str[role.access], \ + role.invalid ? " invalid" : "", \ + role.cr4_pge ? "" : "!", \ + role.nxe ? "" : "!", \ + __entry->root_count, \ + __entry->unsync ? "unsync" : "sync", 0); \ + ret; \ + }) + +#define kvm_mmu_trace_pferr_flags \ + { PFERR_PRESENT_MASK, "P" }, \ + { PFERR_WRITE_MASK, "W" }, \ + { PFERR_USER_MASK, "U" }, \ + { PFERR_RSVD_MASK, "RSVD" }, \ + { PFERR_FETCH_MASK, "F" } + +/* + * A pagetable walk has started + */ +TRACE_EVENT( + kvm_mmu_pagetable_walk, + TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault), + TP_ARGS(addr, write_fault, user_fault, fetch_fault), + + TP_STRUCT__entry( + __field(__u64, addr) + __field(__u32, pferr) + ), + + TP_fast_assign( + __entry->addr = addr; + __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2) + | (!!fetch_fault << 4); + ), + + TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, + __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags)) +); + + +/* We just walked a paging element */ +TRACE_EVENT( + kvm_mmu_paging_element, + TP_PROTO(u64 pte, int level), + TP_ARGS(pte, level), + + TP_STRUCT__entry( + __field(__u64, pte) + __field(__u32, level) + ), + + TP_fast_assign( + __entry->pte = pte; + __entry->level = level; + ), + + TP_printk("pte %llx level %u", __entry->pte, __entry->level) +); + +/* We set a pte accessed bit */ +TRACE_EVENT( + kvm_mmu_set_accessed_bit, + TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), + TP_ARGS(table_gfn, index, size), + + TP_STRUCT__entry( + __field(__u64, gpa) + ), + + TP_fast_assign( + __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) + + index * size; + ), + + TP_printk("gpa %llx", __entry->gpa) +); + +/* We set a pte dirty bit */ +TRACE_EVENT( + kvm_mmu_set_dirty_bit, + TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), + TP_ARGS(table_gfn, index, size), + + TP_STRUCT__entry( + __field(__u64, gpa) + ), + + TP_fast_assign( + __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) + + index * size; + ), + + TP_printk("gpa %llx", __entry->gpa) +); + +TRACE_EVENT( + kvm_mmu_walker_error, + TP_PROTO(u32 pferr), + TP_ARGS(pferr), + + TP_STRUCT__entry( + __field(__u32, pferr) + ), + + TP_fast_assign( + __entry->pferr = pferr; + ), + + TP_printk("pferr %x %s", __entry->pferr, + __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags)) +); + +TRACE_EVENT( + kvm_mmu_get_page, + TP_PROTO(struct kvm_mmu_page *sp, bool created), + TP_ARGS(sp, created), + + TP_STRUCT__entry( + KVM_MMU_PAGE_FIELDS + __field(bool, created) + ), + + TP_fast_assign( + KVM_MMU_PAGE_ASSIGN(sp) + __entry->created = created; + ), + + TP_printk("%s %s", KVM_MMU_PAGE_PRINTK(), + __entry->created ? "new" : "existing") +); + +TRACE_EVENT( + kvm_mmu_sync_page, + TP_PROTO(struct kvm_mmu_page *sp), + TP_ARGS(sp), + + TP_STRUCT__entry( + KVM_MMU_PAGE_FIELDS + ), + + TP_fast_assign( + KVM_MMU_PAGE_ASSIGN(sp) + ), + + TP_printk("%s", KVM_MMU_PAGE_PRINTK()) +); + +TRACE_EVENT( + kvm_mmu_unsync_page, + TP_PROTO(struct kvm_mmu_page *sp), + TP_ARGS(sp), + + TP_STRUCT__entry( + KVM_MMU_PAGE_FIELDS + ), + + TP_fast_assign( + KVM_MMU_PAGE_ASSIGN(sp) + ), + + TP_printk("%s", KVM_MMU_PAGE_PRINTK()) +); + +TRACE_EVENT( + kvm_mmu_zap_page, + TP_PROTO(struct kvm_mmu_page *sp), + TP_ARGS(sp), + + TP_STRUCT__entry( + KVM_MMU_PAGE_FIELDS + ), + + TP_fast_assign( + KVM_MMU_PAGE_ASSIGN(sp) + ), + + TP_printk("%s", KVM_MMU_PAGE_PRINTK()) +); + +#endif /* _TRACE_KVMMMU_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 67785f63539..72558f8ff3f 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -27,7 +27,8 @@ #define guest_walker guest_walker64 #define FNAME(name) paging##64_##name #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK - #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) #define PT_LEVEL_BITS PT64_LEVEL_BITS @@ -43,7 +44,8 @@ #define guest_walker guest_walker32 #define FNAME(name) paging##32_##name #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK - #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK + #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) + #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT32_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) #define PT_LEVEL_BITS PT32_LEVEL_BITS @@ -53,8 +55,8 @@ #error Invalid PTTYPE value #endif -#define gpte_to_gfn FNAME(gpte_to_gfn) -#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde) +#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl) +#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL) /* * The guest_walker structure emulates the behavior of the hardware page @@ -71,14 +73,9 @@ struct guest_walker { u32 error_code; }; -static gfn_t gpte_to_gfn(pt_element_t gpte) +static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) { - return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; -} - -static gfn_t gpte_to_gfn_pde(pt_element_t gpte) -{ - return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; + return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; } static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, @@ -125,14 +122,16 @@ static int FNAME(walk_addr)(struct guest_walker *walker, gpa_t pte_gpa; int rsvd_fault = 0; - pgprintk("%s: addr %lx\n", __func__, addr); + trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, + fetch_fault); walk: walker->level = vcpu->arch.mmu.root_level; pte = vcpu->arch.cr3; #if PTTYPE == 64 if (!is_long_mode(vcpu)) { - pte = vcpu->arch.pdptrs[(addr >> 30) & 3]; - if (!is_present_pte(pte)) + pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); + trace_kvm_mmu_paging_element(pte, walker->level); + if (!is_present_gpte(pte)) goto not_present; --walker->level; } @@ -150,12 +149,11 @@ walk: pte_gpa += index * sizeof(pt_element_t); walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - pgprintk("%s: table_gfn[%d] %lx\n", __func__, - walker->level - 1, table_gfn); kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); + trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_pte(pte)) + if (!is_present_gpte(pte)) goto not_present; rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); @@ -175,6 +173,8 @@ walk: #endif if (!(pte & PT_ACCESSED_MASK)) { + trace_kvm_mmu_set_accessed_bit(table_gfn, index, + sizeof(pte)); mark_page_dirty(vcpu->kvm, table_gfn); if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, pte|PT_ACCESSED_MASK)) @@ -186,18 +186,24 @@ walk: walker->ptes[walker->level - 1] = pte; - if (walker->level == PT_PAGE_TABLE_LEVEL) { - walker->gfn = gpte_to_gfn(pte); - break; - } - - if (walker->level == PT_DIRECTORY_LEVEL - && (pte & PT_PAGE_SIZE_MASK) - && (PTTYPE == 64 || is_pse(vcpu))) { - walker->gfn = gpte_to_gfn_pde(pte); - walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); - if (PTTYPE == 32 && is_cpuid_PSE36()) + if ((walker->level == PT_PAGE_TABLE_LEVEL) || + ((walker->level == PT_DIRECTORY_LEVEL) && + (pte & PT_PAGE_SIZE_MASK) && + (PTTYPE == 64 || is_pse(vcpu))) || + ((walker->level == PT_PDPE_LEVEL) && + (pte & PT_PAGE_SIZE_MASK) && + is_long_mode(vcpu))) { + int lvl = walker->level; + + walker->gfn = gpte_to_gfn_lvl(pte, lvl); + walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) + >> PAGE_SHIFT; + + if (PTTYPE == 32 && + walker->level == PT_DIRECTORY_LEVEL && + is_cpuid_PSE36()) walker->gfn += pse36_gfn_delta(pte); + break; } @@ -205,9 +211,10 @@ walk: --walker->level; } - if (write_fault && !is_dirty_pte(pte)) { + if (write_fault && !is_dirty_gpte(pte)) { bool ret; + trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); mark_page_dirty(vcpu->kvm, table_gfn); ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, pte|PT_DIRTY_MASK); @@ -239,6 +246,7 @@ err: walker->error_code |= PFERR_FETCH_MASK; if (rsvd_fault) walker->error_code |= PFERR_RSVD_MASK; + trace_kvm_mmu_walker_error(walker->error_code); return 0; } @@ -248,12 +256,11 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, pt_element_t gpte; unsigned pte_access; pfn_t pfn; - int largepage = vcpu->arch.update_pte.largepage; gpte = *(const pt_element_t *)pte; if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { - if (!is_present_pte(gpte)) - set_shadow_pte(spte, shadow_notrap_nonpresent_pte); + if (!is_present_gpte(gpte)) + __set_spte(spte, shadow_notrap_nonpresent_pte); return; } pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); @@ -266,9 +273,13 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq)) return; kvm_get_pfn(pfn); + /* + * we call mmu_set_spte() with reset_host_protection = true beacuse that + * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). + */ mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, - gpte & PT_DIRTY_MASK, NULL, largepage, - gpte_to_gfn(gpte), pfn, true); + gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, + gpte_to_gfn(gpte), pfn, true, true); } /* @@ -276,7 +287,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, */ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, - int user_fault, int write_fault, int largepage, + int user_fault, int write_fault, int hlevel, int *ptwrite, pfn_t pfn) { unsigned access = gw->pt_access; @@ -289,20 +300,19 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, pt_element_t curr_pte; struct kvm_shadow_walk_iterator iterator; - if (!is_present_pte(gw->ptes[gw->level - 1])) + if (!is_present_gpte(gw->ptes[gw->level - 1])) return NULL; for_each_shadow_entry(vcpu, addr, iterator) { level = iterator.level; sptep = iterator.sptep; - if (level == PT_PAGE_TABLE_LEVEL - || (largepage && level == PT_DIRECTORY_LEVEL)) { + if (iterator.level == hlevel) { mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, user_fault, write_fault, gw->ptes[gw->level-1] & PT_DIRTY_MASK, - ptwrite, largepage, - gw->gfn, pfn, false); + ptwrite, level, + gw->gfn, pfn, false, true); break; } @@ -311,16 +321,19 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (is_large_pte(*sptep)) { rmap_remove(vcpu->kvm, sptep); - set_shadow_pte(sptep, shadow_trap_nonpresent_pte); + __set_spte(sptep, shadow_trap_nonpresent_pte); kvm_flush_remote_tlbs(vcpu->kvm); } - if (level == PT_DIRECTORY_LEVEL - && gw->level == PT_DIRECTORY_LEVEL) { + if (level <= gw->level) { + int delta = level - gw->level + 1; direct = 1; - if (!is_dirty_pte(gw->ptes[level - 1])) + if (!is_dirty_gpte(gw->ptes[level - delta])) access &= ~ACC_WRITE_MASK; - table_gfn = gpte_to_gfn(gw->ptes[level - 1]); + table_gfn = gpte_to_gfn(gw->ptes[level - delta]); + /* advance table_gfn when emulating 1gb pages with 4k */ + if (delta == 0) + table_gfn += PT_INDEX(addr, level); } else { direct = 0; table_gfn = gw->table_gfn[level - 2]; @@ -369,11 +382,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, int user_fault = error_code & PFERR_USER_MASK; int fetch_fault = error_code & PFERR_FETCH_MASK; struct guest_walker walker; - u64 *shadow_pte; + u64 *sptep; int write_pt = 0; int r; pfn_t pfn; - int largepage = 0; + int level = PT_PAGE_TABLE_LEVEL; unsigned long mmu_seq; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -399,14 +412,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, return 0; } - if (walker.level == PT_DIRECTORY_LEVEL) { - gfn_t large_gfn; - large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); - if (is_largepage_backed(vcpu, large_gfn)) { - walker.gfn = large_gfn; - largepage = 1; - } + if (walker.level >= PT_DIRECTORY_LEVEL) { + level = min(walker.level, mapping_level(vcpu, walker.gfn)); + walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); } + mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); @@ -422,11 +432,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); - shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, - largepage, &write_pt, pfn); - + sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, + level, &write_pt, pfn); pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, - shadow_pte, *shadow_pte, write_pt); + sptep, *sptep, write_pt); if (!write_pt) vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ @@ -459,8 +468,9 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) sptep = iterator.sptep; /* FIXME: properly handle invlpg on large guest pages */ - if (level == PT_PAGE_TABLE_LEVEL || - ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) { + if (level == PT_PAGE_TABLE_LEVEL || + ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || + ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { struct kvm_mmu_page *sp = page_header(__pa(sptep)); pte_gpa = (sp->gfn << PAGE_SHIFT); @@ -472,7 +482,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) --vcpu->kvm->stat.lpages; need_flush = 1; } - set_shadow_pte(sptep, shadow_trap_nonpresent_pte); + __set_spte(sptep, shadow_trap_nonpresent_pte); break; } @@ -489,7 +499,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, sizeof(pt_element_t))) return; - if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) { + if (is_present_gpte(gpte) && (gpte & PT_ACCESSED_MASK)) { if (mmu_topup_memory_caches(vcpu)) return; kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte, @@ -536,7 +546,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt); pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t); for (j = 0; j < ARRAY_SIZE(pt); ++j) - if (r || is_present_pte(pt[j])) + if (r || is_present_gpte(pt[j])) sp->spt[i+j] = shadow_trap_nonpresent_pte; else sp->spt[i+j] = shadow_notrap_nonpresent_pte; @@ -552,6 +562,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { int i, offset, nr_present; + bool reset_host_protection; offset = nr_present = 0; @@ -574,24 +585,31 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) sizeof(pt_element_t))) return -EINVAL; - if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) || + if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) { u64 nonpresent; rmap_remove(vcpu->kvm, &sp->spt[i]); - if (is_present_pte(gpte)) + if (is_present_gpte(gpte)) nonpresent = shadow_trap_nonpresent_pte; else nonpresent = shadow_notrap_nonpresent_pte; - set_shadow_pte(&sp->spt[i], nonpresent); + __set_spte(&sp->spt[i], nonpresent); continue; } nr_present++; pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) { + pte_access &= ~ACC_WRITE_MASK; + reset_host_protection = 0; + } else { + reset_host_protection = 1; + } set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, - is_dirty_pte(gpte), 0, gfn, - spte_to_pfn(sp->spt[i]), true, false); + is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, + spte_to_pfn(sp->spt[i]), true, false, + reset_host_protection); } return !nr_present; @@ -603,9 +621,10 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef PT_BASE_ADDR_MASK #undef PT_INDEX #undef PT_LEVEL_MASK -#undef PT_DIR_BASE_ADDR_MASK +#undef PT_LVL_ADDR_MASK +#undef PT_LVL_OFFSET_MASK #undef PT_LEVEL_BITS #undef PT_MAX_FULL_LEVELS #undef gpte_to_gfn -#undef gpte_to_gfn_pde +#undef gpte_to_gfn_lvl #undef CMPXCHG diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b1f658ad2f0..c17404add91 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -15,7 +15,6 @@ */ #include <linux/kvm_host.h> -#include "kvm_svm.h" #include "irq.h" #include "mmu.h" #include "kvm_cache_regs.h" @@ -26,10 +25,12 @@ #include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/sched.h> +#include <linux/ftrace_event.h> #include <asm/desc.h> #include <asm/virtext.h> +#include "trace.h" #define __ex(x) __kvm_handle_fault_on_reboot(x) @@ -46,6 +47,10 @@ MODULE_LICENSE("GPL"); #define SVM_FEATURE_LBRV (1 << 1) #define SVM_FEATURE_SVML (1 << 2) +#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ +#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ +#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */ + #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) /* Turn on to get debugging output*/ @@ -57,6 +62,58 @@ MODULE_LICENSE("GPL"); #define nsvm_printk(fmt, args...) do {} while(0) #endif +static const u32 host_save_user_msrs[] = { +#ifdef CONFIG_X86_64 + MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, + MSR_FS_BASE, +#endif + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, +}; + +#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) + +struct kvm_vcpu; + +struct nested_state { + struct vmcb *hsave; + u64 hsave_msr; + u64 vmcb; + + /* These are the merged vectors */ + u32 *msrpm; + + /* gpa pointers to the real vectors */ + u64 vmcb_msrpm; + + /* cache for intercepts of the guest */ + u16 intercept_cr_read; + u16 intercept_cr_write; + u16 intercept_dr_read; + u16 intercept_dr_write; + u32 intercept_exceptions; + u64 intercept; + +}; + +struct vcpu_svm { + struct kvm_vcpu vcpu; + struct vmcb *vmcb; + unsigned long vmcb_pa; + struct svm_cpu_data *svm_data; + uint64_t asid_generation; + uint64_t sysenter_esp; + uint64_t sysenter_eip; + + u64 next_rip; + + u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; + u64 host_gs_base; + + u32 *msrpm; + + struct nested_state nested; +}; + /* enable NPT for AMD64 and X86 with PAE */ #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) static bool npt_enabled = true; @@ -67,15 +124,14 @@ static int npt = 1; module_param(npt, int, S_IRUGO); -static int nested = 0; +static int nested = 1; module_param(nested, int, S_IRUGO); static void svm_flush_tlb(struct kvm_vcpu *vcpu); +static void svm_complete_interrupts(struct vcpu_svm *svm); -static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); +static int nested_svm_exit_handled(struct vcpu_svm *svm); static int nested_svm_vmexit(struct vcpu_svm *svm); -static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb, - void *arg2, void *opaque); static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); @@ -86,7 +142,22 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) static inline bool is_nested(struct vcpu_svm *svm) { - return svm->nested_vmcb; + return svm->nested.vmcb; +} + +static inline void enable_gif(struct vcpu_svm *svm) +{ + svm->vcpu.arch.hflags |= HF_GIF_MASK; +} + +static inline void disable_gif(struct vcpu_svm *svm) +{ + svm->vcpu.arch.hflags &= ~HF_GIF_MASK; +} + +static inline bool gif_set(struct vcpu_svm *svm) +{ + return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); } static unsigned long iopm_base; @@ -147,19 +218,6 @@ static inline void invlpga(unsigned long addr, u32 asid) asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid)); } -static inline unsigned long kvm_read_cr2(void) -{ - unsigned long cr2; - - asm volatile ("mov %%cr2, %0" : "=r" (cr2)); - return cr2; -} - -static inline void kvm_write_cr2(unsigned long val) -{ - asm volatile ("mov %0, %%cr2" :: "r" (val)); -} - static inline void force_new_asid(struct kvm_vcpu *vcpu) { to_svm(vcpu)->asid_generation--; @@ -263,7 +321,7 @@ static void svm_hardware_enable(void *garbage) struct svm_cpu_data *svm_data; uint64_t efer; - struct desc_ptr gdt_descr; + struct descriptor_table gdt_descr; struct desc_struct *gdt; int me = raw_smp_processor_id(); @@ -283,8 +341,8 @@ static void svm_hardware_enable(void *garbage) svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; svm_data->next_asid = svm_data->max_asid + 1; - asm volatile ("sgdt %0" : "=m"(gdt_descr)); - gdt = (struct desc_struct *)gdt_descr.address; + kvm_get_gdt(&gdt_descr); + gdt = (struct desc_struct *)gdt_descr.base; svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); rdmsrl(MSR_EFER, efer); @@ -367,8 +425,6 @@ static void svm_vcpu_init_msrpm(u32 *msrpm) #endif set_msr_interception(msrpm, MSR_K6_STAR, 1, 1); set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1); - set_msr_interception(msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); } static void svm_enable_lbrv(struct vcpu_svm *svm) @@ -595,8 +651,10 @@ static void init_vmcb(struct vcpu_svm *svm) } force_new_asid(&svm->vcpu); - svm->nested_vmcb = 0; - svm->vcpu.arch.hflags = HF_GIF_MASK; + svm->nested.vmcb = 0; + svm->vcpu.arch.hflags = 0; + + enable_gif(svm); } static int svm_vcpu_reset(struct kvm_vcpu *vcpu) @@ -605,7 +663,7 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu) init_vmcb(svm); - if (vcpu->vcpu_id != 0) { + if (!kvm_vcpu_is_bsp(vcpu)) { kvm_rip_write(vcpu, 0); svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; @@ -656,9 +714,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) hsave_page = alloc_page(GFP_KERNEL); if (!hsave_page) goto uninit; - svm->hsave = page_address(hsave_page); + svm->nested.hsave = page_address(hsave_page); - svm->nested_msrpm = page_address(nested_msrpm_pages); + svm->nested.msrpm = page_address(nested_msrpm_pages); svm->vmcb = page_address(page); clear_page(svm->vmcb); @@ -669,7 +727,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) fx_init(&svm->vcpu); svm->vcpu.fpu_active = 1; svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; - if (svm->vcpu.vcpu_id == 0) + if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; return &svm->vcpu; @@ -688,8 +746,8 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); - __free_page(virt_to_page(svm->hsave)); - __free_pages(virt_to_page(svm->nested_msrpm), MSRPM_ALLOC_ORDER); + __free_page(virt_to_page(svm->nested.hsave)); + __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, svm); } @@ -709,6 +767,8 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) rdtscll(tsc_this); delta = vcpu->arch.host_tsc - tsc_this; svm->vmcb->control.tsc_offset += delta; + if (is_nested(svm)) + svm->nested.hsave->control.tsc_offset += delta; vcpu->cpu = cpu; kvm_migrate_timers(vcpu); svm->asid_generation = 0; @@ -740,6 +800,18 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_svm(vcpu)->vmcb->save.rflags = rflags; } +static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) +{ + switch (reg) { + case VCPU_EXREG_PDPTR: + BUG_ON(!npt_enabled); + load_pdptrs(vcpu, vcpu->arch.cr3); + break; + default: + BUG(); + } +} + static void svm_set_vintr(struct vcpu_svm *svm) { svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; @@ -1061,7 +1133,6 @@ static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) val = 0; } - KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); return val; } @@ -1070,8 +1141,6 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, { struct vcpu_svm *svm = to_svm(vcpu); - KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)value, handler); - *exception = 0; switch (dr) { @@ -1119,25 +1188,9 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) fault_address = svm->vmcb->control.exit_info_2; error_code = svm->vmcb->control.exit_info_1; - if (!npt_enabled) - KVMTRACE_3D(PAGE_FAULT, &svm->vcpu, error_code, - (u32)fault_address, (u32)(fault_address >> 32), - handler); - else - KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code, - (u32)fault_address, (u32)(fault_address >> 32), - handler); - /* - * FIXME: Tis shouldn't be necessary here, but there is a flush - * missing in the MMU code. Until we find this bug, flush the - * complete TLB here on an NPF - */ - if (npt_enabled) - svm_flush_tlb(&svm->vcpu); - else { - if (kvm_event_needs_reinjection(&svm->vcpu)) - kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); - } + trace_kvm_page_fault(fault_address, error_code); + if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) + kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); } @@ -1253,14 +1306,12 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - KVMTRACE_0D(NMI, &svm->vcpu, handler); return 1; } static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { ++svm->vcpu.stat.irq_exits; - KVMTRACE_0D(INTR, &svm->vcpu, handler); return 1; } @@ -1303,44 +1354,39 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm) static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code) { - if (is_nested(svm)) { - svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = error_code; - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; - if (nested_svm_exit_handled(svm, false)) { - nsvm_printk("VMexit -> EXCP 0x%x\n", nr); - - nested_svm_vmexit(svm); - return 1; - } - } + if (!is_nested(svm)) + return 0; - return 0; + svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = error_code; + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; + + return nested_svm_exit_handled(svm); } static inline int nested_svm_intr(struct vcpu_svm *svm) { - if (is_nested(svm)) { - if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) - return 0; + if (!is_nested(svm)) + return 0; - if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) - return 0; + if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) + return 0; - svm->vmcb->control.exit_code = SVM_EXIT_INTR; + if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) + return 0; - if (nested_svm_exit_handled(svm, false)) { - nsvm_printk("VMexit -> INTR\n"); - nested_svm_vmexit(svm); - return 1; - } + svm->vmcb->control.exit_code = SVM_EXIT_INTR; + + if (nested_svm_exit_handled(svm)) { + nsvm_printk("VMexit -> INTR\n"); + return 1; } return 0; } -static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa) +static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx) { struct page *page; @@ -1348,236 +1394,246 @@ static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa) page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); up_read(¤t->mm->mmap_sem); - if (is_error_page(page)) { - printk(KERN_INFO "%s: could not find page at 0x%llx\n", - __func__, gpa); - kvm_release_page_clean(page); - kvm_inject_gp(&svm->vcpu, 0); - return NULL; - } - return page; + if (is_error_page(page)) + goto error; + + return kmap_atomic(page, idx); + +error: + kvm_release_page_clean(page); + kvm_inject_gp(&svm->vcpu, 0); + + return NULL; } -static int nested_svm_do(struct vcpu_svm *svm, - u64 arg1_gpa, u64 arg2_gpa, void *opaque, - int (*handler)(struct vcpu_svm *svm, - void *arg1, - void *arg2, - void *opaque)) +static void nested_svm_unmap(void *addr, enum km_type idx) { - struct page *arg1_page; - struct page *arg2_page = NULL; - void *arg1; - void *arg2 = NULL; - int retval; + struct page *page; - arg1_page = nested_svm_get_page(svm, arg1_gpa); - if(arg1_page == NULL) - return 1; + if (!addr) + return; - if (arg2_gpa) { - arg2_page = nested_svm_get_page(svm, arg2_gpa); - if(arg2_page == NULL) { - kvm_release_page_clean(arg1_page); - return 1; - } - } + page = kmap_atomic_to_page(addr); - arg1 = kmap_atomic(arg1_page, KM_USER0); - if (arg2_gpa) - arg2 = kmap_atomic(arg2_page, KM_USER1); + kunmap_atomic(addr, idx); + kvm_release_page_dirty(page); +} - retval = handler(svm, arg1, arg2, opaque); +static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm) +{ + u32 param = svm->vmcb->control.exit_info_1 & 1; + u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + bool ret = false; + u32 t0, t1; + u8 *msrpm; + + if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) + return false; - kunmap_atomic(arg1, KM_USER0); - if (arg2_gpa) - kunmap_atomic(arg2, KM_USER1); + msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0); - kvm_release_page_dirty(arg1_page); - if (arg2_gpa) - kvm_release_page_dirty(arg2_page); + if (!msrpm) + goto out; - return retval; + switch (msr) { + case 0 ... 0x1fff: + t0 = (msr * 2) % 8; + t1 = msr / 8; + break; + case 0xc0000000 ... 0xc0001fff: + t0 = (8192 + msr - 0xc0000000) * 2; + t1 = (t0 / 8); + t0 %= 8; + break; + case 0xc0010000 ... 0xc0011fff: + t0 = (16384 + msr - 0xc0010000) * 2; + t1 = (t0 / 8); + t0 %= 8; + break; + default: + ret = true; + goto out; + } + + ret = msrpm[t1] & ((1 << param) << t0); + +out: + nested_svm_unmap(msrpm, KM_USER0); + + return ret; } -static int nested_svm_exit_handled_real(struct vcpu_svm *svm, - void *arg1, - void *arg2, - void *opaque) +static int nested_svm_exit_special(struct vcpu_svm *svm) { - struct vmcb *nested_vmcb = (struct vmcb *)arg1; - bool kvm_overrides = *(bool *)opaque; u32 exit_code = svm->vmcb->control.exit_code; - if (kvm_overrides) { - switch (exit_code) { - case SVM_EXIT_INTR: - case SVM_EXIT_NMI: - return 0; + switch (exit_code) { + case SVM_EXIT_INTR: + case SVM_EXIT_NMI: + return NESTED_EXIT_HOST; /* For now we are always handling NPFs when using them */ - case SVM_EXIT_NPF: - if (npt_enabled) - return 0; - break; - /* When we're shadowing, trap PFs */ - case SVM_EXIT_EXCP_BASE + PF_VECTOR: - if (!npt_enabled) - return 0; - break; - default: - break; - } + case SVM_EXIT_NPF: + if (npt_enabled) + return NESTED_EXIT_HOST; + break; + /* When we're shadowing, trap PFs */ + case SVM_EXIT_EXCP_BASE + PF_VECTOR: + if (!npt_enabled) + return NESTED_EXIT_HOST; + break; + default: + break; } + return NESTED_EXIT_CONTINUE; +} + +/* + * If this function returns true, this #vmexit was already handled + */ +static int nested_svm_exit_handled(struct vcpu_svm *svm) +{ + u32 exit_code = svm->vmcb->control.exit_code; + int vmexit = NESTED_EXIT_HOST; + switch (exit_code) { + case SVM_EXIT_MSR: + vmexit = nested_svm_exit_handled_msr(svm); + break; case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); - if (nested_vmcb->control.intercept_cr_read & cr_bits) - return 1; + if (svm->nested.intercept_cr_read & cr_bits) + vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); - if (nested_vmcb->control.intercept_cr_write & cr_bits) - return 1; + if (svm->nested.intercept_cr_write & cr_bits) + vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: { u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0); - if (nested_vmcb->control.intercept_dr_read & dr_bits) - return 1; + if (svm->nested.intercept_dr_read & dr_bits) + vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: { u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0); - if (nested_vmcb->control.intercept_dr_write & dr_bits) - return 1; + if (svm->nested.intercept_dr_write & dr_bits) + vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); - if (nested_vmcb->control.intercept_exceptions & excp_bits) - return 1; + if (svm->nested.intercept_exceptions & excp_bits) + vmexit = NESTED_EXIT_DONE; break; } default: { u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); nsvm_printk("exit code: 0x%x\n", exit_code); - if (nested_vmcb->control.intercept & exit_bits) - return 1; + if (svm->nested.intercept & exit_bits) + vmexit = NESTED_EXIT_DONE; } } - return 0; -} - -static int nested_svm_exit_handled_msr(struct vcpu_svm *svm, - void *arg1, void *arg2, - void *opaque) -{ - struct vmcb *nested_vmcb = (struct vmcb *)arg1; - u8 *msrpm = (u8 *)arg2; - u32 t0, t1; - u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; - u32 param = svm->vmcb->control.exit_info_1 & 1; - - if (!(nested_vmcb->control.intercept & (1ULL << INTERCEPT_MSR_PROT))) - return 0; - - switch(msr) { - case 0 ... 0x1fff: - t0 = (msr * 2) % 8; - t1 = msr / 8; - break; - case 0xc0000000 ... 0xc0001fff: - t0 = (8192 + msr - 0xc0000000) * 2; - t1 = (t0 / 8); - t0 %= 8; - break; - case 0xc0010000 ... 0xc0011fff: - t0 = (16384 + msr - 0xc0010000) * 2; - t1 = (t0 / 8); - t0 %= 8; - break; - default: - return 1; - break; + if (vmexit == NESTED_EXIT_DONE) { + nsvm_printk("#VMEXIT reason=%04x\n", exit_code); + nested_svm_vmexit(svm); } - if (msrpm[t1] & ((1 << param) << t0)) - return 1; - return 0; + return vmexit; +} + +static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb) +{ + struct vmcb_control_area *dst = &dst_vmcb->control; + struct vmcb_control_area *from = &from_vmcb->control; + + dst->intercept_cr_read = from->intercept_cr_read; + dst->intercept_cr_write = from->intercept_cr_write; + dst->intercept_dr_read = from->intercept_dr_read; + dst->intercept_dr_write = from->intercept_dr_write; + dst->intercept_exceptions = from->intercept_exceptions; + dst->intercept = from->intercept; + dst->iopm_base_pa = from->iopm_base_pa; + dst->msrpm_base_pa = from->msrpm_base_pa; + dst->tsc_offset = from->tsc_offset; + dst->asid = from->asid; + dst->tlb_ctl = from->tlb_ctl; + dst->int_ctl = from->int_ctl; + dst->int_vector = from->int_vector; + dst->int_state = from->int_state; + dst->exit_code = from->exit_code; + dst->exit_code_hi = from->exit_code_hi; + dst->exit_info_1 = from->exit_info_1; + dst->exit_info_2 = from->exit_info_2; + dst->exit_int_info = from->exit_int_info; + dst->exit_int_info_err = from->exit_int_info_err; + dst->nested_ctl = from->nested_ctl; + dst->event_inj = from->event_inj; + dst->event_inj_err = from->event_inj_err; + dst->nested_cr3 = from->nested_cr3; + dst->lbr_ctl = from->lbr_ctl; } -static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) +static int nested_svm_vmexit(struct vcpu_svm *svm) { - bool k = kvm_override; - - switch (svm->vmcb->control.exit_code) { - case SVM_EXIT_MSR: - return nested_svm_do(svm, svm->nested_vmcb, - svm->nested_vmcb_msrpm, NULL, - nested_svm_exit_handled_msr); - default: break; - } + struct vmcb *nested_vmcb; + struct vmcb *hsave = svm->nested.hsave; + struct vmcb *vmcb = svm->vmcb; - return nested_svm_do(svm, svm->nested_vmcb, 0, &k, - nested_svm_exit_handled_real); -} - -static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, - void *arg2, void *opaque) -{ - struct vmcb *nested_vmcb = (struct vmcb *)arg1; - struct vmcb *hsave = svm->hsave; - u64 nested_save[] = { nested_vmcb->save.cr0, - nested_vmcb->save.cr3, - nested_vmcb->save.cr4, - nested_vmcb->save.efer, - nested_vmcb->control.intercept_cr_read, - nested_vmcb->control.intercept_cr_write, - nested_vmcb->control.intercept_dr_read, - nested_vmcb->control.intercept_dr_write, - nested_vmcb->control.intercept_exceptions, - nested_vmcb->control.intercept, - nested_vmcb->control.msrpm_base_pa, - nested_vmcb->control.iopm_base_pa, - nested_vmcb->control.tsc_offset }; + nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0); + if (!nested_vmcb) + return 1; /* Give the current vmcb to the guest */ - memcpy(nested_vmcb, svm->vmcb, sizeof(struct vmcb)); - nested_vmcb->save.cr0 = nested_save[0]; - if (!npt_enabled) - nested_vmcb->save.cr3 = nested_save[1]; - nested_vmcb->save.cr4 = nested_save[2]; - nested_vmcb->save.efer = nested_save[3]; - nested_vmcb->control.intercept_cr_read = nested_save[4]; - nested_vmcb->control.intercept_cr_write = nested_save[5]; - nested_vmcb->control.intercept_dr_read = nested_save[6]; - nested_vmcb->control.intercept_dr_write = nested_save[7]; - nested_vmcb->control.intercept_exceptions = nested_save[8]; - nested_vmcb->control.intercept = nested_save[9]; - nested_vmcb->control.msrpm_base_pa = nested_save[10]; - nested_vmcb->control.iopm_base_pa = nested_save[11]; - nested_vmcb->control.tsc_offset = nested_save[12]; + disable_gif(svm); + + nested_vmcb->save.es = vmcb->save.es; + nested_vmcb->save.cs = vmcb->save.cs; + nested_vmcb->save.ss = vmcb->save.ss; + nested_vmcb->save.ds = vmcb->save.ds; + nested_vmcb->save.gdtr = vmcb->save.gdtr; + nested_vmcb->save.idtr = vmcb->save.idtr; + if (npt_enabled) + nested_vmcb->save.cr3 = vmcb->save.cr3; + nested_vmcb->save.cr2 = vmcb->save.cr2; + nested_vmcb->save.rflags = vmcb->save.rflags; + nested_vmcb->save.rip = vmcb->save.rip; + nested_vmcb->save.rsp = vmcb->save.rsp; + nested_vmcb->save.rax = vmcb->save.rax; + nested_vmcb->save.dr7 = vmcb->save.dr7; + nested_vmcb->save.dr6 = vmcb->save.dr6; + nested_vmcb->save.cpl = vmcb->save.cpl; + + nested_vmcb->control.int_ctl = vmcb->control.int_ctl; + nested_vmcb->control.int_vector = vmcb->control.int_vector; + nested_vmcb->control.int_state = vmcb->control.int_state; + nested_vmcb->control.exit_code = vmcb->control.exit_code; + nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi; + nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1; + nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; + nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; + nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; + nested_vmcb->control.tlb_ctl = 0; + nested_vmcb->control.event_inj = 0; + nested_vmcb->control.event_inj_err = 0; /* We always set V_INTR_MASKING and remember the old value in hflags */ if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; - if ((nested_vmcb->control.int_ctl & V_IRQ_MASK) && - (nested_vmcb->control.int_vector)) { - nsvm_printk("WARNING: IRQ 0x%x still enabled on #VMEXIT\n", - nested_vmcb->control.int_vector); - } - /* Restore the original control entries */ - svm->vmcb->control = hsave->control; + copy_vmcb_control_area(vmcb, hsave); /* Kill any pending exceptions */ if (svm->vcpu.arch.exception.pending == true) nsvm_printk("WARNING: Pending Exception\n"); - svm->vcpu.arch.exception.pending = false; + + kvm_clear_exception_queue(&svm->vcpu); + kvm_clear_interrupt_queue(&svm->vcpu); /* Restore selected save entries */ svm->vmcb->save.es = hsave->save.es; @@ -1603,19 +1659,10 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, svm->vmcb->save.cpl = 0; svm->vmcb->control.exit_int_info = 0; - svm->vcpu.arch.hflags &= ~HF_GIF_MASK; /* Exit nested SVM mode */ - svm->nested_vmcb = 0; - - return 0; -} + svm->nested.vmcb = 0; -static int nested_svm_vmexit(struct vcpu_svm *svm) -{ - nsvm_printk("VMexit\n"); - if (nested_svm_do(svm, svm->nested_vmcb, 0, - NULL, nested_svm_vmexit_real)) - return 1; + nested_svm_unmap(nested_vmcb, KM_USER0); kvm_mmu_reset_context(&svm->vcpu); kvm_mmu_load(&svm->vcpu); @@ -1623,38 +1670,63 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) return 0; } -static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1, - void *arg2, void *opaque) +static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { + u32 *nested_msrpm; int i; - u32 *nested_msrpm = (u32*)arg1; + + nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0); + if (!nested_msrpm) + return false; + for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++) - svm->nested_msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; - svm->vmcb->control.msrpm_base_pa = __pa(svm->nested_msrpm); + svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; - return 0; + svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); + + nested_svm_unmap(nested_msrpm, KM_USER0); + + return true; } -static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, - void *arg2, void *opaque) +static bool nested_svm_vmrun(struct vcpu_svm *svm) { - struct vmcb *nested_vmcb = (struct vmcb *)arg1; - struct vmcb *hsave = svm->hsave; + struct vmcb *nested_vmcb; + struct vmcb *hsave = svm->nested.hsave; + struct vmcb *vmcb = svm->vmcb; + + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); + if (!nested_vmcb) + return false; /* nested_vmcb is our indicator if nested SVM is activated */ - svm->nested_vmcb = svm->vmcb->save.rax; + svm->nested.vmcb = svm->vmcb->save.rax; /* Clear internal status */ - svm->vcpu.arch.exception.pending = false; + kvm_clear_exception_queue(&svm->vcpu); + kvm_clear_interrupt_queue(&svm->vcpu); /* Save the old vmcb, so we don't need to pick what we save, but can restore everything when a VMEXIT occurs */ - memcpy(hsave, svm->vmcb, sizeof(struct vmcb)); - /* We need to remember the original CR3 in the SPT case */ - if (!npt_enabled) - hsave->save.cr3 = svm->vcpu.arch.cr3; - hsave->save.cr4 = svm->vcpu.arch.cr4; - hsave->save.rip = svm->next_rip; + hsave->save.es = vmcb->save.es; + hsave->save.cs = vmcb->save.cs; + hsave->save.ss = vmcb->save.ss; + hsave->save.ds = vmcb->save.ds; + hsave->save.gdtr = vmcb->save.gdtr; + hsave->save.idtr = vmcb->save.idtr; + hsave->save.efer = svm->vcpu.arch.shadow_efer; + hsave->save.cr0 = svm->vcpu.arch.cr0; + hsave->save.cr4 = svm->vcpu.arch.cr4; + hsave->save.rflags = vmcb->save.rflags; + hsave->save.rip = svm->next_rip; + hsave->save.rsp = vmcb->save.rsp; + hsave->save.rax = vmcb->save.rax; + if (npt_enabled) + hsave->save.cr3 = vmcb->save.cr3; + else + hsave->save.cr3 = svm->vcpu.arch.cr3; + + copy_vmcb_control_area(hsave, vmcb); if (svm->vmcb->save.rflags & X86_EFLAGS_IF) svm->vcpu.arch.hflags |= HF_HIF_MASK; @@ -1679,7 +1751,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); kvm_mmu_reset_context(&svm->vcpu); } - svm->vmcb->save.cr2 = nested_vmcb->save.cr2; + svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); @@ -1706,7 +1778,15 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, svm->vmcb->control.intercept |= nested_vmcb->control.intercept; - svm->nested_vmcb_msrpm = nested_vmcb->control.msrpm_base_pa; + svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa; + + /* cache intercepts */ + svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read; + svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write; + svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read; + svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write; + svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; + svm->nested.intercept = nested_vmcb->control.intercept; force_new_asid(&svm->vcpu); svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info; @@ -1734,12 +1814,14 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; - svm->vcpu.arch.hflags |= HF_GIF_MASK; + nested_svm_unmap(nested_vmcb, KM_USER0); - return 0; + enable_gif(svm); + + return true; } -static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) +static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) { to_vmcb->save.fs = from_vmcb->save.fs; to_vmcb->save.gs = from_vmcb->save.gs; @@ -1753,44 +1835,44 @@ static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs; to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp; to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; - - return 1; -} - -static int nested_svm_vmload(struct vcpu_svm *svm, void *nested_vmcb, - void *arg2, void *opaque) -{ - return nested_svm_vmloadsave((struct vmcb *)nested_vmcb, svm->vmcb); -} - -static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb, - void *arg2, void *opaque) -{ - return nested_svm_vmloadsave(svm->vmcb, (struct vmcb *)nested_vmcb); } static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { + struct vmcb *nested_vmcb; + if (nested_svm_check_permissions(svm)) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmload); + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); + if (!nested_vmcb) + return 1; + + nested_svm_vmloadsave(nested_vmcb, svm->vmcb); + nested_svm_unmap(nested_vmcb, KM_USER0); return 1; } static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { + struct vmcb *nested_vmcb; + if (nested_svm_check_permissions(svm)) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmsave); + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); + if (!nested_vmcb) + return 1; + + nested_svm_vmloadsave(svm->vmcb, nested_vmcb); + nested_svm_unmap(nested_vmcb, KM_USER0); return 1; } @@ -1798,19 +1880,29 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { nsvm_printk("VMrun\n"); + if (nested_svm_check_permissions(svm)) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - if (nested_svm_do(svm, svm->vmcb->save.rax, 0, - NULL, nested_svm_vmrun)) + if (!nested_svm_vmrun(svm)) return 1; - if (nested_svm_do(svm, svm->nested_vmcb_msrpm, 0, - NULL, nested_svm_vmrun_msrpm)) - return 1; + if (!nested_svm_vmrun_msrpm(svm)) + goto failed; + + return 1; + +failed: + + svm->vmcb->control.exit_code = SVM_EXIT_ERR; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + + nested_svm_vmexit(svm); return 1; } @@ -1823,7 +1915,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - svm->vcpu.arch.hflags |= HF_GIF_MASK; + enable_gif(svm); return 1; } @@ -1836,7 +1928,7 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - svm->vcpu.arch.hflags &= ~HF_GIF_MASK; + disable_gif(svm); /* After a CLGI no interrupts should come */ svm_clear_vintr(svm); @@ -1845,6 +1937,19 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } +static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + nsvm_printk("INVLPGA\n"); + + /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ + kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]); + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + return 1; +} + static int invalid_op_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { @@ -1953,11 +2058,15 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) struct vcpu_svm *svm = to_svm(vcpu); switch (ecx) { - case MSR_IA32_TIME_STAMP_COUNTER: { - u64 tsc; + case MSR_IA32_TSC: { + u64 tsc_offset; + + if (is_nested(svm)) + tsc_offset = svm->nested.hsave->control.tsc_offset; + else + tsc_offset = svm->vmcb->control.tsc_offset; - rdtscll(tsc); - *data = svm->vmcb->control.tsc_offset + tsc; + *data = tsc_offset + native_read_tsc(); break; } case MSR_K6_STAR: @@ -1981,10 +2090,10 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) *data = svm->vmcb->save.sysenter_cs; break; case MSR_IA32_SYSENTER_EIP: - *data = svm->vmcb->save.sysenter_eip; + *data = svm->sysenter_eip; break; case MSR_IA32_SYSENTER_ESP: - *data = svm->vmcb->save.sysenter_esp; + *data = svm->sysenter_esp; break; /* Nobody will change the following 5 values in the VMCB so we can safely return them on rdmsr. They will always be 0 @@ -2005,7 +2114,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) *data = svm->vmcb->save.last_excp_to; break; case MSR_VM_HSAVE_PA: - *data = svm->hsave_msr; + *data = svm->nested.hsave_msr; break; case MSR_VM_CR: *data = 0; @@ -2027,8 +2136,7 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) if (svm_get_msr(&svm->vcpu, ecx, &data)) kvm_inject_gp(&svm->vcpu, 0); else { - KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, - (u32)(data >> 32), handler); + trace_kvm_msr_read(ecx, data); svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; @@ -2043,11 +2151,18 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) struct vcpu_svm *svm = to_svm(vcpu); switch (ecx) { - case MSR_IA32_TIME_STAMP_COUNTER: { - u64 tsc; + case MSR_IA32_TSC: { + u64 tsc_offset = data - native_read_tsc(); + u64 g_tsc_offset = 0; + + if (is_nested(svm)) { + g_tsc_offset = svm->vmcb->control.tsc_offset - + svm->nested.hsave->control.tsc_offset; + svm->nested.hsave->control.tsc_offset = tsc_offset; + } + + svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset; - rdtscll(tsc); - svm->vmcb->control.tsc_offset = data - tsc; break; } case MSR_K6_STAR: @@ -2071,9 +2186,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) svm->vmcb->save.sysenter_cs = data; break; case MSR_IA32_SYSENTER_EIP: + svm->sysenter_eip = data; svm->vmcb->save.sysenter_eip = data; break; case MSR_IA32_SYSENTER_ESP: + svm->sysenter_esp = data; svm->vmcb->save.sysenter_esp = data; break; case MSR_IA32_DEBUGCTLMSR: @@ -2091,24 +2208,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) else svm_disable_lbrv(svm); break; - case MSR_K7_EVNTSEL0: - case MSR_K7_EVNTSEL1: - case MSR_K7_EVNTSEL2: - case MSR_K7_EVNTSEL3: - case MSR_K7_PERFCTR0: - case MSR_K7_PERFCTR1: - case MSR_K7_PERFCTR2: - case MSR_K7_PERFCTR3: - /* - * Just discard all writes to the performance counters; this - * should keep both older linux and windows 64-bit guests - * happy - */ - pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data); - - break; case MSR_VM_HSAVE_PA: - svm->hsave_msr = data; + svm->nested.hsave_msr = data; + break; + case MSR_VM_CR: + case MSR_VM_IGNNE: + pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; default: return kvm_set_msr_common(vcpu, ecx, data); @@ -2122,8 +2227,7 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); - KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), - handler); + trace_kvm_msr_write(ecx, data); svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; if (svm_set_msr(&svm->vcpu, ecx, data)) @@ -2144,8 +2248,6 @@ static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int interrupt_window_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler); - svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; /* @@ -2201,7 +2303,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_INVD] = emulate_on_interception, [SVM_EXIT_HLT] = halt_interception, [SVM_EXIT_INVLPG] = invlpg_interception, - [SVM_EXIT_INVLPGA] = invalid_op_interception, + [SVM_EXIT_INVLPGA] = invlpga_interception, [SVM_EXIT_IOIO] = io_interception, [SVM_EXIT_MSR] = msr_interception, [SVM_EXIT_TASK_SWITCH] = task_switch_interception, @@ -2224,20 +2326,26 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); u32 exit_code = svm->vmcb->control.exit_code; - KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip, - (u32)((u64)svm->vmcb->save.rip >> 32), entryexit); + trace_kvm_exit(exit_code, svm->vmcb->save.rip); if (is_nested(svm)) { + int vmexit; + nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", exit_code, svm->vmcb->control.exit_info_1, svm->vmcb->control.exit_info_2, svm->vmcb->save.rip); - if (nested_svm_exit_handled(svm, true)) { - nested_svm_vmexit(svm); - nsvm_printk("-> #VMEXIT\n"); + + vmexit = nested_svm_exit_special(svm); + + if (vmexit == NESTED_EXIT_CONTINUE) + vmexit = nested_svm_exit_handled(svm); + + if (vmexit == NESTED_EXIT_DONE) return 1; - } } + svm_complete_interrupts(svm); + if (npt_enabled) { int mmu_reload = 0; if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { @@ -2246,12 +2354,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } vcpu->arch.cr0 = svm->vmcb->save.cr0; vcpu->arch.cr3 = svm->vmcb->save.cr3; - if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - if (!load_pdptrs(vcpu, vcpu->arch.cr3)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - } if (mmu_reload) { kvm_mmu_reset_context(vcpu); kvm_mmu_load(vcpu); @@ -2319,7 +2421,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) { struct vmcb_control_area *control; - KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); + trace_kvm_inj_virq(irq); ++svm->vcpu.stat.irq_injections; control = &svm->vmcb->control; @@ -2329,21 +2431,14 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); } -static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->control.event_inj = nr | - SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; -} - static void svm_set_irq(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - nested_svm_intr(svm); + BUG_ON(!(gif_set(svm))); - svm_queue_irq(vcpu, vcpu->arch.interrupt.nr); + svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | + SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; } static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) @@ -2371,13 +2466,25 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) struct vmcb *vmcb = svm->vmcb; return (vmcb->save.rflags & X86_EFLAGS_IF) && !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && - (svm->vcpu.arch.hflags & HF_GIF_MASK); + gif_set(svm) && + !(is_nested(svm) && (svm->vcpu.arch.hflags & HF_VINTR_MASK)); } static void enable_irq_window(struct kvm_vcpu *vcpu) { - svm_set_vintr(to_svm(vcpu)); - svm_inject_irq(to_svm(vcpu), 0x0); + struct vcpu_svm *svm = to_svm(vcpu); + nsvm_printk("Trying to open IRQ window\n"); + + nested_svm_intr(svm); + + /* In case GIF=0 we can't rely on the CPU to tell us when + * GIF becomes 1, because that's a separate STGI/VMRUN intercept. + * The next time we get that intercept, this function will be + * called again though and we'll get the vintr intercept. */ + if (gif_set(svm)) { + svm_set_vintr(svm); + svm_inject_irq(svm, 0x0); + } } static void enable_nmi_window(struct kvm_vcpu *vcpu) @@ -2456,6 +2563,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) case SVM_EXITINTINFO_TYPE_EXEPT: /* In case of software exception do not reinject an exception vector, but re-execute and instruction instead */ + if (is_nested(svm)) + break; if (kvm_exception_is_soft(vector)) break; if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { @@ -2498,9 +2607,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) fs_selector = kvm_read_fs(); gs_selector = kvm_read_gs(); ldt_selector = kvm_read_ldt(); - svm->host_cr2 = kvm_read_cr2(); - if (!is_nested(svm)) - svm->vmcb->save.cr2 = vcpu->arch.cr2; + svm->vmcb->save.cr2 = vcpu->arch.cr2; /* required for live migration with NPT */ if (npt_enabled) svm->vmcb->save.cr3 = vcpu->arch.cr3; @@ -2585,8 +2692,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; - kvm_write_cr2(svm->host_cr2); - kvm_load_fs(fs_selector); kvm_load_gs(gs_selector); kvm_load_ldt(ldt_selector); @@ -2602,7 +2707,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) svm->next_rip = 0; - svm_complete_interrupts(svm); + if (npt_enabled) { + vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); + vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); + } } #undef R @@ -2673,6 +2781,64 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return 0; } +static const struct trace_print_flags svm_exit_reasons_str[] = { + { SVM_EXIT_READ_CR0, "read_cr0" }, + { SVM_EXIT_READ_CR3, "read_cr3" }, + { SVM_EXIT_READ_CR4, "read_cr4" }, + { SVM_EXIT_READ_CR8, "read_cr8" }, + { SVM_EXIT_WRITE_CR0, "write_cr0" }, + { SVM_EXIT_WRITE_CR3, "write_cr3" }, + { SVM_EXIT_WRITE_CR4, "write_cr4" }, + { SVM_EXIT_WRITE_CR8, "write_cr8" }, + { SVM_EXIT_READ_DR0, "read_dr0" }, + { SVM_EXIT_READ_DR1, "read_dr1" }, + { SVM_EXIT_READ_DR2, "read_dr2" }, + { SVM_EXIT_READ_DR3, "read_dr3" }, + { SVM_EXIT_WRITE_DR0, "write_dr0" }, + { SVM_EXIT_WRITE_DR1, "write_dr1" }, + { SVM_EXIT_WRITE_DR2, "write_dr2" }, + { SVM_EXIT_WRITE_DR3, "write_dr3" }, + { SVM_EXIT_WRITE_DR5, "write_dr5" }, + { SVM_EXIT_WRITE_DR7, "write_dr7" }, + { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, + { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, + { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, + { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, + { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, + { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, + { SVM_EXIT_INTR, "interrupt" }, + { SVM_EXIT_NMI, "nmi" }, + { SVM_EXIT_SMI, "smi" }, + { SVM_EXIT_INIT, "init" }, + { SVM_EXIT_VINTR, "vintr" }, + { SVM_EXIT_CPUID, "cpuid" }, + { SVM_EXIT_INVD, "invd" }, + { SVM_EXIT_HLT, "hlt" }, + { SVM_EXIT_INVLPG, "invlpg" }, + { SVM_EXIT_INVLPGA, "invlpga" }, + { SVM_EXIT_IOIO, "io" }, + { SVM_EXIT_MSR, "msr" }, + { SVM_EXIT_TASK_SWITCH, "task_switch" }, + { SVM_EXIT_SHUTDOWN, "shutdown" }, + { SVM_EXIT_VMRUN, "vmrun" }, + { SVM_EXIT_VMMCALL, "hypercall" }, + { SVM_EXIT_VMLOAD, "vmload" }, + { SVM_EXIT_VMSAVE, "vmsave" }, + { SVM_EXIT_STGI, "stgi" }, + { SVM_EXIT_CLGI, "clgi" }, + { SVM_EXIT_SKINIT, "skinit" }, + { SVM_EXIT_WBINVD, "wbinvd" }, + { SVM_EXIT_MONITOR, "monitor" }, + { SVM_EXIT_MWAIT, "mwait" }, + { SVM_EXIT_NPF, "npf" }, + { -1, NULL } +}; + +static bool svm_gb_page_enable(void) +{ + return true; +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -2710,6 +2876,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_gdt = svm_set_gdt, .get_dr = svm_get_dr, .set_dr = svm_set_dr, + .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, @@ -2733,6 +2900,9 @@ static struct kvm_x86_ops svm_x86_ops = { .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, .get_mt_mask = svm_get_mt_mask, + + .exit_reasons_str = svm_exit_reasons_str, + .gb_page_enable = svm_gb_page_enable, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index 86dbac072d0..eea40439066 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -9,12 +9,16 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) int restart_timer = 0; wait_queue_head_t *q = &vcpu->wq; - /* FIXME: this code should not know anything about vcpus */ - if (!atomic_inc_and_test(&ktimer->pending)) + /* + * There is a race window between reading and incrementing, but we do + * not care about potentially loosing timer events in the !reinject + * case anyway. + */ + if (ktimer->reinject || !atomic_read(&ktimer->pending)) { + atomic_inc(&ktimer->pending); + /* FIXME: this code should not know anything about vcpus */ set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); - - if (!ktimer->reinject) - atomic_set(&ktimer->pending, 1); + } if (waitqueue_active(q)) wake_up_interruptible(q); @@ -33,7 +37,7 @@ enum hrtimer_restart kvm_timer_fn(struct hrtimer *data) struct kvm_vcpu *vcpu; struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); - vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id]; + vcpu = ktimer->vcpu; if (!vcpu) return HRTIMER_NORESTART; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h new file mode 100644 index 00000000000..0d480e77eac --- /dev/null +++ b/arch/x86/kvm/trace.h @@ -0,0 +1,355 @@ +#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_H + +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm +#define TRACE_INCLUDE_PATH arch/x86/kvm +#define TRACE_INCLUDE_FILE trace + +/* + * Tracepoint for guest mode entry. + */ +TRACE_EVENT(kvm_entry, + TP_PROTO(unsigned int vcpu_id), + TP_ARGS(vcpu_id), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + ), + + TP_printk("vcpu %u", __entry->vcpu_id) +); + +/* + * Tracepoint for hypercall. + */ +TRACE_EVENT(kvm_hypercall, + TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1, + unsigned long a2, unsigned long a3), + TP_ARGS(nr, a0, a1, a2, a3), + + TP_STRUCT__entry( + __field( unsigned long, nr ) + __field( unsigned long, a0 ) + __field( unsigned long, a1 ) + __field( unsigned long, a2 ) + __field( unsigned long, a3 ) + ), + + TP_fast_assign( + __entry->nr = nr; + __entry->a0 = a0; + __entry->a1 = a1; + __entry->a2 = a2; + __entry->a3 = a3; + ), + + TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx", + __entry->nr, __entry->a0, __entry->a1, __entry->a2, + __entry->a3) +); + +/* + * Tracepoint for PIO. + */ +TRACE_EVENT(kvm_pio, + TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, + unsigned int count), + TP_ARGS(rw, port, size, count), + + TP_STRUCT__entry( + __field( unsigned int, rw ) + __field( unsigned int, port ) + __field( unsigned int, size ) + __field( unsigned int, count ) + ), + + TP_fast_assign( + __entry->rw = rw; + __entry->port = port; + __entry->size = size; + __entry->count = count; + ), + + TP_printk("pio_%s at 0x%x size %d count %d", + __entry->rw ? "write" : "read", + __entry->port, __entry->size, __entry->count) +); + +/* + * Tracepoint for cpuid. + */ +TRACE_EVENT(kvm_cpuid, + TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx, + unsigned long rcx, unsigned long rdx), + TP_ARGS(function, rax, rbx, rcx, rdx), + + TP_STRUCT__entry( + __field( unsigned int, function ) + __field( unsigned long, rax ) + __field( unsigned long, rbx ) + __field( unsigned long, rcx ) + __field( unsigned long, rdx ) + ), + + TP_fast_assign( + __entry->function = function; + __entry->rax = rax; + __entry->rbx = rbx; + __entry->rcx = rcx; + __entry->rdx = rdx; + ), + + TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx", + __entry->function, __entry->rax, + __entry->rbx, __entry->rcx, __entry->rdx) +); + +#define AREG(x) { APIC_##x, "APIC_" #x } + +#define kvm_trace_symbol_apic \ + AREG(ID), AREG(LVR), AREG(TASKPRI), AREG(ARBPRI), AREG(PROCPRI), \ + AREG(EOI), AREG(RRR), AREG(LDR), AREG(DFR), AREG(SPIV), AREG(ISR), \ + AREG(TMR), AREG(IRR), AREG(ESR), AREG(ICR), AREG(ICR2), AREG(LVTT), \ + AREG(LVTTHMR), AREG(LVTPC), AREG(LVT0), AREG(LVT1), AREG(LVTERR), \ + AREG(TMICT), AREG(TMCCT), AREG(TDCR), AREG(SELF_IPI), AREG(EFEAT), \ + AREG(ECTRL) +/* + * Tracepoint for apic access. + */ +TRACE_EVENT(kvm_apic, + TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val), + TP_ARGS(rw, reg, val), + + TP_STRUCT__entry( + __field( unsigned int, rw ) + __field( unsigned int, reg ) + __field( unsigned int, val ) + ), + + TP_fast_assign( + __entry->rw = rw; + __entry->reg = reg; + __entry->val = val; + ), + + TP_printk("apic_%s %s = 0x%x", + __entry->rw ? "write" : "read", + __print_symbolic(__entry->reg, kvm_trace_symbol_apic), + __entry->val) +); + +#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) +#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) + +/* + * Tracepoint for kvm guest exit: + */ +TRACE_EVENT(kvm_exit, + TP_PROTO(unsigned int exit_reason, unsigned long guest_rip), + TP_ARGS(exit_reason, guest_rip), + + TP_STRUCT__entry( + __field( unsigned int, exit_reason ) + __field( unsigned long, guest_rip ) + ), + + TP_fast_assign( + __entry->exit_reason = exit_reason; + __entry->guest_rip = guest_rip; + ), + + TP_printk("reason %s rip 0x%lx", + ftrace_print_symbols_seq(p, __entry->exit_reason, + kvm_x86_ops->exit_reasons_str), + __entry->guest_rip) +); + +/* + * Tracepoint for kvm interrupt injection: + */ +TRACE_EVENT(kvm_inj_virq, + TP_PROTO(unsigned int irq), + TP_ARGS(irq), + + TP_STRUCT__entry( + __field( unsigned int, irq ) + ), + + TP_fast_assign( + __entry->irq = irq; + ), + + TP_printk("irq %u", __entry->irq) +); + +/* + * Tracepoint for page fault. + */ +TRACE_EVENT(kvm_page_fault, + TP_PROTO(unsigned long fault_address, unsigned int error_code), + TP_ARGS(fault_address, error_code), + + TP_STRUCT__entry( + __field( unsigned long, fault_address ) + __field( unsigned int, error_code ) + ), + + TP_fast_assign( + __entry->fault_address = fault_address; + __entry->error_code = error_code; + ), + + TP_printk("address %lx error_code %x", + __entry->fault_address, __entry->error_code) +); + +/* + * Tracepoint for guest MSR access. + */ +TRACE_EVENT(kvm_msr, + TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data), + TP_ARGS(rw, ecx, data), + + TP_STRUCT__entry( + __field( unsigned int, rw ) + __field( unsigned int, ecx ) + __field( unsigned long, data ) + ), + + TP_fast_assign( + __entry->rw = rw; + __entry->ecx = ecx; + __entry->data = data; + ), + + TP_printk("msr_%s %x = 0x%lx", + __entry->rw ? "write" : "read", + __entry->ecx, __entry->data) +); + +#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data) +#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data) + +/* + * Tracepoint for guest CR access. + */ +TRACE_EVENT(kvm_cr, + TP_PROTO(unsigned int rw, unsigned int cr, unsigned long val), + TP_ARGS(rw, cr, val), + + TP_STRUCT__entry( + __field( unsigned int, rw ) + __field( unsigned int, cr ) + __field( unsigned long, val ) + ), + + TP_fast_assign( + __entry->rw = rw; + __entry->cr = cr; + __entry->val = val; + ), + + TP_printk("cr_%s %x = 0x%lx", + __entry->rw ? "write" : "read", + __entry->cr, __entry->val) +); + +#define trace_kvm_cr_read(cr, val) trace_kvm_cr(0, cr, val) +#define trace_kvm_cr_write(cr, val) trace_kvm_cr(1, cr, val) + +TRACE_EVENT(kvm_pic_set_irq, + TP_PROTO(__u8 chip, __u8 pin, __u8 elcr, __u8 imr, bool coalesced), + TP_ARGS(chip, pin, elcr, imr, coalesced), + + TP_STRUCT__entry( + __field( __u8, chip ) + __field( __u8, pin ) + __field( __u8, elcr ) + __field( __u8, imr ) + __field( bool, coalesced ) + ), + + TP_fast_assign( + __entry->chip = chip; + __entry->pin = pin; + __entry->elcr = elcr; + __entry->imr = imr; + __entry->coalesced = coalesced; + ), + + TP_printk("chip %u pin %u (%s%s)%s", + __entry->chip, __entry->pin, + (__entry->elcr & (1 << __entry->pin)) ? "level":"edge", + (__entry->imr & (1 << __entry->pin)) ? "|masked":"", + __entry->coalesced ? " (coalesced)" : "") +); + +#define kvm_apic_dst_shorthand \ + {0x0, "dst"}, \ + {0x1, "self"}, \ + {0x2, "all"}, \ + {0x3, "all-but-self"} + +TRACE_EVENT(kvm_apic_ipi, + TP_PROTO(__u32 icr_low, __u32 dest_id), + TP_ARGS(icr_low, dest_id), + + TP_STRUCT__entry( + __field( __u32, icr_low ) + __field( __u32, dest_id ) + ), + + TP_fast_assign( + __entry->icr_low = icr_low; + __entry->dest_id = dest_id; + ), + + TP_printk("dst %x vec %u (%s|%s|%s|%s|%s)", + __entry->dest_id, (u8)__entry->icr_low, + __print_symbolic((__entry->icr_low >> 8 & 0x7), + kvm_deliver_mode), + (__entry->icr_low & (1<<11)) ? "logical" : "physical", + (__entry->icr_low & (1<<14)) ? "assert" : "de-assert", + (__entry->icr_low & (1<<15)) ? "level" : "edge", + __print_symbolic((__entry->icr_low >> 18 & 0x3), + kvm_apic_dst_shorthand)) +); + +TRACE_EVENT(kvm_apic_accept_irq, + TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec, bool coalesced), + TP_ARGS(apicid, dm, tm, vec, coalesced), + + TP_STRUCT__entry( + __field( __u32, apicid ) + __field( __u16, dm ) + __field( __u8, tm ) + __field( __u8, vec ) + __field( bool, coalesced ) + ), + + TP_fast_assign( + __entry->apicid = apicid; + __entry->dm = dm; + __entry->tm = tm; + __entry->vec = vec; + __entry->coalesced = coalesced; + ), + + TP_printk("apicid %x vec %u (%s|%s)%s", + __entry->apicid, __entry->vec, + __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode), + __entry->tm ? "level" : "edge", + __entry->coalesced ? " (coalesced)" : "") +); + +#endif /* _TRACE_KVM_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 29f912927a5..ed53b42caba 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -25,6 +25,7 @@ #include <linux/highmem.h> #include <linux/sched.h> #include <linux/moduleparam.h> +#include <linux/ftrace_event.h> #include "kvm_cache_regs.h" #include "x86.h" @@ -34,6 +35,8 @@ #include <asm/virtext.h> #include <asm/mce.h> +#include "trace.h" + #define __ex(x) __kvm_handle_fault_on_reboot(x) MODULE_AUTHOR("Qumranet"); @@ -51,6 +54,10 @@ module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); static int __read_mostly enable_ept = 1; module_param_named(ept, enable_ept, bool, S_IRUGO); +static int __read_mostly enable_unrestricted_guest = 1; +module_param_named(unrestricted_guest, + enable_unrestricted_guest, bool, S_IRUGO); + static int __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); @@ -84,6 +91,14 @@ struct vcpu_vmx { int guest_efer_loaded; } host_state; struct { + int vm86_active; + u8 save_iopl; + struct kvm_save_segment { + u16 selector; + unsigned long base; + u32 limit; + u32 ar; + } tr, es, ds, fs, gs; struct { bool pending; u8 vector; @@ -161,6 +176,8 @@ static struct kvm_vmx_segment_field { VMX_SEGMENT_FIELD(LDTR), }; +static void ept_save_pdptrs(struct kvm_vcpu *vcpu); + /* * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it * away by decrementing the array size. @@ -256,6 +273,26 @@ static inline bool cpu_has_vmx_flexpriority(void) cpu_has_vmx_virtualize_apic_accesses(); } +static inline bool cpu_has_vmx_ept_execute_only(void) +{ + return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT); +} + +static inline bool cpu_has_vmx_eptp_uncacheable(void) +{ + return !!(vmx_capability.ept & VMX_EPTP_UC_BIT); +} + +static inline bool cpu_has_vmx_eptp_writeback(void) +{ + return !!(vmx_capability.ept & VMX_EPTP_WB_BIT); +} + +static inline bool cpu_has_vmx_ept_2m_page(void) +{ + return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); +} + static inline int cpu_has_vmx_invept_individual_addr(void) { return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); @@ -277,6 +314,12 @@ static inline int cpu_has_vmx_ept(void) SECONDARY_EXEC_ENABLE_EPT; } +static inline int cpu_has_vmx_unrestricted_guest(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_UNRESTRICTED_GUEST; +} + static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) { return flexpriority_enabled && @@ -497,14 +540,16 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); if (!vcpu->fpu_active) eb |= 1u << NM_VECTOR; + /* + * Unconditionally intercept #DB so we can maintain dr6 without + * reading it every exit. + */ + eb |= 1u << DB_VECTOR; if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { - if (vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - eb |= 1u << DB_VECTOR; if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) eb |= 1u << BP_VECTOR; } - if (vcpu->arch.rmode.vm86_active) + if (to_vmx(vcpu)->rmode.vm86_active) eb = ~0; if (enable_ept) eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ @@ -528,12 +573,15 @@ static void reload_tss(void) static void load_transition_efer(struct vcpu_vmx *vmx) { int efer_offset = vmx->msr_offset_efer; - u64 host_efer = vmx->host_msrs[efer_offset].data; - u64 guest_efer = vmx->guest_msrs[efer_offset].data; + u64 host_efer; + u64 guest_efer; u64 ignore_bits; if (efer_offset < 0) return; + host_efer = vmx->host_msrs[efer_offset].data; + guest_efer = vmx->guest_msrs[efer_offset].data; + /* * NX is emulated; LMA and LME handled by hardware; SCE meaninless * outside long mode @@ -661,7 +709,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (vcpu->cpu != cpu) { vcpu_clear(vmx); kvm_migrate_timers(vcpu); - vpid_sync_vcpu_all(vmx); + set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); local_irq_disable(); list_add(&vmx->local_vcpus_link, &per_cpu(vcpus_on_cpu, cpu)); @@ -735,12 +783,17 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { - return vmcs_readl(GUEST_RFLAGS); + unsigned long rflags; + + rflags = vmcs_readl(GUEST_RFLAGS); + if (to_vmx(vcpu)->rmode.vm86_active) + rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM); + return rflags; } static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - if (vcpu->arch.rmode.vm86_active) + if (to_vmx(vcpu)->rmode.vm86_active) rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; vmcs_writel(GUEST_RFLAGS, rflags); } @@ -797,12 +850,13 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, intr_info |= INTR_INFO_DELIVER_CODE_MASK; } - if (vcpu->arch.rmode.vm86_active) { + if (vmx->rmode.vm86_active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = nr; vmx->rmode.irq.rip = kvm_rip_read(vcpu); - if (nr == BP_VECTOR || nr == OF_VECTOR) - vmx->rmode.irq.rip++; + if (kvm_exception_is_soft(nr)) + vmx->rmode.irq.rip += + vmx->vcpu.arch.event_exit_inst_len; intr_info |= INTR_TYPE_SOFT_INTR; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); @@ -940,7 +994,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) case MSR_EFER: return kvm_get_msr_common(vcpu, msr_index, pdata); #endif - case MSR_IA32_TIME_STAMP_COUNTER: + case MSR_IA32_TSC: data = guest_read_tsc(); break; case MSR_IA32_SYSENTER_CS: @@ -953,9 +1007,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) data = vmcs_readl(GUEST_SYSENTER_ESP); break; default: - vmx_load_host_state(to_vmx(vcpu)); msr = find_msr_entry(to_vmx(vcpu), msr_index); if (msr) { + vmx_load_host_state(to_vmx(vcpu)); data = msr->data; break; } @@ -1000,22 +1054,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) case MSR_IA32_SYSENTER_ESP: vmcs_writel(GUEST_SYSENTER_ESP, data); break; - case MSR_IA32_TIME_STAMP_COUNTER: + case MSR_IA32_TSC: rdtscll(host_tsc); guest_write_tsc(data, host_tsc); break; - case MSR_P6_PERFCTR0: - case MSR_P6_PERFCTR1: - case MSR_P6_EVNTSEL0: - case MSR_P6_EVNTSEL1: - /* - * Just discard all writes to the performance counters; this - * should keep both older linux and windows 64-bit guests - * happy - */ - pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data); - - break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, data); @@ -1024,9 +1066,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) } /* Otherwise falls through to kvm_set_msr_common */ default: - vmx_load_host_state(vmx); msr = find_msr_entry(vmx, msr_index); if (msr) { + vmx_load_host_state(vmx); msr->data = data; break; } @@ -1046,6 +1088,10 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) case VCPU_REGS_RIP: vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); break; + case VCPU_EXREG_PDPTR: + if (enable_ept) + ept_save_pdptrs(vcpu); + break; default: break; } @@ -1203,7 +1249,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_ENABLE_VPID | - SECONDARY_EXEC_ENABLE_EPT; + SECONDARY_EXEC_ENABLE_EPT | + SECONDARY_EXEC_UNRESTRICTED_GUEST; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -1217,12 +1264,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { /* CR3 accesses and invlpg don't need to cause VM Exits when EPT enabled */ - min &= ~(CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_INVLPG_EXITING); - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, - &_cpu_based_exec_control) < 0) - return -EIO; + _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_INVLPG_EXITING); rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, vmx_capability.ept, vmx_capability.vpid); } @@ -1333,8 +1377,13 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_vpid()) enable_vpid = 0; - if (!cpu_has_vmx_ept()) + if (!cpu_has_vmx_ept()) { enable_ept = 0; + enable_unrestricted_guest = 0; + } + + if (!cpu_has_vmx_unrestricted_guest()) + enable_unrestricted_guest = 0; if (!cpu_has_vmx_flexpriority()) flexpriority_enabled = 0; @@ -1342,6 +1391,9 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_tpr_shadow()) kvm_x86_ops->update_cr8_intercept = NULL; + if (enable_ept && !cpu_has_vmx_ept_2m_page()) + kvm_disable_largepages(); + return alloc_kvm_area(); } @@ -1372,15 +1424,15 @@ static void enter_pmode(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); vmx->emulation_required = 1; - vcpu->arch.rmode.vm86_active = 0; + vmx->rmode.vm86_active = 0; - vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); - vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); - vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar); + vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); + vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); + vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); flags = vmcs_readl(GUEST_RFLAGS); flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); - flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT); + flags |= (vmx->rmode.save_iopl << IOPL_SHIFT); vmcs_writel(GUEST_RFLAGS, flags); vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | @@ -1391,10 +1443,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu) if (emulate_invalid_guest_state) return; - fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); - fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); - fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); - fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); + fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es); + fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds); + fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); + fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); vmcs_write16(GUEST_SS_SELECTOR, 0); vmcs_write32(GUEST_SS_AR_BYTES, 0x93); @@ -1433,20 +1485,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu) unsigned long flags; struct vcpu_vmx *vmx = to_vmx(vcpu); + if (enable_unrestricted_guest) + return; + vmx->emulation_required = 1; - vcpu->arch.rmode.vm86_active = 1; + vmx->rmode.vm86_active = 1; - vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); + vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); - vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); + vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); - vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); + vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); flags = vmcs_readl(GUEST_RFLAGS); - vcpu->arch.rmode.save_iopl + vmx->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; @@ -1468,10 +1523,10 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_CS_BASE, 0xf0000); vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); - fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es); - fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); - fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); - fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); + fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); + fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); + fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); + fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); continue_rmode: kvm_mmu_reset_context(vcpu); @@ -1545,11 +1600,11 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) static void ept_load_pdptrs(struct kvm_vcpu *vcpu) { + if (!test_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_dirty)) + return; + if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - if (!load_pdptrs(vcpu, vcpu->arch.cr3)) { - printk(KERN_ERR "EPT: Fail to load pdptrs!\n"); - return; - } vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); @@ -1557,6 +1612,21 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu) } } +static void ept_save_pdptrs(struct kvm_vcpu *vcpu) +{ + if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { + vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); + vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); + vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); + vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); + } + + __set_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail); + __set_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_dirty); +} + static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, @@ -1571,8 +1641,6 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; vmx_set_cr4(vcpu, vcpu->arch.cr4); - *hw_cr0 |= X86_CR0_PE | X86_CR0_PG; - *hw_cr0 &= ~X86_CR0_WP; } else if (!is_paging(vcpu)) { /* From nonpaging to paging */ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, @@ -1581,9 +1649,10 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; vmx_set_cr4(vcpu, vcpu->arch.cr4); - if (!(vcpu->arch.cr0 & X86_CR0_WP)) - *hw_cr0 &= ~X86_CR0_WP; } + + if (!(cr0 & X86_CR0_WP)) + *hw_cr0 &= ~X86_CR0_WP; } static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, @@ -1598,15 +1667,21 @@ static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { - unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | - KVM_VM_CR0_ALWAYS_ON; + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long hw_cr0; + + if (enable_unrestricted_guest) + hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST) + | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; + else + hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; vmx_fpu_deactivate(vcpu); - if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE)) + if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); - if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE)) + if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) enter_rmode(vcpu); #ifdef CONFIG_X86_64 @@ -1650,10 +1725,8 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (enable_ept) { eptp = construct_eptp(cr3); vmcs_write64(EPT_POINTER, eptp); - ept_sync_context(eptp); - ept_load_pdptrs(vcpu); guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : - VMX_EPT_IDENTITY_PAGETABLE_ADDR; + vcpu->kvm->arch.ept_identity_map_addr; } vmx_flush_tlb(vcpu); @@ -1664,7 +1737,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ? + unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); vcpu->arch.cr4 = cr4; @@ -1707,16 +1780,13 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, static int vmx_get_cpl(struct kvm_vcpu *vcpu) { - struct kvm_segment kvm_seg; - if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ return 0; if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ return 3; - vmx_get_segment(vcpu, &kvm_seg, VCPU_SREG_CS); - return kvm_seg.selector & 3; + return vmcs_read16(GUEST_CS_SELECTOR) & 3; } static u32 vmx_segment_access_rights(struct kvm_segment *var) @@ -1744,20 +1814,21 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { + struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; u32 ar; - if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) { - vcpu->arch.rmode.tr.selector = var->selector; - vcpu->arch.rmode.tr.base = var->base; - vcpu->arch.rmode.tr.limit = var->limit; - vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var); + if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { + vmx->rmode.tr.selector = var->selector; + vmx->rmode.tr.base = var->base; + vmx->rmode.tr.limit = var->limit; + vmx->rmode.tr.ar = vmx_segment_access_rights(var); return; } vmcs_writel(sf->base, var->base); vmcs_write32(sf->limit, var->limit); vmcs_write16(sf->selector, var->selector); - if (vcpu->arch.rmode.vm86_active && var->s) { + if (vmx->rmode.vm86_active && var->s) { /* * Hack real-mode segments into vm86 compatibility. */ @@ -1766,6 +1837,21 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, ar = 0xf3; } else ar = vmx_segment_access_rights(var); + + /* + * Fix the "Accessed" bit in AR field of segment registers for older + * qemu binaries. + * IA32 arch specifies that at the time of processor reset the + * "Accessed" bit in the AR field of segment registers is 1. And qemu + * is setting it to 0 in the usedland code. This causes invalid guest + * state vmexit when "unrestricted guest" mode is turned on. + * Fix for this setup issue in cpu_reset is being pushed in the qemu + * tree. Newer qemu binaries with that qemu fix would not need this + * kvm hack. + */ + if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) + ar |= 0x1; /* Accessed */ + vmcs_write32(sf->ar_bytes, ar); } @@ -2040,7 +2126,7 @@ static int init_rmode_identity_map(struct kvm *kvm) if (likely(kvm->arch.ept_identity_pagetable_done)) return 1; ret = 0; - identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT; + identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); if (r < 0) goto out; @@ -2062,11 +2148,19 @@ out: static void seg_setup(int seg) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + unsigned int ar; vmcs_write16(sf->selector, 0); vmcs_writel(sf->base, 0); vmcs_write32(sf->limit, 0xffff); - vmcs_write32(sf->ar_bytes, 0xf3); + if (enable_unrestricted_guest) { + ar = 0x93; + if (seg == VCPU_SREG_CS) + ar |= 0x08; /* code segment */ + } else + ar = 0xf3; + + vmcs_write32(sf->ar_bytes, ar); } static int alloc_apic_access_page(struct kvm *kvm) @@ -2101,14 +2195,15 @@ static int alloc_identity_pagetable(struct kvm *kvm) goto out; kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; + kvm_userspace_mem.guest_phys_addr = + kvm->arch.ept_identity_map_addr; kvm_userspace_mem.memory_size = PAGE_SIZE; r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); if (r) goto out; kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, - VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); + kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); out: up_write(&kvm->slots_lock); return r; @@ -2209,6 +2304,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; if (!enable_ept) exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; + if (!enable_unrestricted_guest) + exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -2326,14 +2423,14 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) goto out; } - vmx->vcpu.arch.rmode.vm86_active = 0; + vmx->rmode.vm86_active = 0; vmx->soft_vnmi_blocked = 0; vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(&vmx->vcpu, 0); msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; - if (vmx->vcpu.vcpu_id == 0) + if (kvm_vcpu_is_bsp(&vmx->vcpu)) msr |= MSR_IA32_APICBASE_BSP; kvm_set_apic_base(&vmx->vcpu, msr); @@ -2344,7 +2441,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. */ - if (vmx->vcpu.vcpu_id == 0) { + if (kvm_vcpu_is_bsp(&vmx->vcpu)) { vmcs_write16(GUEST_CS_SELECTOR, 0xf000); vmcs_writel(GUEST_CS_BASE, 0x000f0000); } else { @@ -2373,7 +2470,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_SYSENTER_EIP, 0); vmcs_writel(GUEST_RFLAGS, 0x02); - if (vmx->vcpu.vcpu_id == 0) + if (kvm_vcpu_is_bsp(&vmx->vcpu)) kvm_rip_write(vcpu, 0xfff0); else kvm_rip_write(vcpu, 0); @@ -2461,13 +2558,16 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) uint32_t intr; int irq = vcpu->arch.interrupt.nr; - KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); + trace_kvm_inj_virq(irq); ++vcpu->stat.irq_injections; - if (vcpu->arch.rmode.vm86_active) { + if (vmx->rmode.vm86_active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = irq; vmx->rmode.irq.rip = kvm_rip_read(vcpu); + if (vcpu->arch.interrupt.soft) + vmx->rmode.irq.rip += + vmx->vcpu.arch.event_exit_inst_len; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); @@ -2502,7 +2602,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) } ++vcpu->stat.nmi_injections; - if (vcpu->arch.rmode.vm86_active) { + if (vmx->rmode.vm86_active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = NMI_VECTOR; vmx->rmode.irq.rip = kvm_rip_read(vcpu); @@ -2659,14 +2759,14 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (enable_ept) BUG(); cr2 = vmcs_readl(EXIT_QUALIFICATION); - KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, - (u32)((u64)cr2 >> 32), handler); + trace_kvm_page_fault(cr2, error_code); + if (kvm_event_needs_reinjection(vcpu)) kvm_mmu_unprotect_page_virt(vcpu, cr2); return kvm_mmu_page_fault(vcpu, cr2, error_code); } - if (vcpu->arch.rmode.vm86_active && + if (vmx->rmode.vm86_active && handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, error_code)) { if (vcpu->arch.halt_request) { @@ -2707,7 +2807,6 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { ++vcpu->stat.irq_exits; - KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler); return 1; } @@ -2755,7 +2854,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - unsigned long exit_qualification; + unsigned long exit_qualification, val; int cr; int reg; @@ -2764,21 +2863,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ - KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, - (u32)kvm_register_read(vcpu, reg), - (u32)((u64)kvm_register_read(vcpu, reg) >> 32), - handler); + val = kvm_register_read(vcpu, reg); + trace_kvm_cr_write(cr, val); switch (cr) { case 0: - kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg)); + kvm_set_cr0(vcpu, val); skip_emulated_instruction(vcpu); return 1; case 3: - kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg)); + kvm_set_cr3(vcpu, val); skip_emulated_instruction(vcpu); return 1; case 4: - kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); + kvm_set_cr4(vcpu, val); skip_emulated_instruction(vcpu); return 1; case 8: { @@ -2800,23 +2897,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->arch.cr0 &= ~X86_CR0_TS; vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); vmx_fpu_activate(vcpu); - KVMTRACE_0D(CLTS, vcpu, handler); skip_emulated_instruction(vcpu); return 1; case 1: /*mov from cr*/ switch (cr) { case 3: kvm_register_write(vcpu, reg, vcpu->arch.cr3); - KVMTRACE_3D(CR_READ, vcpu, (u32)cr, - (u32)kvm_register_read(vcpu, reg), - (u32)((u64)kvm_register_read(vcpu, reg) >> 32), - handler); + trace_kvm_cr_read(cr, vcpu->arch.cr3); skip_emulated_instruction(vcpu); return 1; case 8: - kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu)); - KVMTRACE_2D(CR_READ, vcpu, (u32)cr, - (u32)kvm_register_read(vcpu, reg), handler); + val = kvm_get_cr8(vcpu); + kvm_register_write(vcpu, reg, val); + trace_kvm_cr_read(cr, val); skip_emulated_instruction(vcpu); return 1; } @@ -2841,6 +2934,8 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) unsigned long val; int dr, reg; + if (!kvm_require_cpl(vcpu, 0)) + return 1; dr = vmcs_readl(GUEST_DR7); if (dr & DR7_GD) { /* @@ -2884,7 +2979,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) val = 0; } kvm_register_write(vcpu, reg, val); - KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); } else { val = vcpu->arch.regs[reg]; switch (dr) { @@ -2917,7 +3011,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } break; } - KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)val, handler); } skip_emulated_instruction(vcpu); return 1; @@ -2939,8 +3032,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } - KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32), - handler); + trace_kvm_msr_read(ecx, data); /* FIXME: handling of bits 32:63 of rax, rdx */ vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; @@ -2955,8 +3047,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); - KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32), - handler); + trace_kvm_msr_write(ecx, data); if (vmx_set_msr(vcpu, ecx, data) != 0) { kvm_inject_gp(vcpu, 0); @@ -2983,7 +3074,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - KVMTRACE_0D(PEND_INTR, vcpu, handler); ++vcpu->stat.irq_window_exits; /* @@ -3049,7 +3139,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) printk(KERN_ERR "Fail to handle apic access vmexit! Offset is 0x%lx\n", offset); - return -ENOTSUPP; + return -ENOEXEC; } return 1; } @@ -3118,7 +3208,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (exit_qualification & (1 << 6)) { printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); - return -ENOTSUPP; + return -EINVAL; } gla_validity = (exit_qualification >> 7) & 0x3; @@ -3130,14 +3220,98 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", (long unsigned int)exit_qualification); kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - kvm_run->hw.hardware_exit_reason = 0; - return -ENOTSUPP; + kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; + return 0; } gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + trace_kvm_page_fault(gpa, exit_qualification); return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); } +static u64 ept_rsvd_mask(u64 spte, int level) +{ + int i; + u64 mask = 0; + + for (i = 51; i > boot_cpu_data.x86_phys_bits; i--) + mask |= (1ULL << i); + + if (level > 2) + /* bits 7:3 reserved */ + mask |= 0xf8; + else if (level == 2) { + if (spte & (1ULL << 7)) + /* 2MB ref, bits 20:12 reserved */ + mask |= 0x1ff000; + else + /* bits 6:3 reserved */ + mask |= 0x78; + } + + return mask; +} + +static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, + int level) +{ + printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level); + + /* 010b (write-only) */ + WARN_ON((spte & 0x7) == 0x2); + + /* 110b (write/execute) */ + WARN_ON((spte & 0x7) == 0x6); + + /* 100b (execute-only) and value not supported by logical processor */ + if (!cpu_has_vmx_ept_execute_only()) + WARN_ON((spte & 0x7) == 0x4); + + /* not 000b */ + if ((spte & 0x7)) { + u64 rsvd_bits = spte & ept_rsvd_mask(spte, level); + + if (rsvd_bits != 0) { + printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n", + __func__, rsvd_bits); + WARN_ON(1); + } + + if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) { + u64 ept_mem_type = (spte & 0x38) >> 3; + + if (ept_mem_type == 2 || ept_mem_type == 3 || + ept_mem_type == 7) { + printk(KERN_ERR "%s: ept_mem_type=0x%llx\n", + __func__, ept_mem_type); + WARN_ON(1); + } + } + } +} + +static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u64 sptes[4]; + int nr_sptes, i; + gpa_t gpa; + + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + + printk(KERN_ERR "EPT: Misconfiguration.\n"); + printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); + + nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes); + + for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) + ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); + + kvm_run->exit_reason = KVM_EXIT_UNKNOWN; + kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; + + return 0; +} + static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u32 cpu_based_vm_exec_control; @@ -3217,8 +3391,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_WBINVD] = handle_wbinvd, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, - [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, + [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, + [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, }; static const int kvm_vmx_max_exit_handlers = @@ -3234,8 +3409,7 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; - KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), - (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit); + trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); /* If we need to emulate an MMIO from handle_invalid_guest_state * we just return 0 */ @@ -3247,10 +3421,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) /* Access CR3 don't cause VMExit in paging mode, so we need * to sync with guest real CR3. */ - if (enable_ept && is_paging(vcpu)) { + if (enable_ept && is_paging(vcpu)) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - ept_load_pdptrs(vcpu); - } if (unlikely(vmx->fail)) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; @@ -3326,10 +3498,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) /* We need to handle NMIs before interrupts are enabled */ if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && - (exit_intr_info & INTR_INFO_VALID_MASK)) { - KVMTRACE_0D(NMI, &vmx->vcpu, handler); + (exit_intr_info & INTR_INFO_VALID_MASK)) asm("int $2"); - } idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; @@ -3434,6 +3604,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (enable_ept && is_paging(vcpu)) { + vmcs_writel(GUEST_CR3, vcpu->arch.cr3); + ept_load_pdptrs(vcpu); + } /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) vmx->entry_time = ktime_get(); @@ -3449,12 +3623,21 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + /* When single-stepping over STI and MOV SS, we must clear the + * corresponding interruptibility bits in the guest state. Otherwise + * vmentry fails as it then expects bit 14 (BS) in pending debug + * exceptions being set, but that's not correct for the guest debugging + * case. */ + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + vmx_set_interrupt_shadow(vcpu, 0); + /* * Loading guest fpu may have cleared host cr0.ts */ vmcs_writel(HOST_CR0, read_cr0()); - set_debugreg(vcpu->arch.dr6, 6); + if (vcpu->arch.switch_db_regs) + set_debugreg(vcpu->arch.dr6, 6); asm( /* Store host registers */ @@ -3465,11 +3648,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %%"R"sp, %c[host_rsp](%0) \n\t" __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" "1: \n\t" + /* Reload cr2 if changed */ + "mov %c[cr2](%0), %%"R"ax \n\t" + "mov %%cr2, %%"R"dx \n\t" + "cmp %%"R"ax, %%"R"dx \n\t" + "je 2f \n\t" + "mov %%"R"ax, %%cr2 \n\t" + "2: \n\t" /* Check if vmlaunch of vmresume is needed */ "cmpl $0, %c[launched](%0) \n\t" /* Load guest registers. Don't clobber flags. */ - "mov %c[cr2](%0), %%"R"ax \n\t" - "mov %%"R"ax, %%cr2 \n\t" "mov %c[rax](%0), %%"R"ax \n\t" "mov %c[rbx](%0), %%"R"bx \n\t" "mov %c[rdx](%0), %%"R"dx \n\t" @@ -3547,10 +3735,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) #endif ); - vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) + | (1 << VCPU_EXREG_PDPTR)); vcpu->arch.regs_dirty = 0; - get_debugreg(vcpu->arch.dr6, 6); + if (vcpu->arch.switch_db_regs) + get_debugreg(vcpu->arch.dr6, 6); vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); if (vmx->rmode.irq.pending) @@ -3633,9 +3823,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (alloc_apic_access_page(kvm) != 0) goto free_vmcs; - if (enable_ept) + if (enable_ept) { + if (!kvm->arch.ept_identity_map_addr) + kvm->arch.ept_identity_map_addr = + VMX_EPT_IDENTITY_PAGETABLE_ADDR; if (alloc_identity_pagetable(kvm) != 0) goto free_vmcs; + } return &vmx->vcpu; @@ -3699,6 +3893,34 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return ret; } +static const struct trace_print_flags vmx_exit_reasons_str[] = { + { EXIT_REASON_EXCEPTION_NMI, "exception" }, + { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" }, + { EXIT_REASON_TRIPLE_FAULT, "triple_fault" }, + { EXIT_REASON_NMI_WINDOW, "nmi_window" }, + { EXIT_REASON_IO_INSTRUCTION, "io_instruction" }, + { EXIT_REASON_CR_ACCESS, "cr_access" }, + { EXIT_REASON_DR_ACCESS, "dr_access" }, + { EXIT_REASON_CPUID, "cpuid" }, + { EXIT_REASON_MSR_READ, "rdmsr" }, + { EXIT_REASON_MSR_WRITE, "wrmsr" }, + { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" }, + { EXIT_REASON_HLT, "halt" }, + { EXIT_REASON_INVLPG, "invlpg" }, + { EXIT_REASON_VMCALL, "hypercall" }, + { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" }, + { EXIT_REASON_APIC_ACCESS, "apic_access" }, + { EXIT_REASON_WBINVD, "wbinvd" }, + { EXIT_REASON_TASK_SWITCH, "task_switch" }, + { EXIT_REASON_EPT_VIOLATION, "ept_violation" }, + { -1, NULL } +}; + +static bool vmx_gb_page_enable(void) +{ + return false; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -3758,6 +3980,9 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, .get_mt_mask = vmx_get_mt_mask, + + .exit_reasons_str = vmx_exit_reasons_str, + .gb_page_enable = vmx_gb_page_enable, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 633ccc7400a..4fc80174191 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -37,11 +37,17 @@ #include <linux/iommu.h> #include <linux/intel-iommu.h> #include <linux/cpufreq.h> +#include <trace/events/kvm.h> +#undef TRACE_INCLUDE_FILE +#define CREATE_TRACE_POINTS +#include "trace.h" +#include <asm/debugreg.h> #include <asm/uaccess.h> #include <asm/msr.h> #include <asm/desc.h> #include <asm/mtrr.h> +#include <asm/mce.h> #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -55,6 +61,10 @@ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) + +#define KVM_MAX_MCE_BANKS 32 +#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P + /* EFER defaults: * - enable syscall per default because its emulated by KVM * - enable LME and LMA per default on 64 bit KVM @@ -68,14 +78,16 @@ static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +static void update_cr8_intercept(struct kvm_vcpu *vcpu); static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index); struct kvm_x86_ops *kvm_x86_ops; EXPORT_SYMBOL_GPL(kvm_x86_ops); +int ignore_msrs = 0; +module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); + struct kvm_stats_debugfs_item debugfs_entries[] = { { "pf_fixed", VCPU_STAT(pf_fixed) }, { "pf_guest", VCPU_STAT(pf_guest) }, @@ -122,18 +134,16 @@ unsigned long segment_base(u16 selector) if (selector == 0) return 0; - asm("sgdt %0" : "=m"(gdt)); + kvm_get_gdt(&gdt); table_base = gdt.base; if (selector & 4) { /* from ldt */ - u16 ldt_selector; + u16 ldt_selector = kvm_read_ldt(); - asm("sldt %0" : "=g"(ldt_selector)); table_base = segment_base(ldt_selector); } d = (struct desc_struct *)(table_base + (selector & ~7)); - v = d->base0 | ((unsigned long)d->base1 << 16) | - ((unsigned long)d->base2 << 24); + v = get_desc_base(d); #ifdef CONFIG_X86_64 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; @@ -176,16 +186,22 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, ++vcpu->stat.pf_guest; if (vcpu->arch.exception.pending) { - if (vcpu->arch.exception.nr == PF_VECTOR) { - printk(KERN_DEBUG "kvm: inject_page_fault:" - " double fault 0x%lx\n", addr); - vcpu->arch.exception.nr = DF_VECTOR; - vcpu->arch.exception.error_code = 0; - } else if (vcpu->arch.exception.nr == DF_VECTOR) { + switch(vcpu->arch.exception.nr) { + case DF_VECTOR: /* triple fault -> shutdown */ set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + return; + case PF_VECTOR: + vcpu->arch.exception.nr = DF_VECTOR; + vcpu->arch.exception.error_code = 0; + return; + default: + /* replace previous exception with a new one in a hope + that instruction re-execution will regenerate lost + exception */ + vcpu->arch.exception.pending = false; + break; } - return; } vcpu->arch.cr2 = addr; kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); @@ -207,12 +223,18 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) } EXPORT_SYMBOL_GPL(kvm_queue_exception_e); -static void __queue_exception(struct kvm_vcpu *vcpu) +/* + * Checks if cpl <= required_cpl; if true, return true. Otherwise queue + * a #GP and return false. + */ +bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) { - kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code); + if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl) + return true; + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return false; } +EXPORT_SYMBOL_GPL(kvm_require_cpl); /* * Load the pae pdptrs. Return true is they are all valid. @@ -232,7 +254,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) goto out; } for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { - if (is_present_pte(pdpte[i]) && + if (is_present_gpte(pdpte[i]) && (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { ret = 0; goto out; @@ -241,6 +263,10 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) ret = 1; memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); + __set_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail); + __set_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_dirty); out: return ret; @@ -256,6 +282,10 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) if (is_long_mode(vcpu) || !is_pae(vcpu)) return false; + if (!test_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail)) + return true; + r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); if (r < 0) goto out; @@ -328,9 +358,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); - KVMTRACE_1D(LMSW, vcpu, - (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)), - handler); } EXPORT_SYMBOL_GPL(kvm_lmsw); @@ -466,7 +493,7 @@ static u32 msrs_to_save[] = { #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA }; @@ -644,8 +671,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, - &vcpu->hv_clock.tsc_timestamp); + kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); ktime_get_ts(&ts); local_irq_restore(flags); @@ -778,23 +804,60 @@ static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) return 0; } +static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + u64 mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + + switch (msr) { + case MSR_IA32_MCG_STATUS: + vcpu->arch.mcg_status = data; + break; + case MSR_IA32_MCG_CTL: + if (!(mcg_cap & MCG_CTL_P)) + return 1; + if (data != 0 && data != ~(u64)0) + return -1; + vcpu->arch.mcg_ctl = data; + break; + default: + if (msr >= MSR_IA32_MC0_CTL && + msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + u32 offset = msr - MSR_IA32_MC0_CTL; + /* only 0 or all 1s can be written to IA32_MCi_CTL */ + if ((offset & 0x3) == 0 && + data != 0 && data != ~(u64)0) + return -1; + vcpu->arch.mce_banks[offset] = data; + break; + } + return 1; + } + return 0; +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { case MSR_EFER: set_efer(vcpu, data); break; - case MSR_IA32_MC0_STATUS: - pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", - __func__, data); + case MSR_K7_HWCR: + data &= ~(u64)0x40; /* ignore flush filter disable */ + if (data != 0) { + pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", + data); + return 1; + } break; - case MSR_IA32_MCG_STATUS: - pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", - __func__, data); + case MSR_FAM10H_MMIO_CONF_BASE: + if (data != 0) { + pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " + "0x%llx\n", data); + return 1; + } break; - case MSR_IA32_MCG_CTL: - pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", - __func__, data); + case MSR_AMD64_NB_CFG: break; case MSR_IA32_DEBUGCTLMSR: if (!data) { @@ -811,12 +874,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_WRITE: case MSR_VM_HSAVE_PA: + case MSR_AMD64_PATCH_LOADER: break; case 0x200 ... 0x2ff: return set_msr_mtrr(vcpu, msr, data); case MSR_IA32_APICBASE: kvm_set_apic_base(vcpu, data); break; + case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + return kvm_x2apic_msr_write(vcpu, msr, data); case MSR_IA32_MISC_ENABLE: vcpu->arch.ia32_misc_enable_msr = data; break; @@ -850,9 +916,50 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) kvm_request_guest_time_update(vcpu); break; } + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + return set_msr_mce(vcpu, msr, data); + + /* Performance counters are not protected by a CPUID bit, + * so we should check all of them in the generic path for the sake of + * cross vendor migration. + * Writing a zero into the event select MSRs disables them, + * which we perfectly emulate ;-). Any other value should be at least + * reported, some guests depend on them. + */ + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + case MSR_K7_EVNTSEL0: + case MSR_K7_EVNTSEL1: + case MSR_K7_EVNTSEL2: + case MSR_K7_EVNTSEL3: + if (data != 0) + pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); + break; + /* at least RHEL 4 unconditionally writes to the perfctr registers, + * so we ignore writes to make it happy. + */ + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: + pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); + break; default: - pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); - return 1; + if (!ignore_msrs) { + pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", + msr, data); + return 1; + } else { + pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", + msr, data); + break; + } } return 0; } @@ -905,26 +1012,47 @@ static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } -int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { u64 data; + u64 mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; switch (msr) { - case 0xc0010010: /* SYSCFG */ - case 0xc0010015: /* HWCR */ - case MSR_IA32_PLATFORM_ID: case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: - case MSR_IA32_MC0_CTL: - case MSR_IA32_MCG_STATUS: + data = 0; + break; case MSR_IA32_MCG_CAP: + data = vcpu->arch.mcg_cap; + break; case MSR_IA32_MCG_CTL: - case MSR_IA32_MC0_MISC: - case MSR_IA32_MC0_MISC+4: - case MSR_IA32_MC0_MISC+8: - case MSR_IA32_MC0_MISC+12: - case MSR_IA32_MC0_MISC+16: - case MSR_IA32_MC0_MISC+20: + if (!(mcg_cap & MCG_CTL_P)) + return 1; + data = vcpu->arch.mcg_ctl; + break; + case MSR_IA32_MCG_STATUS: + data = vcpu->arch.mcg_status; + break; + default: + if (msr >= MSR_IA32_MC0_CTL && + msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + u32 offset = msr - MSR_IA32_MC0_CTL; + data = vcpu->arch.mce_banks[offset]; + break; + } + return 1; + } + *pdata = data; + return 0; +} + +int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data; + + switch (msr) { + case MSR_IA32_PLATFORM_ID: case MSR_IA32_UCODE_REV: case MSR_IA32_EBL_CR_POWERON: case MSR_IA32_DEBUGCTLMSR: @@ -932,10 +1060,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_LASTBRANCHTOIP: case MSR_IA32_LASTINTFROMIP: case MSR_IA32_LASTINTTOIP: + case MSR_K8_SYSCFG: + case MSR_K7_HWCR: case MSR_VM_HSAVE_PA: + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: case MSR_P6_EVNTSEL0: case MSR_P6_EVNTSEL1: case MSR_K7_EVNTSEL0: + case MSR_K7_PERFCTR0: + case MSR_K8_INT_PENDING_MSG: + case MSR_AMD64_NB_CFG: + case MSR_FAM10H_MMIO_CONF_BASE: data = 0; break; case MSR_MTRRcap: @@ -949,6 +1085,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_APICBASE: data = kvm_get_apic_base(vcpu); break; + case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + return kvm_x2apic_msr_read(vcpu, msr, pdata); + break; case MSR_IA32_MISC_ENABLE: data = vcpu->arch.ia32_misc_enable_msr; break; @@ -967,9 +1106,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_KVM_SYSTEM_TIME: data = vcpu->arch.time; break; + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + return get_msr_mce(vcpu, msr, pdata); default: - pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); - return 1; + if (!ignore_msrs) { + pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); + return 1; + } else { + pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); + data = 0; + } + break; } *pdata = data; return 0; @@ -1068,6 +1220,11 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_REINJECT_CONTROL: case KVM_CAP_IRQ_INJECT_STATUS: case KVM_CAP_ASSIGN_DEV_IRQ: + case KVM_CAP_IRQFD: + case KVM_CAP_IOEVENTFD: + case KVM_CAP_PIT2: + case KVM_CAP_PIT_STATE2: + case KVM_CAP_SET_IDENTITY_MAP_ADDR: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1088,6 +1245,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_IOMMU: r = iommu_found(); break; + case KVM_CAP_MCE: + r = KVM_MAX_MCE_BANKS; + break; default: r = 0; break; @@ -1147,6 +1307,16 @@ long kvm_arch_dev_ioctl(struct file *filp, r = 0; break; } + case KVM_X86_GET_MCE_CAP_SUPPORTED: { + u64 mce_cap; + + mce_cap = KVM_MCE_CAP_SUPPORTED; + r = -EFAULT; + if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) + goto out; + r = 0; + break; + } default: r = -EINVAL; } @@ -1227,6 +1397,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, vcpu->arch.cpuid_nent = cpuid->nent; cpuid_fix_nx_cap(vcpu); r = 0; + kvm_apic_set_version(vcpu); out_free: vfree(cpuid_entries); @@ -1248,6 +1419,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, cpuid->nent * sizeof(struct kvm_cpuid_entry2))) goto out; vcpu->arch.cpuid_nent = cpuid->nent; + kvm_apic_set_version(vcpu); return 0; out: @@ -1290,6 +1462,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, u32 index, int *nent, int maxnent) { unsigned f_nx = is_efer_nx() ? F(NX) : 0; + unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0; #ifdef CONFIG_X86_64 unsigned f_lm = F(LM); #else @@ -1314,7 +1487,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | F(PAT) | F(PSE36) | 0 /* Reserved */ | f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | - F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ | + F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ const u32 kvm_supported_word4_x86_features = @@ -1323,7 +1496,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 0 /* Reserved, DCA */ | F(XMM4_1) | - F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) | + F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 0 /* Reserved, XSAVE, OSXSAVE */; /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = @@ -1344,6 +1517,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, case 1: entry->edx &= kvm_supported_word0_x86_features; entry->ecx &= kvm_supported_word4_x86_features; + /* we support x2apic emulation even if host does not support + * it since we emulate x2apic in software */ + entry->ecx |= F(X2APIC); break; /* function 2 entries are STATEFUL. That is, repeated cpuid commands * may return different values. This forces us to get_cpu() before @@ -1416,6 +1592,8 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, if (cpuid->nent < 1) goto out; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + cpuid->nent = KVM_MAX_CPUID_ENTRIES; r = -ENOMEM; cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); if (!cpuid_entries) @@ -1435,6 +1613,10 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) do_cpuid_ent(&cpuid_entries[nent], func, 0, &nent, cpuid->nent); + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + r = -EFAULT; if (copy_to_user(entries, cpuid_entries, nent * sizeof(struct kvm_cpuid_entry2))) @@ -1464,6 +1646,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, vcpu_load(vcpu); memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); kvm_apic_post_state_restore(vcpu); + update_cr8_intercept(vcpu); vcpu_put(vcpu); return 0; @@ -1503,6 +1686,80 @@ static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, return 0; } +static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, + u64 mcg_cap) +{ + int r; + unsigned bank_num = mcg_cap & 0xff, bank; + + r = -EINVAL; + if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) + goto out; + if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) + goto out; + r = 0; + vcpu->arch.mcg_cap = mcg_cap; + /* Init IA32_MCG_CTL to all 1s */ + if (mcg_cap & MCG_CTL_P) + vcpu->arch.mcg_ctl = ~(u64)0; + /* Init IA32_MCi_CTL to all 1s */ + for (bank = 0; bank < bank_num; bank++) + vcpu->arch.mce_banks[bank*4] = ~(u64)0; +out: + return r; +} + +static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, + struct kvm_x86_mce *mce) +{ + u64 mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + u64 *banks = vcpu->arch.mce_banks; + + if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) + return -EINVAL; + /* + * if IA32_MCG_CTL is not all 1s, the uncorrected error + * reporting is disabled + */ + if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && + vcpu->arch.mcg_ctl != ~(u64)0) + return 0; + banks += 4 * mce->bank; + /* + * if IA32_MCi_CTL is not all 1s, the uncorrected error + * reporting is disabled for the bank + */ + if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0) + return 0; + if (mce->status & MCI_STATUS_UC) { + if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || + !(vcpu->arch.cr4 & X86_CR4_MCE)) { + printk(KERN_DEBUG "kvm: set_mce: " + "injects mce exception while " + "previous one is in progress!\n"); + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + return 0; + } + if (banks[1] & MCI_STATUS_VAL) + mce->status |= MCI_STATUS_OVER; + banks[2] = mce->addr; + banks[3] = mce->misc; + vcpu->arch.mcg_status = mce->mcg_status; + banks[1] = mce->status; + kvm_queue_exception(vcpu, MC_VECTOR); + } else if (!(banks[1] & MCI_STATUS_VAL) + || !(banks[1] & MCI_STATUS_UC)) { + if (banks[1] & MCI_STATUS_VAL) + mce->status |= MCI_STATUS_OVER; + banks[2] = mce->addr; + banks[3] = mce->misc; + banks[1] = mce->status; + } else + banks[1] |= MCI_STATUS_OVER; + return 0; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -1636,6 +1893,24 @@ long kvm_arch_vcpu_ioctl(struct file *filp, kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); break; } + case KVM_X86_SETUP_MCE: { + u64 mcg_cap; + + r = -EFAULT; + if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap)) + goto out; + r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); + break; + } + case KVM_X86_SET_MCE: { + struct kvm_x86_mce mce; + + r = -EFAULT; + if (copy_from_user(&mce, argp, sizeof mce)) + goto out; + r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); + break; + } default: r = -EINVAL; } @@ -1654,6 +1929,13 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) return ret; } +static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, + u64 ident_addr) +{ + kvm->arch.ept_identity_map_addr = ident_addr; + return 0; +} + static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, u32 kvm_nr_mmu_pages) { @@ -1775,19 +2057,25 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: + spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); + spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_PIC_SLAVE: + spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); + spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_IOAPIC: + mutex_lock(&kvm->irq_lock); memcpy(ioapic_irqchip(kvm), &chip->chip.ioapic, sizeof(struct kvm_ioapic_state)); + mutex_unlock(&kvm->irq_lock); break; default: r = -EINVAL; @@ -1801,7 +2089,9 @@ static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) { int r = 0; + mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); + mutex_unlock(&kvm->arch.vpit->pit_state.lock); return r; } @@ -1809,8 +2099,39 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) { int r = 0; + mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); - kvm_pit_load_count(kvm, 0, ps->channels[0].count); + kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); + mutex_unlock(&kvm->arch.vpit->pit_state.lock); + return r; +} + +static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) +{ + int r = 0; + + mutex_lock(&kvm->arch.vpit->pit_state.lock); + memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, + sizeof(ps->channels)); + ps->flags = kvm->arch.vpit->pit_state.flags; + mutex_unlock(&kvm->arch.vpit->pit_state.lock); + return r; +} + +static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) +{ + int r = 0, start = 0; + u32 prev_legacy, cur_legacy; + mutex_lock(&kvm->arch.vpit->pit_state.lock); + prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; + cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; + if (!prev_legacy && cur_legacy) + start = 1; + memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, + sizeof(kvm->arch.vpit->pit_state.channels)); + kvm->arch.vpit->pit_state.flags = ps->flags; + kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); + mutex_unlock(&kvm->arch.vpit->pit_state.lock); return r; } @@ -1819,7 +2140,9 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, { if (!kvm->arch.vpit) return -ENXIO; + mutex_lock(&kvm->arch.vpit->pit_state.lock); kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; + mutex_unlock(&kvm->arch.vpit->pit_state.lock); return 0; } @@ -1845,7 +2168,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, spin_lock(&kvm->mmu_lock); kvm_mmu_slot_remove_write_access(kvm, log->slot); spin_unlock(&kvm->mmu_lock); - kvm_flush_remote_tlbs(kvm); memslot = &kvm->memslots[log->slot]; n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; memset(memslot->dirty_bitmap, 0, n); @@ -1869,7 +2191,9 @@ long kvm_arch_vm_ioctl(struct file *filp, */ union { struct kvm_pit_state ps; + struct kvm_pit_state2 ps2; struct kvm_memory_alias alias; + struct kvm_pit_config pit_config; } u; switch (ioctl) { @@ -1878,6 +2202,17 @@ long kvm_arch_vm_ioctl(struct file *filp, if (r < 0) goto out; break; + case KVM_SET_IDENTITY_MAP_ADDR: { + u64 ident_addr; + + r = -EFAULT; + if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) + goto out; + r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); + if (r < 0) + goto out; + break; + } case KVM_SET_MEMORY_REGION: { struct kvm_memory_region kvm_mem; struct kvm_userspace_memory_region kvm_userspace_mem; @@ -1930,16 +2265,24 @@ long kvm_arch_vm_ioctl(struct file *filp, } break; case KVM_CREATE_PIT: - mutex_lock(&kvm->lock); + u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; + goto create_pit; + case KVM_CREATE_PIT2: + r = -EFAULT; + if (copy_from_user(&u.pit_config, argp, + sizeof(struct kvm_pit_config))) + goto out; + create_pit: + down_write(&kvm->slots_lock); r = -EEXIST; if (kvm->arch.vpit) goto create_pit_unlock; r = -ENOMEM; - kvm->arch.vpit = kvm_create_pit(kvm); + kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags); if (kvm->arch.vpit) r = 0; create_pit_unlock: - mutex_unlock(&kvm->lock); + up_write(&kvm->slots_lock); break; case KVM_IRQ_LINE_STATUS: case KVM_IRQ_LINE: { @@ -1950,10 +2293,10 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; if (irqchip_in_kernel(kvm)) { __s32 status; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->irq_lock); status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irq_event.irq, irq_event.level); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->irq_lock); if (ioctl == KVM_IRQ_LINE_STATUS) { irq_event.status = status; if (copy_to_user(argp, &irq_event, @@ -2042,6 +2385,32 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } + case KVM_GET_PIT2: { + r = -ENXIO; + if (!kvm->arch.vpit) + goto out; + r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &u.ps2, sizeof(u.ps2))) + goto out; + r = 0; + break; + } + case KVM_SET_PIT2: { + r = -EFAULT; + if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) + goto out; + r = -ENXIO; + if (!kvm->arch.vpit) + goto out; + r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); + if (r) + goto out; + r = 0; + break; + } case KVM_REINJECT_CONTROL: { struct kvm_reinject_control control; r = -EFAULT; @@ -2075,35 +2444,23 @@ static void kvm_init_msr_list(void) num_msrs_to_save = j; } -/* - * Only apic need an MMIO device hook, so shortcut now.. - */ -static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, - gpa_t addr, int len, - int is_write) +static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, + const void *v) { - struct kvm_io_device *dev; + if (vcpu->arch.apic && + !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) + return 0; - if (vcpu->arch.apic) { - dev = &vcpu->arch.apic->dev; - if (dev->in_range(dev, addr, len, is_write)) - return dev; - } - return NULL; + return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); } - -static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, - gpa_t addr, int len, - int is_write) +static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) { - struct kvm_io_device *dev; + if (vcpu->arch.apic && + !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) + return 0; - dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write); - if (dev == NULL) - dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len, - is_write); - return dev; + return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); } static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, @@ -2172,11 +2529,12 @@ static int emulator_read_emulated(unsigned long addr, unsigned int bytes, struct kvm_vcpu *vcpu) { - struct kvm_io_device *mmio_dev; gpa_t gpa; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); + trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, + vcpu->mmio_phys_addr, *(u64 *)val); vcpu->mmio_read_completed = 0; return X86EMUL_CONTINUE; } @@ -2197,14 +2555,12 @@ mmio: /* * Is this MMIO handled locally? */ - mutex_lock(&vcpu->kvm->lock); - mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0); - if (mmio_dev) { - kvm_iodevice_read(mmio_dev, gpa, bytes, val); - mutex_unlock(&vcpu->kvm->lock); + if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { + trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val); return X86EMUL_CONTINUE; } - mutex_unlock(&vcpu->kvm->lock); + + trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); vcpu->mmio_needed = 1; vcpu->mmio_phys_addr = gpa; @@ -2231,7 +2587,6 @@ static int emulator_write_emulated_onepage(unsigned long addr, unsigned int bytes, struct kvm_vcpu *vcpu) { - struct kvm_io_device *mmio_dev; gpa_t gpa; gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); @@ -2249,17 +2604,12 @@ static int emulator_write_emulated_onepage(unsigned long addr, return X86EMUL_CONTINUE; mmio: + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); /* * Is this MMIO handled locally? */ - mutex_lock(&vcpu->kvm->lock); - mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1); - if (mmio_dev) { - kvm_iodevice_write(mmio_dev, gpa, bytes, val); - mutex_unlock(&vcpu->kvm->lock); + if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) return X86EMUL_CONTINUE; - } - mutex_unlock(&vcpu->kvm->lock); vcpu->mmio_needed = 1; vcpu->mmio_phys_addr = gpa; @@ -2343,7 +2693,6 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) int emulate_clts(struct kvm_vcpu *vcpu) { - KVMTRACE_0D(CLTS, vcpu, handler); kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); return X86EMUL_CONTINUE; } @@ -2420,7 +2769,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, kvm_clear_exception_queue(vcpu); vcpu->arch.mmio_fault_cr2 = cr2; /* - * TODO: fix x86_emulate.c to use guest_read/write_register + * TODO: fix emulate.c to use guest_read/write_register * instead of direct ->regs accesses, can save hundred cycles * on Intel for instructions that don't read/change RSP, for * for example. @@ -2444,14 +2793,33 @@ int emulate_instruction(struct kvm_vcpu *vcpu, r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); - /* Reject the instructions other than VMCALL/VMMCALL when - * try to emulate invalid opcode */ + /* Only allow emulation of specific instructions on #UD + * (namely VMMCALL, sysenter, sysexit, syscall)*/ c = &vcpu->arch.emulate_ctxt.decode; - if ((emulation_type & EMULTYPE_TRAP_UD) && - (!(c->twobyte && c->b == 0x01 && - (c->modrm_reg == 0 || c->modrm_reg == 3) && - c->modrm_mod == 3 && c->modrm_rm == 1))) - return EMULATE_FAIL; + if (emulation_type & EMULTYPE_TRAP_UD) { + if (!c->twobyte) + return EMULATE_FAIL; + switch (c->b) { + case 0x01: /* VMMCALL */ + if (c->modrm_mod != 3 || c->modrm_rm != 1) + return EMULATE_FAIL; + break; + case 0x34: /* sysenter */ + case 0x35: /* sysexit */ + if (c->modrm_mod != 0 || c->modrm_rm != 0) + return EMULATE_FAIL; + break; + case 0x05: /* syscall */ + if (c->modrm_mod != 0 || c->modrm_rm != 0) + return EMULATE_FAIL; + break; + default: + return EMULATE_FAIL; + } + + if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) + return EMULATE_FAIL; + } ++vcpu->stat.insn_emulation; if (r) { @@ -2571,52 +2939,40 @@ int complete_pio(struct kvm_vcpu *vcpu) return 0; } -static void kernel_pio(struct kvm_io_device *pio_dev, - struct kvm_vcpu *vcpu, - void *pd) +static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) { /* TODO: String I/O for in kernel device */ + int r; - mutex_lock(&vcpu->kvm->lock); if (vcpu->arch.pio.in) - kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, - vcpu->arch.pio.size, - pd); + r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, + vcpu->arch.pio.size, pd); else - kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, - vcpu->arch.pio.size, - pd); - mutex_unlock(&vcpu->kvm->lock); + r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, + vcpu->arch.pio.size, pd); + return r; } -static void pio_string_write(struct kvm_io_device *pio_dev, - struct kvm_vcpu *vcpu) +static int pio_string_write(struct kvm_vcpu *vcpu) { struct kvm_pio_request *io = &vcpu->arch.pio; void *pd = vcpu->arch.pio_data; - int i; + int i, r = 0; - mutex_lock(&vcpu->kvm->lock); for (i = 0; i < io->cur_count; i++) { - kvm_iodevice_write(pio_dev, io->port, - io->size, - pd); + if (kvm_io_bus_write(&vcpu->kvm->pio_bus, + io->port, io->size, pd)) { + r = -EOPNOTSUPP; + break; + } pd += io->size; } - mutex_unlock(&vcpu->kvm->lock); -} - -static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, - gpa_t addr, int len, - int is_write) -{ - return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write); + return r; } int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, int size, unsigned port) { - struct kvm_io_device *pio_dev; unsigned long val; vcpu->run->exit_reason = KVM_EXIT_IO; @@ -2630,19 +2986,13 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.down = 0; vcpu->arch.pio.rep = 0; - if (vcpu->run->io.direction == KVM_EXIT_IO_IN) - KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, - handler); - else - KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, - handler); + trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, + size, 1); val = kvm_register_read(vcpu, VCPU_REGS_RAX); memcpy(vcpu->arch.pio_data, &val, 4); - pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); - if (pio_dev) { - kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); + if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { complete_pio(vcpu); return 1; } @@ -2656,7 +3006,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, { unsigned now, in_page; int ret = 0; - struct kvm_io_device *pio_dev; vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; @@ -2669,12 +3018,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.down = down; vcpu->arch.pio.rep = rep; - if (vcpu->run->io.direction == KVM_EXIT_IO_IN) - KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, - handler); - else - KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, - handler); + trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, + size, count); if (!count) { kvm_x86_ops->skip_emulated_instruction(vcpu); @@ -2704,9 +3049,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.guest_gva = address; - pio_dev = vcpu_find_pio_dev(vcpu, port, - vcpu->arch.pio.cur_count, - !vcpu->arch.pio.in); if (!vcpu->arch.pio.in) { /* string PIO write */ ret = pio_copy_data(vcpu); @@ -2714,16 +3056,13 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, kvm_inject_gp(vcpu, 0); return 1; } - if (ret == 0 && pio_dev) { - pio_string_write(pio_dev, vcpu); + if (ret == 0 && !pio_string_write(vcpu)) { complete_pio(vcpu); if (vcpu->arch.pio.count == 0) ret = 1; } - } else if (pio_dev) - pr_unimpl(vcpu, "no string pio read support yet, " - "port %x size %d count %ld\n", - port, size, count); + } + /* no string PIO read support yet */ return ret; } @@ -2756,10 +3095,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = kvm->vcpus[i]; - if (!vcpu) - continue; + kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != freq->cpu) continue; if (!kvm_request_guest_time_update(vcpu)) @@ -2852,7 +3188,6 @@ void kvm_arch_exit(void) int kvm_emulate_halt(struct kvm_vcpu *vcpu) { ++vcpu->stat.halt_exits; - KVMTRACE_0D(HLT, vcpu, handler); if (irqchip_in_kernel(vcpu->kvm)) { vcpu->arch.mp_state = KVM_MP_STATE_HALTED; return 1; @@ -2883,7 +3218,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); - KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); + trace_kvm_hypercall(nr, a0, a1, a2, a3); if (!is_long_mode(vcpu)) { nr &= 0xFFFFFFFF; @@ -2893,6 +3228,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) a3 &= 0xFFFFFFFF; } + if (kvm_x86_ops->get_cpl(vcpu) != 0) { + ret = -KVM_EPERM; + goto out; + } + switch (nr) { case KVM_HC_VAPIC_POLL_IRQ: ret = 0; @@ -2904,6 +3244,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) ret = -KVM_ENOSYS; break; } +out: kvm_register_write(vcpu, VCPU_REGS_RAX, ret); ++vcpu->stat.hypercalls; return r; @@ -2983,8 +3324,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); return 0; } - KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value, - (u32)((u64)value >> 32), handler); return value; } @@ -2992,9 +3331,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, unsigned long *rflags) { - KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val, - (u32)((u64)val >> 32), handler); - switch (cr) { case 0: kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); @@ -3104,11 +3440,11 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); } kvm_x86_ops->skip_emulated_instruction(vcpu); - KVMTRACE_5D(CPUID, vcpu, function, - (u32)kvm_register_read(vcpu, VCPU_REGS_RAX), - (u32)kvm_register_read(vcpu, VCPU_REGS_RBX), - (u32)kvm_register_read(vcpu, VCPU_REGS_RCX), - (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler); + trace_kvm_cpuid(function, + kvm_register_read(vcpu, VCPU_REGS_RAX), + kvm_register_read(vcpu, VCPU_REGS_RBX), + kvm_register_read(vcpu, VCPU_REGS_RCX), + kvm_register_read(vcpu, VCPU_REGS_RDX)); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); @@ -3174,6 +3510,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) if (!kvm_x86_ops->update_cr8_intercept) return; + if (!vcpu->arch.apic) + return; + if (!vcpu->arch.apic->vapic_addr) max_irr = kvm_lapic_find_highest_irr(vcpu); else @@ -3187,12 +3526,16 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); } -static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - kvm_x86_ops->set_interrupt_shadow(vcpu, 0); - /* try to reinject previous events if any */ + if (vcpu->arch.exception.pending) { + kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, + vcpu->arch.exception.has_error_code, + vcpu->arch.exception.error_code); + return; + } + if (vcpu->arch.nmi_injected) { kvm_x86_ops->set_nmi(vcpu); return; @@ -3266,16 +3609,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) smp_mb__after_clear_bit(); if (vcpu->requests || need_resched() || signal_pending(current)) { + set_bit(KVM_REQ_KICK, &vcpu->requests); local_irq_enable(); preempt_enable(); r = 1; goto out; } - if (vcpu->arch.exception.pending) - __queue_exception(vcpu); - else - inject_pending_irq(vcpu, kvm_run); + inject_pending_event(vcpu, kvm_run); /* enable NMI/IRQ window open exits if needed */ if (vcpu->arch.nmi_pending) @@ -3292,14 +3633,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_guest_enter(); - get_debugreg(vcpu->arch.host_dr6, 6); - get_debugreg(vcpu->arch.host_dr7, 7); if (unlikely(vcpu->arch.switch_db_regs)) { - get_debugreg(vcpu->arch.host_db[0], 0); - get_debugreg(vcpu->arch.host_db[1], 1); - get_debugreg(vcpu->arch.host_db[2], 2); - get_debugreg(vcpu->arch.host_db[3], 3); - set_debugreg(0, 7); set_debugreg(vcpu->arch.eff_db[0], 0); set_debugreg(vcpu->arch.eff_db[1], 1); @@ -3307,18 +3641,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) set_debugreg(vcpu->arch.eff_db[3], 3); } - KVMTRACE_0D(VMENTRY, vcpu, entryexit); + trace_kvm_entry(vcpu->vcpu_id); kvm_x86_ops->run(vcpu, kvm_run); - if (unlikely(vcpu->arch.switch_db_regs)) { - set_debugreg(0, 7); - set_debugreg(vcpu->arch.host_db[0], 0); - set_debugreg(vcpu->arch.host_db[1], 1); - set_debugreg(vcpu->arch.host_db[2], 2); - set_debugreg(vcpu->arch.host_db[3], 3); - } - set_debugreg(vcpu->arch.host_dr6, 6); - set_debugreg(vcpu->arch.host_dr7, 7); + /* + * If the guest has used debug registers, at least dr7 + * will be disabled while returning to the host. + * If we don't have active breakpoints in the host, we don't + * care about the messed up debug address registers. But if + * we have some of them active, restore the old state. + */ + if (hw_breakpoint_active()) + hw_breakpoint_restore(); set_bit(KVM_REQ_KICK, &vcpu->requests); local_irq_enable(); @@ -3648,11 +3982,8 @@ static void kvm_set_segment(struct kvm_vcpu *vcpu, static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, struct kvm_segment *kvm_desct) { - kvm_desct->base = seg_desc->base0; - kvm_desct->base |= seg_desc->base1 << 16; - kvm_desct->base |= seg_desc->base2 << 24; - kvm_desct->limit = seg_desc->limit0; - kvm_desct->limit |= seg_desc->limit << 16; + kvm_desct->base = get_desc_base(seg_desc); + kvm_desct->limit = get_desc_limit(seg_desc); if (seg_desc->g) { kvm_desct->limit <<= 12; kvm_desct->limit |= 0xfff; @@ -3696,7 +4027,6 @@ static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, struct desc_struct *seg_desc) { - gpa_t gpa; struct descriptor_table dtable; u16 index = selector >> 3; @@ -3706,16 +4036,13 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); return 1; } - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); - gpa += index * 8; - return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8); + return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); } /* allowed just for 8 bytes segments */ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, struct desc_struct *seg_desc) { - gpa_t gpa; struct descriptor_table dtable; u16 index = selector >> 3; @@ -3723,19 +4050,13 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, if (dtable.limit < index * 8 + 7) return 1; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); - gpa += index * 8; - return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8); + return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); } -static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, +static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu, struct desc_struct *seg_desc) { - u32 base_addr; - - base_addr = seg_desc->base0; - base_addr |= (seg_desc->base1 << 16); - base_addr |= (seg_desc->base2 << 24); + u32 base_addr = get_desc_base(seg_desc); return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); } @@ -3780,12 +4101,19 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se return 0; } +static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) +{ + return (seg != VCPU_SREG_LDTR) && + (seg != VCPU_SREG_TR) && + (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM); +} + int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int type_bits, int seg) { struct kvm_segment kvm_seg; - if (!(vcpu->arch.cr0 & X86_CR0_PE)) + if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) return kvm_load_realmode_segment(vcpu, selector, seg); if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) return 1; @@ -4024,7 +4352,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) } } - if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) { + if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) { kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); return 1; } @@ -4094,13 +4422,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, vcpu->arch.cr2 = sregs->cr2; mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; - - down_read(&vcpu->kvm->slots_lock); - if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT)) - vcpu->arch.cr3 = sregs->cr3; - else - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); - up_read(&vcpu->kvm->slots_lock); + vcpu->arch.cr3 = sregs->cr3; kvm_set_cr8(vcpu, sregs->cr8); @@ -4142,8 +4464,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + update_cr8_intercept(vcpu); + /* Older userspace won't unhalt the vcpu on reset. */ - if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 && + if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && !(vcpu->arch.cr0 & X86_CR0_PE)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -4414,7 +4738,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm = vcpu->kvm; vcpu->arch.mmu.root_hpa = INVALID_PAGE; - if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) + if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; @@ -4436,6 +4760,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_mmu_destroy; } + vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, + GFP_KERNEL); + if (!vcpu->arch.mce_banks) { + r = -ENOMEM; + goto fail_mmu_destroy; + } + vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; + return 0; fail_mmu_destroy: @@ -4483,20 +4815,22 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) static void kvm_free_vcpus(struct kvm *kvm) { unsigned int i; + struct kvm_vcpu *vcpu; /* * Unpin any mmu pages first. */ - for (i = 0; i < KVM_MAX_VCPUS; ++i) - if (kvm->vcpus[i]) - kvm_unload_vcpu_mmu(kvm->vcpus[i]); - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - if (kvm->vcpus[i]) { - kvm_arch_vcpu_free(kvm->vcpus[i]); - kvm->vcpus[i] = NULL; - } - } + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_unload_vcpu_mmu(vcpu); + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_arch_vcpu_free(vcpu); + + mutex_lock(&kvm->lock); + for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) + kvm->vcpus[i] = NULL; + atomic_set(&kvm->online_vcpus, 0); + mutex_unlock(&kvm->lock); } void kvm_arch_sync_events(struct kvm *kvm) @@ -4573,7 +4907,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm, kvm_mmu_slot_remove_write_access(kvm, mem->slot); spin_unlock(&kvm->mmu_lock); - kvm_flush_remote_tlbs(kvm); return 0; } @@ -4587,8 +4920,10 @@ void kvm_arch_flush_shadow(struct kvm *kvm) int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE - || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED - || vcpu->arch.nmi_pending; + || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED + || vcpu->arch.nmi_pending || + (kvm_arch_interrupt_allowed(vcpu) && + kvm_cpu_has_interrupt(vcpu)); } void kvm_vcpu_kick(struct kvm_vcpu *vcpu) @@ -4612,3 +4947,9 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) { return kvm_x86_ops->interrupt_allowed(vcpu); } + +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 4c8e10af78e..5eadea585d2 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -31,4 +31,8 @@ static inline bool kvm_exception_is_soft(unsigned int nr) { return (nr == BP_VECTOR) || (nr == OF_VECTOR); } + +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function, u32 index); + #endif diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index d677fa9ca65..7e59dc1d3fc 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1135,11 +1135,6 @@ static struct notifier_block paniced = { /* Setting up memory is fairly easy. */ static __init char *lguest_memory_setup(void) { - /* We do this here and not earlier because lockcheck used to barf if we - * did it before start_kernel(). I think we fixed that, so it'd be - * nice to move it back to lguest_init. Patch welcome... */ - atomic_notifier_chain_register(&panic_notifier_list, &paniced); - /* *The Linux bootloader header contains an "e820" memory map: the * Launcher populated the first entry with our memory limit. @@ -1262,7 +1257,6 @@ __init void lguest_init(void) */ /* Interrupt-related operations */ - pv_irq_ops.init_IRQ = lguest_init_IRQ; pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); @@ -1270,7 +1264,6 @@ __init void lguest_init(void) pv_irq_ops.safe_halt = lguest_safe_halt; /* Setup operations */ - pv_init_ops.memory_setup = lguest_memory_setup; pv_init_ops.patch = lguest_patch; /* Intercepts of various CPU instructions */ @@ -1320,10 +1313,11 @@ __init void lguest_init(void) set_lguest_basic_apic_ops(); #endif - /* Time operations */ - pv_time_ops.get_wallclock = lguest_get_wallclock; - pv_time_ops.time_init = lguest_time_init; - pv_time_ops.get_tsc_khz = lguest_tsc_khz; + x86_init.resources.memory_setup = lguest_memory_setup; + x86_init.irqs.intr_init = lguest_init_IRQ; + x86_init.timers.timer_init = lguest_time_init; + x86_platform.calibrate_tsc = lguest_tsc_khz; + x86_platform.get_wallclock = lguest_get_wallclock; /* * Now is a good time to look at the implementations of these functions @@ -1365,10 +1359,13 @@ __init void lguest_init(void) /* * If we don't initialize the lock dependency checker now, it crashes - * paravirt_disable_iospace. + * atomic_notifier_chain_register, then paravirt_disable_iospace. */ lockdep_init(); + /* Hook in our special panic hypercall code. */ + atomic_notifier_chain_register(&panic_notifier_list, &paniced); + /* * The IDE code spends about 3 seconds probing for disks: if we reserve * all the I/O ports up front it can't get them and so doesn't probe. diff --git a/arch/x86/lib/.gitignore b/arch/x86/lib/.gitignore new file mode 100644 index 00000000000..8df89f0a3fe --- /dev/null +++ b/arch/x86/lib/.gitignore @@ -0,0 +1 @@ +inat-tables.c diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 9e609206fac..a2d6472895f 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -2,12 +2,25 @@ # Makefile for x86 specific library files. # +inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk +inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt +quiet_cmd_inat_tables = GEN $@ + cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ + +$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps) + $(call cmd,inat_tables) + +$(obj)/inat.o: $(obj)/inat-tables.c + +clean-files := inat-tables.c + obj-$(CONFIG_SMP) := msr.o lib-y := delay.o lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o getuser.o putuser.o lib-y += memcpy_$(BITS).o +lib-y += insn.o inat.o obj-y += msr-reg.o msr-reg-export.o @@ -16,7 +29,9 @@ ifeq ($(CONFIG_X86_32),y) lib-y += checksum_32.o lib-y += strstr_32.o lib-y += semaphore_32.o string_32.o - +ifneq ($(CONFIG_X86_CMPXCHG64),y) + lib-y += cmpxchg8b_emu.o +endif lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o else obj-y += io_64.o iomap_copy_64.o diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S new file mode 100644 index 00000000000..828cb710dec --- /dev/null +++ b/arch/x86/lib/cmpxchg8b_emu.S @@ -0,0 +1,57 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + */ + +#include <linux/linkage.h> +#include <asm/alternative-asm.h> +#include <asm/frame.h> +#include <asm/dwarf2.h> + + +.text + +/* + * Inputs: + * %esi : memory location to compare + * %eax : low 32 bits of old value + * %edx : high 32 bits of old value + * %ebx : low 32 bits of new value + * %ecx : high 32 bits of new value + */ +ENTRY(cmpxchg8b_emu) +CFI_STARTPROC + +# +# Emulate 'cmpxchg8b (%esi)' on UP except we don't +# set the whole ZF thing (caller will just compare +# eax:edx with the expected value) +# +cmpxchg8b_emu: + pushfl + cli + + cmpl (%esi), %eax + jne not_same + cmpl 4(%esi), %edx + jne half_same + + movl %ebx, (%esi) + movl %ecx, 4(%esi) + + popfl + ret + + not_same: + movl (%esi), %eax + half_same: + movl 4(%esi), %edx + + popfl + ret + +CFI_ENDPROC +ENDPROC(cmpxchg8b_emu) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 6ba0f7bb85e..cf889d4e076 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -65,7 +65,7 @@ .endm /* Standard copy_to_user with segment limit checking */ -ENTRY(copy_to_user) +ENTRY(_copy_to_user) CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rdi,%rcx @@ -75,10 +75,10 @@ ENTRY(copy_to_user) jae bad_to_user ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string CFI_ENDPROC -ENDPROC(copy_to_user) +ENDPROC(_copy_to_user) /* Standard copy_from_user with segment limit checking */ -ENTRY(copy_from_user) +ENTRY(_copy_from_user) CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rsi,%rcx @@ -88,7 +88,7 @@ ENTRY(copy_from_user) jae bad_from_user ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string CFI_ENDPROC -ENDPROC(copy_from_user) +ENDPROC(_copy_from_user) ENTRY(copy_user_generic) CFI_STARTPROC @@ -96,12 +96,6 @@ ENTRY(copy_user_generic) CFI_ENDPROC ENDPROC(copy_user_generic) -ENTRY(__copy_from_user_inatomic) - CFI_STARTPROC - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string - CFI_ENDPROC -ENDPROC(__copy_from_user_inatomic) - .section .fixup,"ax" /* must zero dest */ ENTRY(bad_from_user) diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c new file mode 100644 index 00000000000..46fc4ee09fc --- /dev/null +++ b/arch/x86/lib/inat.c @@ -0,0 +1,90 @@ +/* + * x86 instruction attribute tables + * + * Written by Masami Hiramatsu <mhiramat@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ +#include <asm/insn.h> + +/* Attribute tables are generated from opcode map */ +#include "inat-tables.c" + +/* Attribute search APIs */ +insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode) +{ + return inat_primary_table[opcode]; +} + +insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, insn_byte_t last_pfx, + insn_attr_t esc_attr) +{ + const insn_attr_t *table; + insn_attr_t lpfx_attr; + int n, m = 0; + + n = inat_escape_id(esc_attr); + if (last_pfx) { + lpfx_attr = inat_get_opcode_attribute(last_pfx); + m = inat_last_prefix_id(lpfx_attr); + } + table = inat_escape_tables[n][0]; + if (!table) + return 0; + if (inat_has_variant(table[opcode]) && m) { + table = inat_escape_tables[n][m]; + if (!table) + return 0; + } + return table[opcode]; +} + +insn_attr_t inat_get_group_attribute(insn_byte_t modrm, insn_byte_t last_pfx, + insn_attr_t grp_attr) +{ + const insn_attr_t *table; + insn_attr_t lpfx_attr; + int n, m = 0; + + n = inat_group_id(grp_attr); + if (last_pfx) { + lpfx_attr = inat_get_opcode_attribute(last_pfx); + m = inat_last_prefix_id(lpfx_attr); + } + table = inat_group_tables[n][0]; + if (!table) + return inat_group_common_attribute(grp_attr); + if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && m) { + table = inat_group_tables[n][m]; + if (!table) + return inat_group_common_attribute(grp_attr); + } + return table[X86_MODRM_REG(modrm)] | + inat_group_common_attribute(grp_attr); +} + +insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, + insn_byte_t vex_p) +{ + const insn_attr_t *table; + if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX) + return 0; + table = inat_avx_tables[vex_m][vex_p]; + if (!table) + return 0; + return table[opcode]; +} + diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c new file mode 100644 index 00000000000..9f33b984d0e --- /dev/null +++ b/arch/x86/lib/insn.c @@ -0,0 +1,516 @@ +/* + * x86 instruction analysis + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004, 2009 + */ + +#include <linux/string.h> +#include <asm/inat.h> +#include <asm/insn.h> + +#define get_next(t, insn) \ + ({t r; r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; }) + +#define peek_next(t, insn) \ + ({t r; r = *(t*)insn->next_byte; r; }) + +#define peek_nbyte_next(t, insn, n) \ + ({t r; r = *(t*)((insn)->next_byte + n); r; }) + +/** + * insn_init() - initialize struct insn + * @insn: &struct insn to be initialized + * @kaddr: address (in kernel memory) of instruction (or copy thereof) + * @x86_64: !0 for 64-bit kernel or 64-bit app + */ +void insn_init(struct insn *insn, const void *kaddr, int x86_64) +{ + memset(insn, 0, sizeof(*insn)); + insn->kaddr = kaddr; + insn->next_byte = kaddr; + insn->x86_64 = x86_64 ? 1 : 0; + insn->opnd_bytes = 4; + if (x86_64) + insn->addr_bytes = 8; + else + insn->addr_bytes = 4; +} + +/** + * insn_get_prefixes - scan x86 instruction prefix bytes + * @insn: &struct insn containing instruction + * + * Populates the @insn->prefixes bitmap, and updates @insn->next_byte + * to point to the (first) opcode. No effect if @insn->prefixes.got + * is already set. + */ +void insn_get_prefixes(struct insn *insn) +{ + struct insn_field *prefixes = &insn->prefixes; + insn_attr_t attr; + insn_byte_t b, lb; + int i, nb; + + if (prefixes->got) + return; + + nb = 0; + lb = 0; + b = peek_next(insn_byte_t, insn); + attr = inat_get_opcode_attribute(b); + while (inat_is_legacy_prefix(attr)) { + /* Skip if same prefix */ + for (i = 0; i < nb; i++) + if (prefixes->bytes[i] == b) + goto found; + if (nb == 4) + /* Invalid instruction */ + break; + prefixes->bytes[nb++] = b; + if (inat_is_address_size_prefix(attr)) { + /* address size switches 2/4 or 4/8 */ + if (insn->x86_64) + insn->addr_bytes ^= 12; + else + insn->addr_bytes ^= 6; + } else if (inat_is_operand_size_prefix(attr)) { + /* oprand size switches 2/4 */ + insn->opnd_bytes ^= 6; + } +found: + prefixes->nbytes++; + insn->next_byte++; + lb = b; + b = peek_next(insn_byte_t, insn); + attr = inat_get_opcode_attribute(b); + } + /* Set the last prefix */ + if (lb && lb != insn->prefixes.bytes[3]) { + if (unlikely(insn->prefixes.bytes[3])) { + /* Swap the last prefix */ + b = insn->prefixes.bytes[3]; + for (i = 0; i < nb; i++) + if (prefixes->bytes[i] == lb) + prefixes->bytes[i] = b; + } + insn->prefixes.bytes[3] = lb; + } + + /* Decode REX prefix */ + if (insn->x86_64) { + b = peek_next(insn_byte_t, insn); + attr = inat_get_opcode_attribute(b); + if (inat_is_rex_prefix(attr)) { + insn->rex_prefix.value = b; + insn->rex_prefix.nbytes = 1; + insn->next_byte++; + if (X86_REX_W(b)) + /* REX.W overrides opnd_size */ + insn->opnd_bytes = 8; + } + } + insn->rex_prefix.got = 1; + + /* Decode VEX prefix */ + b = peek_next(insn_byte_t, insn); + attr = inat_get_opcode_attribute(b); + if (inat_is_vex_prefix(attr)) { + insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1); + if (!insn->x86_64) { + /* + * In 32-bits mode, if the [7:6] bits (mod bits of + * ModRM) on the second byte are not 11b, it is + * LDS or LES. + */ + if (X86_MODRM_MOD(b2) != 3) + goto vex_end; + } + insn->vex_prefix.bytes[0] = b; + insn->vex_prefix.bytes[1] = b2; + if (inat_is_vex3_prefix(attr)) { + b2 = peek_nbyte_next(insn_byte_t, insn, 2); + insn->vex_prefix.bytes[2] = b2; + insn->vex_prefix.nbytes = 3; + insn->next_byte += 3; + if (insn->x86_64 && X86_VEX_W(b2)) + /* VEX.W overrides opnd_size */ + insn->opnd_bytes = 8; + } else { + insn->vex_prefix.nbytes = 2; + insn->next_byte += 2; + } + } +vex_end: + insn->vex_prefix.got = 1; + + prefixes->got = 1; + return; +} + +/** + * insn_get_opcode - collect opcode(s) + * @insn: &struct insn containing instruction + * + * Populates @insn->opcode, updates @insn->next_byte to point past the + * opcode byte(s), and set @insn->attr (except for groups). + * If necessary, first collects any preceding (prefix) bytes. + * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got + * is already 1. + */ +void insn_get_opcode(struct insn *insn) +{ + struct insn_field *opcode = &insn->opcode; + insn_byte_t op, pfx; + if (opcode->got) + return; + if (!insn->prefixes.got) + insn_get_prefixes(insn); + + /* Get first opcode */ + op = get_next(insn_byte_t, insn); + opcode->bytes[0] = op; + opcode->nbytes = 1; + + /* Check if there is VEX prefix or not */ + if (insn_is_avx(insn)) { + insn_byte_t m, p; + m = insn_vex_m_bits(insn); + p = insn_vex_p_bits(insn); + insn->attr = inat_get_avx_attribute(op, m, p); + if (!inat_accept_vex(insn->attr)) + insn->attr = 0; /* This instruction is bad */ + goto end; /* VEX has only 1 byte for opcode */ + } + + insn->attr = inat_get_opcode_attribute(op); + while (inat_is_escape(insn->attr)) { + /* Get escaped opcode */ + op = get_next(insn_byte_t, insn); + opcode->bytes[opcode->nbytes++] = op; + pfx = insn_last_prefix(insn); + insn->attr = inat_get_escape_attribute(op, pfx, insn->attr); + } + if (inat_must_vex(insn->attr)) + insn->attr = 0; /* This instruction is bad */ +end: + opcode->got = 1; +} + +/** + * insn_get_modrm - collect ModRM byte, if any + * @insn: &struct insn containing instruction + * + * Populates @insn->modrm and updates @insn->next_byte to point past the + * ModRM byte, if any. If necessary, first collects the preceding bytes + * (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1. + */ +void insn_get_modrm(struct insn *insn) +{ + struct insn_field *modrm = &insn->modrm; + insn_byte_t pfx, mod; + if (modrm->got) + return; + if (!insn->opcode.got) + insn_get_opcode(insn); + + if (inat_has_modrm(insn->attr)) { + mod = get_next(insn_byte_t, insn); + modrm->value = mod; + modrm->nbytes = 1; + if (inat_is_group(insn->attr)) { + pfx = insn_last_prefix(insn); + insn->attr = inat_get_group_attribute(mod, pfx, + insn->attr); + } + } + + if (insn->x86_64 && inat_is_force64(insn->attr)) + insn->opnd_bytes = 8; + modrm->got = 1; +} + + +/** + * insn_rip_relative() - Does instruction use RIP-relative addressing mode? + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * ModRM byte. No effect if @insn->x86_64 is 0. + */ +int insn_rip_relative(struct insn *insn) +{ + struct insn_field *modrm = &insn->modrm; + + if (!insn->x86_64) + return 0; + if (!modrm->got) + insn_get_modrm(insn); + /* + * For rip-relative instructions, the mod field (top 2 bits) + * is zero and the r/m field (bottom 3 bits) is 0x5. + */ + return (modrm->nbytes && (modrm->value & 0xc7) == 0x5); +} + +/** + * insn_get_sib() - Get the SIB byte of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * ModRM byte. + */ +void insn_get_sib(struct insn *insn) +{ + insn_byte_t modrm; + + if (insn->sib.got) + return; + if (!insn->modrm.got) + insn_get_modrm(insn); + if (insn->modrm.nbytes) { + modrm = (insn_byte_t)insn->modrm.value; + if (insn->addr_bytes != 2 && + X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) { + insn->sib.value = get_next(insn_byte_t, insn); + insn->sib.nbytes = 1; + } + } + insn->sib.got = 1; +} + + +/** + * insn_get_displacement() - Get the displacement of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * SIB byte. + * Displacement value is sign-expanded. + */ +void insn_get_displacement(struct insn *insn) +{ + insn_byte_t mod, rm, base; + + if (insn->displacement.got) + return; + if (!insn->sib.got) + insn_get_sib(insn); + if (insn->modrm.nbytes) { + /* + * Interpreting the modrm byte: + * mod = 00 - no displacement fields (exceptions below) + * mod = 01 - 1-byte displacement field + * mod = 10 - displacement field is 4 bytes, or 2 bytes if + * address size = 2 (0x67 prefix in 32-bit mode) + * mod = 11 - no memory operand + * + * If address size = 2... + * mod = 00, r/m = 110 - displacement field is 2 bytes + * + * If address size != 2... + * mod != 11, r/m = 100 - SIB byte exists + * mod = 00, SIB base = 101 - displacement field is 4 bytes + * mod = 00, r/m = 101 - rip-relative addressing, displacement + * field is 4 bytes + */ + mod = X86_MODRM_MOD(insn->modrm.value); + rm = X86_MODRM_RM(insn->modrm.value); + base = X86_SIB_BASE(insn->sib.value); + if (mod == 3) + goto out; + if (mod == 1) { + insn->displacement.value = get_next(char, insn); + insn->displacement.nbytes = 1; + } else if (insn->addr_bytes == 2) { + if ((mod == 0 && rm == 6) || mod == 2) { + insn->displacement.value = + get_next(short, insn); + insn->displacement.nbytes = 2; + } + } else { + if ((mod == 0 && rm == 5) || mod == 2 || + (mod == 0 && base == 5)) { + insn->displacement.value = get_next(int, insn); + insn->displacement.nbytes = 4; + } + } + } +out: + insn->displacement.got = 1; +} + +/* Decode moffset16/32/64 */ +static void __get_moffset(struct insn *insn) +{ + switch (insn->addr_bytes) { + case 2: + insn->moffset1.value = get_next(short, insn); + insn->moffset1.nbytes = 2; + break; + case 4: + insn->moffset1.value = get_next(int, insn); + insn->moffset1.nbytes = 4; + break; + case 8: + insn->moffset1.value = get_next(int, insn); + insn->moffset1.nbytes = 4; + insn->moffset2.value = get_next(int, insn); + insn->moffset2.nbytes = 4; + break; + } + insn->moffset1.got = insn->moffset2.got = 1; +} + +/* Decode imm v32(Iz) */ +static void __get_immv32(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate.value = get_next(short, insn); + insn->immediate.nbytes = 2; + break; + case 4: + case 8: + insn->immediate.value = get_next(int, insn); + insn->immediate.nbytes = 4; + break; + } +} + +/* Decode imm v64(Iv/Ov) */ +static void __get_immv(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate1.value = get_next(short, insn); + insn->immediate1.nbytes = 2; + break; + case 4: + insn->immediate1.value = get_next(int, insn); + insn->immediate1.nbytes = 4; + break; + case 8: + insn->immediate1.value = get_next(int, insn); + insn->immediate1.nbytes = 4; + insn->immediate2.value = get_next(int, insn); + insn->immediate2.nbytes = 4; + break; + } + insn->immediate1.got = insn->immediate2.got = 1; +} + +/* Decode ptr16:16/32(Ap) */ +static void __get_immptr(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate1.value = get_next(short, insn); + insn->immediate1.nbytes = 2; + break; + case 4: + insn->immediate1.value = get_next(int, insn); + insn->immediate1.nbytes = 4; + break; + case 8: + /* ptr16:64 is not exist (no segment) */ + return; + } + insn->immediate2.value = get_next(unsigned short, insn); + insn->immediate2.nbytes = 2; + insn->immediate1.got = insn->immediate2.got = 1; +} + +/** + * insn_get_immediate() - Get the immediates of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * displacement bytes. + * Basically, most of immediates are sign-expanded. Unsigned-value can be + * get by bit masking with ((1 << (nbytes * 8)) - 1) + */ +void insn_get_immediate(struct insn *insn) +{ + if (insn->immediate.got) + return; + if (!insn->displacement.got) + insn_get_displacement(insn); + + if (inat_has_moffset(insn->attr)) { + __get_moffset(insn); + goto done; + } + + if (!inat_has_immediate(insn->attr)) + /* no immediates */ + goto done; + + switch (inat_immediate_size(insn->attr)) { + case INAT_IMM_BYTE: + insn->immediate.value = get_next(char, insn); + insn->immediate.nbytes = 1; + break; + case INAT_IMM_WORD: + insn->immediate.value = get_next(short, insn); + insn->immediate.nbytes = 2; + break; + case INAT_IMM_DWORD: + insn->immediate.value = get_next(int, insn); + insn->immediate.nbytes = 4; + break; + case INAT_IMM_QWORD: + insn->immediate1.value = get_next(int, insn); + insn->immediate1.nbytes = 4; + insn->immediate2.value = get_next(int, insn); + insn->immediate2.nbytes = 4; + break; + case INAT_IMM_PTR: + __get_immptr(insn); + break; + case INAT_IMM_VWORD32: + __get_immv32(insn); + break; + case INAT_IMM_VWORD: + __get_immv(insn); + break; + default: + break; + } + if (inat_has_second_immediate(insn->attr)) { + insn->immediate2.value = get_next(char, insn); + insn->immediate2.nbytes = 1; + } +done: + insn->immediate.got = 1; +} + +/** + * insn_get_length() - Get the length of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * immediates bytes. + */ +void insn_get_length(struct insn *insn) +{ + if (insn->length) + return; + if (!insn->immediate.got) + insn_get_immediate(insn); + insn->length = (unsigned char)((unsigned long)insn->next_byte + - (unsigned long)insn->kaddr); +} diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 1f118d462ac..e218d5df85f 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -874,7 +874,7 @@ EXPORT_SYMBOL(copy_to_user); * data to the requested size using zero bytes. */ unsigned long -copy_from_user(void *to, const void __user *from, unsigned long n) +_copy_from_user(void *to, const void __user *from, unsigned long n) { if (access_ok(VERIFY_READ, from, n)) n = __copy_from_user(to, from, n); @@ -882,4 +882,10 @@ copy_from_user(void *to, const void __user *from, unsigned long n) memset(to, 0, n); return n; } -EXPORT_SYMBOL(copy_from_user); +EXPORT_SYMBOL(_copy_from_user); + +void copy_from_user_overflow(void) +{ + WARN(1, "Buffer overflow detected!\n"); +} +EXPORT_SYMBOL(copy_from_user_overflow); diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt new file mode 100644 index 00000000000..a793da5e560 --- /dev/null +++ b/arch/x86/lib/x86-opcode-map.txt @@ -0,0 +1,893 @@ +# x86 Opcode Maps +# +#<Opcode maps> +# Table: table-name +# Referrer: escaped-name +# AVXcode: avx-code +# opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] +# (or) +# opcode: escape # escaped-name +# EndTable +# +#<group maps> +# GrpTable: GrpXXX +# reg: mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] +# EndTable +# +# AVX Superscripts +# (VEX): this opcode can accept VEX prefix. +# (oVEX): this opcode requires VEX prefix. +# (o128): this opcode only supports 128bit VEX. +# (o256): this opcode only supports 256bit VEX. +# + +Table: one byte opcode +Referrer: +AVXcode: +# 0x00 - 0x0f +00: ADD Eb,Gb +01: ADD Ev,Gv +02: ADD Gb,Eb +03: ADD Gv,Ev +04: ADD AL,Ib +05: ADD rAX,Iz +06: PUSH ES (i64) +07: POP ES (i64) +08: OR Eb,Gb +09: OR Ev,Gv +0a: OR Gb,Eb +0b: OR Gv,Ev +0c: OR AL,Ib +0d: OR rAX,Iz +0e: PUSH CS (i64) +0f: escape # 2-byte escape +# 0x10 - 0x1f +10: ADC Eb,Gb +11: ADC Ev,Gv +12: ADC Gb,Eb +13: ADC Gv,Ev +14: ADC AL,Ib +15: ADC rAX,Iz +16: PUSH SS (i64) +17: POP SS (i64) +18: SBB Eb,Gb +19: SBB Ev,Gv +1a: SBB Gb,Eb +1b: SBB Gv,Ev +1c: SBB AL,Ib +1d: SBB rAX,Iz +1e: PUSH DS (i64) +1f: POP DS (i64) +# 0x20 - 0x2f +20: AND Eb,Gb +21: AND Ev,Gv +22: AND Gb,Eb +23: AND Gv,Ev +24: AND AL,Ib +25: AND rAx,Iz +26: SEG=ES (Prefix) +27: DAA (i64) +28: SUB Eb,Gb +29: SUB Ev,Gv +2a: SUB Gb,Eb +2b: SUB Gv,Ev +2c: SUB AL,Ib +2d: SUB rAX,Iz +2e: SEG=CS (Prefix) +2f: DAS (i64) +# 0x30 - 0x3f +30: XOR Eb,Gb +31: XOR Ev,Gv +32: XOR Gb,Eb +33: XOR Gv,Ev +34: XOR AL,Ib +35: XOR rAX,Iz +36: SEG=SS (Prefix) +37: AAA (i64) +38: CMP Eb,Gb +39: CMP Ev,Gv +3a: CMP Gb,Eb +3b: CMP Gv,Ev +3c: CMP AL,Ib +3d: CMP rAX,Iz +3e: SEG=DS (Prefix) +3f: AAS (i64) +# 0x40 - 0x4f +40: INC eAX (i64) | REX (o64) +41: INC eCX (i64) | REX.B (o64) +42: INC eDX (i64) | REX.X (o64) +43: INC eBX (i64) | REX.XB (o64) +44: INC eSP (i64) | REX.R (o64) +45: INC eBP (i64) | REX.RB (o64) +46: INC eSI (i64) | REX.RX (o64) +47: INC eDI (i64) | REX.RXB (o64) +48: DEC eAX (i64) | REX.W (o64) +49: DEC eCX (i64) | REX.WB (o64) +4a: DEC eDX (i64) | REX.WX (o64) +4b: DEC eBX (i64) | REX.WXB (o64) +4c: DEC eSP (i64) | REX.WR (o64) +4d: DEC eBP (i64) | REX.WRB (o64) +4e: DEC eSI (i64) | REX.WRX (o64) +4f: DEC eDI (i64) | REX.WRXB (o64) +# 0x50 - 0x5f +50: PUSH rAX/r8 (d64) +51: PUSH rCX/r9 (d64) +52: PUSH rDX/r10 (d64) +53: PUSH rBX/r11 (d64) +54: PUSH rSP/r12 (d64) +55: PUSH rBP/r13 (d64) +56: PUSH rSI/r14 (d64) +57: PUSH rDI/r15 (d64) +58: POP rAX/r8 (d64) +59: POP rCX/r9 (d64) +5a: POP rDX/r10 (d64) +5b: POP rBX/r11 (d64) +5c: POP rSP/r12 (d64) +5d: POP rBP/r13 (d64) +5e: POP rSI/r14 (d64) +5f: POP rDI/r15 (d64) +# 0x60 - 0x6f +60: PUSHA/PUSHAD (i64) +61: POPA/POPAD (i64) +62: BOUND Gv,Ma (i64) +63: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64) +64: SEG=FS (Prefix) +65: SEG=GS (Prefix) +66: Operand-Size (Prefix) +67: Address-Size (Prefix) +68: PUSH Iz (d64) +69: IMUL Gv,Ev,Iz +6a: PUSH Ib (d64) +6b: IMUL Gv,Ev,Ib +6c: INS/INSB Yb,DX +6d: INS/INSW/INSD Yz,DX +6e: OUTS/OUTSB DX,Xb +6f: OUTS/OUTSW/OUTSD DX,Xz +# 0x70 - 0x7f +70: JO Jb +71: JNO Jb +72: JB/JNAE/JC Jb +73: JNB/JAE/JNC Jb +74: JZ/JE Jb +75: JNZ/JNE Jb +76: JBE/JNA Jb +77: JNBE/JA Jb +78: JS Jb +79: JNS Jb +7a: JP/JPE Jb +7b: JNP/JPO Jb +7c: JL/JNGE Jb +7d: JNL/JGE Jb +7e: JLE/JNG Jb +7f: JNLE/JG Jb +# 0x80 - 0x8f +80: Grp1 Eb,Ib (1A) +81: Grp1 Ev,Iz (1A) +82: Grp1 Eb,Ib (1A),(i64) +83: Grp1 Ev,Ib (1A) +84: TEST Eb,Gb +85: TEST Ev,Gv +86: XCHG Eb,Gb +87: XCHG Ev,Gv +88: MOV Eb,Gb +89: MOV Ev,Gv +8a: MOV Gb,Eb +8b: MOV Gv,Ev +8c: MOV Ev,Sw +8d: LEA Gv,M +8e: MOV Sw,Ew +8f: Grp1A (1A) | POP Ev (d64) +# 0x90 - 0x9f +90: NOP | PAUSE (F3) | XCHG r8,rAX +91: XCHG rCX/r9,rAX +92: XCHG rDX/r10,rAX +93: XCHG rBX/r11,rAX +94: XCHG rSP/r12,rAX +95: XCHG rBP/r13,rAX +96: XCHG rSI/r14,rAX +97: XCHG rDI/r15,rAX +98: CBW/CWDE/CDQE +99: CWD/CDQ/CQO +9a: CALLF Ap (i64) +9b: FWAIT/WAIT +9c: PUSHF/D/Q Fv (d64) +9d: POPF/D/Q Fv (d64) +9e: SAHF +9f: LAHF +# 0xa0 - 0xaf +a0: MOV AL,Ob +a1: MOV rAX,Ov +a2: MOV Ob,AL +a3: MOV Ov,rAX +a4: MOVS/B Xb,Yb +a5: MOVS/W/D/Q Xv,Yv +a6: CMPS/B Xb,Yb +a7: CMPS/W/D Xv,Yv +a8: TEST AL,Ib +a9: TEST rAX,Iz +aa: STOS/B Yb,AL +ab: STOS/W/D/Q Yv,rAX +ac: LODS/B AL,Xb +ad: LODS/W/D/Q rAX,Xv +ae: SCAS/B AL,Yb +af: SCAS/W/D/Q rAX,Xv +# 0xb0 - 0xbf +b0: MOV AL/R8L,Ib +b1: MOV CL/R9L,Ib +b2: MOV DL/R10L,Ib +b3: MOV BL/R11L,Ib +b4: MOV AH/R12L,Ib +b5: MOV CH/R13L,Ib +b6: MOV DH/R14L,Ib +b7: MOV BH/R15L,Ib +b8: MOV rAX/r8,Iv +b9: MOV rCX/r9,Iv +ba: MOV rDX/r10,Iv +bb: MOV rBX/r11,Iv +bc: MOV rSP/r12,Iv +bd: MOV rBP/r13,Iv +be: MOV rSI/r14,Iv +bf: MOV rDI/r15,Iv +# 0xc0 - 0xcf +c0: Grp2 Eb,Ib (1A) +c1: Grp2 Ev,Ib (1A) +c2: RETN Iw (f64) +c3: RETN +c4: LES Gz,Mp (i64) | 3bytes-VEX (Prefix) +c5: LDS Gz,Mp (i64) | 2bytes-VEX (Prefix) +c6: Grp11 Eb,Ib (1A) +c7: Grp11 Ev,Iz (1A) +c8: ENTER Iw,Ib +c9: LEAVE (d64) +ca: RETF Iw +cb: RETF +cc: INT3 +cd: INT Ib +ce: INTO (i64) +cf: IRET/D/Q +# 0xd0 - 0xdf +d0: Grp2 Eb,1 (1A) +d1: Grp2 Ev,1 (1A) +d2: Grp2 Eb,CL (1A) +d3: Grp2 Ev,CL (1A) +d4: AAM Ib (i64) +d5: AAD Ib (i64) +d6: +d7: XLAT/XLATB +d8: ESC +d9: ESC +da: ESC +db: ESC +dc: ESC +dd: ESC +de: ESC +df: ESC +# 0xe0 - 0xef +e0: LOOPNE/LOOPNZ Jb (f64) +e1: LOOPE/LOOPZ Jb (f64) +e2: LOOP Jb (f64) +e3: JrCXZ Jb (f64) +e4: IN AL,Ib +e5: IN eAX,Ib +e6: OUT Ib,AL +e7: OUT Ib,eAX +e8: CALL Jz (f64) +e9: JMP-near Jz (f64) +ea: JMP-far Ap (i64) +eb: JMP-short Jb (f64) +ec: IN AL,DX +ed: IN eAX,DX +ee: OUT DX,AL +ef: OUT DX,eAX +# 0xf0 - 0xff +f0: LOCK (Prefix) +f1: +f2: REPNE (Prefix) +f3: REP/REPE (Prefix) +f4: HLT +f5: CMC +f6: Grp3_1 Eb (1A) +f7: Grp3_2 Ev (1A) +f8: CLC +f9: STC +fa: CLI +fb: STI +fc: CLD +fd: STD +fe: Grp4 (1A) +ff: Grp5 (1A) +EndTable + +Table: 2-byte opcode (0x0f) +Referrer: 2-byte escape +AVXcode: 1 +# 0x0f 0x00-0x0f +00: Grp6 (1A) +01: Grp7 (1A) +02: LAR Gv,Ew +03: LSL Gv,Ew +04: +05: SYSCALL (o64) +06: CLTS +07: SYSRET (o64) +08: INVD +09: WBINVD +0a: +0b: UD2 (1B) +0c: +0d: NOP Ev | GrpP +0e: FEMMS +# 3DNow! uses the last imm byte as opcode extension. +0f: 3DNow! Pq,Qq,Ib +# 0x0f 0x10-0x1f +10: movups Vps,Wps (VEX) | movss Vss,Wss (F3),(VEX),(o128) | movupd Vpd,Wpd (66),(VEX) | movsd Vsd,Wsd (F2),(VEX),(o128) +11: movups Wps,Vps (VEX) | movss Wss,Vss (F3),(VEX),(o128) | movupd Wpd,Vpd (66),(VEX) | movsd Wsd,Vsd (F2),(VEX),(o128) +12: movlps Vq,Mq (VEX),(o128) | movlpd Vq,Mq (66),(VEX),(o128) | movhlps Vq,Uq (VEX),(o128) | movddup Vq,Wq (F2),(VEX) | movsldup Vq,Wq (F3),(VEX) +13: mpvlps Mq,Vq (VEX),(o128) | movlpd Mq,Vq (66),(VEX),(o128) +14: unpcklps Vps,Wq (VEX) | unpcklpd Vpd,Wq (66),(VEX) +15: unpckhps Vps,Wq (VEX) | unpckhpd Vpd,Wq (66),(VEX) +16: movhps Vq,Mq (VEX),(o128) | movhpd Vq,Mq (66),(VEX),(o128) | movlsps Vq,Uq (VEX),(o128) | movshdup Vq,Wq (F3),(VEX) +17: movhps Mq,Vq (VEX),(o128) | movhpd Mq,Vq (66),(VEX),(o128) +18: Grp16 (1A) +19: +1a: +1b: +1c: +1d: +1e: +1f: NOP Ev +# 0x0f 0x20-0x2f +20: MOV Rd,Cd +21: MOV Rd,Dd +22: MOV Cd,Rd +23: MOV Dd,Rd +24: +25: +26: +27: +28: movaps Vps,Wps (VEX) | movapd Vpd,Wpd (66),(VEX) +29: movaps Wps,Vps (VEX) | movapd Wpd,Vpd (66),(VEX) +2a: cvtpi2ps Vps,Qpi | cvtsi2ss Vss,Ed/q (F3),(VEX),(o128) | cvtpi2pd Vpd,Qpi (66) | cvtsi2sd Vsd,Ed/q (F2),(VEX),(o128) +2b: movntps Mps,Vps (VEX) | movntpd Mpd,Vpd (66),(VEX) +2c: cvttps2pi Ppi,Wps | cvttss2si Gd/q,Wss (F3),(VEX),(o128) | cvttpd2pi Ppi,Wpd (66) | cvttsd2si Gd/q,Wsd (F2),(VEX),(o128) +2d: cvtps2pi Ppi,Wps | cvtss2si Gd/q,Wss (F3),(VEX),(o128) | cvtpd2pi Qpi,Wpd (66) | cvtsd2si Gd/q,Wsd (F2),(VEX),(o128) +2e: ucomiss Vss,Wss (VEX),(o128) | ucomisd Vsd,Wsd (66),(VEX),(o128) +2f: comiss Vss,Wss (VEX),(o128) | comisd Vsd,Wsd (66),(VEX),(o128) +# 0x0f 0x30-0x3f +30: WRMSR +31: RDTSC +32: RDMSR +33: RDPMC +34: SYSENTER +35: SYSEXIT +36: +37: GETSEC +38: escape # 3-byte escape 1 +39: +3a: escape # 3-byte escape 2 +3b: +3c: +3d: +3e: +3f: +# 0x0f 0x40-0x4f +40: CMOVO Gv,Ev +41: CMOVNO Gv,Ev +42: CMOVB/C/NAE Gv,Ev +43: CMOVAE/NB/NC Gv,Ev +44: CMOVE/Z Gv,Ev +45: CMOVNE/NZ Gv,Ev +46: CMOVBE/NA Gv,Ev +47: CMOVA/NBE Gv,Ev +48: CMOVS Gv,Ev +49: CMOVNS Gv,Ev +4a: CMOVP/PE Gv,Ev +4b: CMOVNP/PO Gv,Ev +4c: CMOVL/NGE Gv,Ev +4d: CMOVNL/GE Gv,Ev +4e: CMOVLE/NG Gv,Ev +4f: CMOVNLE/G Gv,Ev +# 0x0f 0x50-0x5f +50: movmskps Gd/q,Ups (VEX) | movmskpd Gd/q,Upd (66),(VEX) +51: sqrtps Vps,Wps (VEX) | sqrtss Vss,Wss (F3),(VEX),(o128) | sqrtpd Vpd,Wpd (66),(VEX) | sqrtsd Vsd,Wsd (F2),(VEX),(o128) +52: rsqrtps Vps,Wps (VEX) | rsqrtss Vss,Wss (F3),(VEX),(o128) +53: rcpps Vps,Wps (VEX) | rcpss Vss,Wss (F3),(VEX),(o128) +54: andps Vps,Wps (VEX) | andpd Vpd,Wpd (66),(VEX) +55: andnps Vps,Wps (VEX) | andnpd Vpd,Wpd (66),(VEX) +56: orps Vps,Wps (VEX) | orpd Vpd,Wpd (66),(VEX) +57: xorps Vps,Wps (VEX) | xorpd Vpd,Wpd (66),(VEX) +58: addps Vps,Wps (VEX) | addss Vss,Wss (F3),(VEX),(o128) | addpd Vpd,Wpd (66),(VEX) | addsd Vsd,Wsd (F2),(VEX),(o128) +59: mulps Vps,Wps (VEX) | mulss Vss,Wss (F3),(VEX),(o128) | mulpd Vpd,Wpd (66),(VEX) | mulsd Vsd,Wsd (F2),(VEX),(o128) +5a: cvtps2pd Vpd,Wps (VEX) | cvtss2sd Vsd,Wss (F3),(VEX),(o128) | cvtpd2ps Vps,Wpd (66),(VEX) | cvtsd2ss Vsd,Wsd (F2),(VEX),(o128) +5b: cvtdq2ps Vps,Wdq (VEX) | cvtps2dq Vdq,Wps (66),(VEX) | cvttps2dq Vdq,Wps (F3),(VEX) +5c: subps Vps,Wps (VEX) | subss Vss,Wss (F3),(VEX),(o128) | subpd Vpd,Wpd (66),(VEX) | subsd Vsd,Wsd (F2),(VEX),(o128) +5d: minps Vps,Wps (VEX) | minss Vss,Wss (F3),(VEX),(o128) | minpd Vpd,Wpd (66),(VEX) | minsd Vsd,Wsd (F2),(VEX),(o128) +5e: divps Vps,Wps (VEX) | divss Vss,Wss (F3),(VEX),(o128) | divpd Vpd,Wpd (66),(VEX) | divsd Vsd,Wsd (F2),(VEX),(o128) +5f: maxps Vps,Wps (VEX) | maxss Vss,Wss (F3),(VEX),(o128) | maxpd Vpd,Wpd (66),(VEX) | maxsd Vsd,Wsd (F2),(VEX),(o128) +# 0x0f 0x60-0x6f +60: punpcklbw Pq,Qd | punpcklbw Vdq,Wdq (66),(VEX),(o128) +61: punpcklwd Pq,Qd | punpcklwd Vdq,Wdq (66),(VEX),(o128) +62: punpckldq Pq,Qd | punpckldq Vdq,Wdq (66),(VEX),(o128) +63: packsswb Pq,Qq | packsswb Vdq,Wdq (66),(VEX),(o128) +64: pcmpgtb Pq,Qq | pcmpgtb Vdq,Wdq (66),(VEX),(o128) +65: pcmpgtw Pq,Qq | pcmpgtw Vdq,Wdq (66),(VEX),(o128) +66: pcmpgtd Pq,Qq | pcmpgtd Vdq,Wdq (66),(VEX),(o128) +67: packuswb Pq,Qq | packuswb Vdq,Wdq (66),(VEX),(o128) +68: punpckhbw Pq,Qd | punpckhbw Vdq,Wdq (66),(VEX),(o128) +69: punpckhwd Pq,Qd | punpckhwd Vdq,Wdq (66),(VEX),(o128) +6a: punpckhdq Pq,Qd | punpckhdq Vdq,Wdq (66),(VEX),(o128) +6b: packssdw Pq,Qd | packssdw Vdq,Wdq (66),(VEX),(o128) +6c: punpcklqdq Vdq,Wdq (66),(VEX),(o128) +6d: punpckhqdq Vdq,Wdq (66),(VEX),(o128) +6e: movd/q/ Pd,Ed/q | movd/q Vdq,Ed/q (66),(VEX),(o128) +6f: movq Pq,Qq | movdqa Vdq,Wdq (66),(VEX) | movdqu Vdq,Wdq (F3),(VEX) +# 0x0f 0x70-0x7f +70: pshufw Pq,Qq,Ib | pshufd Vdq,Wdq,Ib (66),(VEX),(o128) | pshufhw Vdq,Wdq,Ib (F3),(VEX),(o128) | pshuflw VdqWdq,Ib (F2),(VEX),(o128) +71: Grp12 (1A) +72: Grp13 (1A) +73: Grp14 (1A) +74: pcmpeqb Pq,Qq | pcmpeqb Vdq,Wdq (66),(VEX),(o128) +75: pcmpeqw Pq,Qq | pcmpeqw Vdq,Wdq (66),(VEX),(o128) +76: pcmpeqd Pq,Qq | pcmpeqd Vdq,Wdq (66),(VEX),(o128) +77: emms/vzeroupper/vzeroall (VEX) +78: VMREAD Ed/q,Gd/q +79: VMWRITE Gd/q,Ed/q +7a: +7b: +7c: haddps Vps,Wps (F2),(VEX) | haddpd Vpd,Wpd (66),(VEX) +7d: hsubps Vps,Wps (F2),(VEX) | hsubpd Vpd,Wpd (66),(VEX) +7e: movd/q Ed/q,Pd | movd/q Ed/q,Vdq (66),(VEX),(o128) | movq Vq,Wq (F3),(VEX),(o128) +7f: movq Qq,Pq | movdqa Wdq,Vdq (66),(VEX) | movdqu Wdq,Vdq (F3),(VEX) +# 0x0f 0x80-0x8f +80: JO Jz (f64) +81: JNO Jz (f64) +82: JB/JNAE/JC Jz (f64) +83: JNB/JAE/JNC Jz (f64) +84: JZ/JE Jz (f64) +85: JNZ/JNE Jz (f64) +86: JBE/JNA Jz (f64) +87: JNBE/JA Jz (f64) +88: JS Jz (f64) +89: JNS Jz (f64) +8a: JP/JPE Jz (f64) +8b: JNP/JPO Jz (f64) +8c: JL/JNGE Jz (f64) +8d: JNL/JGE Jz (f64) +8e: JLE/JNG Jz (f64) +8f: JNLE/JG Jz (f64) +# 0x0f 0x90-0x9f +90: SETO Eb +91: SETNO Eb +92: SETB/C/NAE Eb +93: SETAE/NB/NC Eb +94: SETE/Z Eb +95: SETNE/NZ Eb +96: SETBE/NA Eb +97: SETA/NBE Eb +98: SETS Eb +99: SETNS Eb +9a: SETP/PE Eb +9b: SETNP/PO Eb +9c: SETL/NGE Eb +9d: SETNL/GE Eb +9e: SETLE/NG Eb +9f: SETNLE/G Eb +# 0x0f 0xa0-0xaf +a0: PUSH FS (d64) +a1: POP FS (d64) +a2: CPUID +a3: BT Ev,Gv +a4: SHLD Ev,Gv,Ib +a5: SHLD Ev,Gv,CL +a6: GrpPDLK +a7: GrpRNG +a8: PUSH GS (d64) +a9: POP GS (d64) +aa: RSM +ab: BTS Ev,Gv +ac: SHRD Ev,Gv,Ib +ad: SHRD Ev,Gv,CL +ae: Grp15 (1A),(1C) +af: IMUL Gv,Ev +# 0x0f 0xb0-0xbf +b0: CMPXCHG Eb,Gb +b1: CMPXCHG Ev,Gv +b2: LSS Gv,Mp +b3: BTR Ev,Gv +b4: LFS Gv,Mp +b5: LGS Gv,Mp +b6: MOVZX Gv,Eb +b7: MOVZX Gv,Ew +b8: JMPE | POPCNT Gv,Ev (F3) +b9: Grp10 (1A) +ba: Grp8 Ev,Ib (1A) +bb: BTC Ev,Gv +bc: BSF Gv,Ev +bd: BSR Gv,Ev +be: MOVSX Gv,Eb +bf: MOVSX Gv,Ew +# 0x0f 0xc0-0xcf +c0: XADD Eb,Gb +c1: XADD Ev,Gv +c2: cmpps Vps,Wps,Ib (VEX) | cmpss Vss,Wss,Ib (F3),(VEX),(o128) | cmppd Vpd,Wpd,Ib (66),(VEX) | cmpsd Vsd,Wsd,Ib (F2),(VEX) +c3: movnti Md/q,Gd/q +c4: pinsrw Pq,Rd/q/Mw,Ib | pinsrw Vdq,Rd/q/Mw,Ib (66),(VEX),(o128) +c5: pextrw Gd,Nq,Ib | pextrw Gd,Udq,Ib (66),(VEX),(o128) +c6: shufps Vps,Wps,Ib (VEX) | shufpd Vpd,Wpd,Ib (66),(VEX) +c7: Grp9 (1A) +c8: BSWAP RAX/EAX/R8/R8D +c9: BSWAP RCX/ECX/R9/R9D +ca: BSWAP RDX/EDX/R10/R10D +cb: BSWAP RBX/EBX/R11/R11D +cc: BSWAP RSP/ESP/R12/R12D +cd: BSWAP RBP/EBP/R13/R13D +ce: BSWAP RSI/ESI/R14/R14D +cf: BSWAP RDI/EDI/R15/R15D +# 0x0f 0xd0-0xdf +d0: addsubps Vps,Wps (F2),(VEX) | addsubpd Vpd,Wpd (66),(VEX) +d1: psrlw Pq,Qq | psrlw Vdq,Wdq (66),(VEX),(o128) +d2: psrld Pq,Qq | psrld Vdq,Wdq (66),(VEX),(o128) +d3: psrlq Pq,Qq | psrlq Vdq,Wdq (66),(VEX),(o128) +d4: paddq Pq,Qq | paddq Vdq,Wdq (66),(VEX),(o128) +d5: pmullw Pq,Qq | pmullw Vdq,Wdq (66),(VEX),(o128) +d6: movq Wq,Vq (66),(VEX),(o128) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2) +d7: pmovmskb Gd,Nq | pmovmskb Gd,Udq (66),(VEX),(o128) +d8: psubusb Pq,Qq | psubusb Vdq,Wdq (66),(VEX),(o128) +d9: psubusw Pq,Qq | psubusw Vdq,Wdq (66),(VEX),(o128) +da: pminub Pq,Qq | pminub Vdq,Wdq (66),(VEX),(o128) +db: pand Pq,Qq | pand Vdq,Wdq (66),(VEX),(o128) +dc: paddusb Pq,Qq | paddusb Vdq,Wdq (66),(VEX),(o128) +dd: paddusw Pq,Qq | paddusw Vdq,Wdq (66),(VEX),(o128) +de: pmaxub Pq,Qq | pmaxub Vdq,Wdq (66),(VEX),(o128) +df: pandn Pq,Qq | pandn Vdq,Wdq (66),(VEX),(o128) +# 0x0f 0xe0-0xef +e0: pavgb Pq,Qq | pavgb Vdq,Wdq (66),(VEX),(o128) +e1: psraw Pq,Qq | psraw Vdq,Wdq (66),(VEX),(o128) +e2: psrad Pq,Qq | psrad Vdq,Wdq (66),(VEX),(o128) +e3: pavgw Pq,Qq | pavgw Vdq,Wdq (66),(VEX),(o128) +e4: pmulhuw Pq,Qq | pmulhuw Vdq,Wdq (66),(VEX),(o128) +e5: pmulhw Pq,Qq | pmulhw Vdq,Wdq (66),(VEX),(o128) +e6: cvtpd2dq Vdq,Wpd (F2),(VEX) | cvttpd2dq Vdq,Wpd (66),(VEX) | cvtdq2pd Vpd,Wdq (F3),(VEX) +e7: movntq Mq,Pq | movntdq Mdq,Vdq (66),(VEX) +e8: psubsb Pq,Qq | psubsb Vdq,Wdq (66),(VEX),(o128) +e9: psubsw Pq,Qq | psubsw Vdq,Wdq (66),(VEX),(o128) +ea: pminsw Pq,Qq | pminsw Vdq,Wdq (66),(VEX),(o128) +eb: por Pq,Qq | por Vdq,Wdq (66),(VEX),(o128) +ec: paddsb Pq,Qq | paddsb Vdq,Wdq (66),(VEX),(o128) +ed: paddsw Pq,Qq | paddsw Vdq,Wdq (66),(VEX),(o128) +ee: pmaxsw Pq,Qq | pmaxsw Vdq,Wdq (66),(VEX),(o128) +ef: pxor Pq,Qq | pxor Vdq,Wdq (66),(VEX),(o128) +# 0x0f 0xf0-0xff +f0: lddqu Vdq,Mdq (F2),(VEX) +f1: psllw Pq,Qq | psllw Vdq,Wdq (66),(VEX),(o128) +f2: pslld Pq,Qq | pslld Vdq,Wdq (66),(VEX),(o128) +f3: psllq Pq,Qq | psllq Vdq,Wdq (66),(VEX),(o128) +f4: pmuludq Pq,Qq | pmuludq Vdq,Wdq (66),(VEX),(o128) +f5: pmaddwd Pq,Qq | pmaddwd Vdq,Wdq (66),(VEX),(o128) +f6: psadbw Pq,Qq | psadbw Vdq,Wdq (66),(VEX),(o128) +f7: maskmovq Pq,Nq | maskmovdqu Vdq,Udq (66),(VEX),(o128) +f8: psubb Pq,Qq | psubb Vdq,Wdq (66),(VEX),(o128) +f9: psubw Pq,Qq | psubw Vdq,Wdq (66),(VEX),(o128) +fa: psubd Pq,Qq | psubd Vdq,Wdq (66),(VEX),(o128) +fb: psubq Pq,Qq | psubq Vdq,Wdq (66),(VEX),(o128) +fc: paddb Pq,Qq | paddb Vdq,Wdq (66),(VEX),(o128) +fd: paddw Pq,Qq | paddw Vdq,Wdq (66),(VEX),(o128) +fe: paddd Pq,Qq | paddd Vdq,Wdq (66),(VEX),(o128) +ff: +EndTable + +Table: 3-byte opcode 1 (0x0f 0x38) +Referrer: 3-byte escape 1 +AVXcode: 2 +# 0x0f 0x38 0x00-0x0f +00: pshufb Pq,Qq | pshufb Vdq,Wdq (66),(VEX),(o128) +01: phaddw Pq,Qq | phaddw Vdq,Wdq (66),(VEX),(o128) +02: phaddd Pq,Qq | phaddd Vdq,Wdq (66),(VEX),(o128) +03: phaddsw Pq,Qq | phaddsw Vdq,Wdq (66),(VEX),(o128) +04: pmaddubsw Pq,Qq | pmaddubsw Vdq,Wdq (66),(VEX),(o128) +05: phsubw Pq,Qq | phsubw Vdq,Wdq (66),(VEX),(o128) +06: phsubd Pq,Qq | phsubd Vdq,Wdq (66),(VEX),(o128) +07: phsubsw Pq,Qq | phsubsw Vdq,Wdq (66),(VEX),(o128) +08: psignb Pq,Qq | psignb Vdq,Wdq (66),(VEX),(o128) +09: psignw Pq,Qq | psignw Vdq,Wdq (66),(VEX),(o128) +0a: psignd Pq,Qq | psignd Vdq,Wdq (66),(VEX),(o128) +0b: pmulhrsw Pq,Qq | pmulhrsw Vdq,Wdq (66),(VEX),(o128) +0c: Vpermilps /r (66),(oVEX) +0d: Vpermilpd /r (66),(oVEX) +0e: vtestps /r (66),(oVEX) +0f: vtestpd /r (66),(oVEX) +# 0x0f 0x38 0x10-0x1f +10: pblendvb Vdq,Wdq (66) +11: +12: +13: +14: blendvps Vdq,Wdq (66) +15: blendvpd Vdq,Wdq (66) +16: +17: ptest Vdq,Wdq (66),(VEX) +18: vbroadcastss /r (66),(oVEX) +19: vbroadcastsd /r (66),(oVEX),(o256) +1a: vbroadcastf128 /r (66),(oVEX),(o256) +1b: +1c: pabsb Pq,Qq | pabsb Vdq,Wdq (66),(VEX),(o128) +1d: pabsw Pq,Qq | pabsw Vdq,Wdq (66),(VEX),(o128) +1e: pabsd Pq,Qq | pabsd Vdq,Wdq (66),(VEX),(o128) +1f: +# 0x0f 0x38 0x20-0x2f +20: pmovsxbw Vdq,Udq/Mq (66),(VEX),(o128) +21: pmovsxbd Vdq,Udq/Md (66),(VEX),(o128) +22: pmovsxbq Vdq,Udq/Mw (66),(VEX),(o128) +23: pmovsxwd Vdq,Udq/Mq (66),(VEX),(o128) +24: pmovsxwq Vdq,Udq/Md (66),(VEX),(o128) +25: pmovsxdq Vdq,Udq/Mq (66),(VEX),(o128) +26: +27: +28: pmuldq Vdq,Wdq (66),(VEX),(o128) +29: pcmpeqq Vdq,Wdq (66),(VEX),(o128) +2a: movntdqa Vdq,Mdq (66),(VEX),(o128) +2b: packusdw Vdq,Wdq (66),(VEX),(o128) +2c: vmaskmovps(ld) /r (66),(oVEX) +2d: vmaskmovpd(ld) /r (66),(oVEX) +2e: vmaskmovps(st) /r (66),(oVEX) +2f: vmaskmovpd(st) /r (66),(oVEX) +# 0x0f 0x38 0x30-0x3f +30: pmovzxbw Vdq,Udq/Mq (66),(VEX),(o128) +31: pmovzxbd Vdq,Udq/Md (66),(VEX),(o128) +32: pmovzxbq Vdq,Udq/Mw (66),(VEX),(o128) +33: pmovzxwd Vdq,Udq/Mq (66),(VEX),(o128) +34: pmovzxwq Vdq,Udq/Md (66),(VEX),(o128) +35: pmovzxdq Vdq,Udq/Mq (66),(VEX),(o128) +36: +37: pcmpgtq Vdq,Wdq (66),(VEX),(o128) +38: pminsb Vdq,Wdq (66),(VEX),(o128) +39: pminsd Vdq,Wdq (66),(VEX),(o128) +3a: pminuw Vdq,Wdq (66),(VEX),(o128) +3b: pminud Vdq,Wdq (66),(VEX),(o128) +3c: pmaxsb Vdq,Wdq (66),(VEX),(o128) +3d: pmaxsd Vdq,Wdq (66),(VEX),(o128) +3e: pmaxuw Vdq,Wdq (66),(VEX),(o128) +3f: pmaxud Vdq,Wdq (66),(VEX),(o128) +# 0x0f 0x38 0x40-0x8f +40: pmulld Vdq,Wdq (66),(VEX),(o128) +41: phminposuw Vdq,Wdq (66),(VEX),(o128) +80: INVEPT Gd/q,Mdq (66) +81: INVPID Gd/q,Mdq (66) +# 0x0f 0x38 0x90-0xbf (FMA) +96: vfmaddsub132pd/ps /r (66),(VEX) +97: vfmsubadd132pd/ps /r (66),(VEX) +98: vfmadd132pd/ps /r (66),(VEX) +99: vfmadd132sd/ss /r (66),(VEX),(o128) +9a: vfmsub132pd/ps /r (66),(VEX) +9b: vfmsub132sd/ss /r (66),(VEX),(o128) +9c: vfnmadd132pd/ps /r (66),(VEX) +9d: vfnmadd132sd/ss /r (66),(VEX),(o128) +9e: vfnmsub132pd/ps /r (66),(VEX) +9f: vfnmsub132sd/ss /r (66),(VEX),(o128) +a6: vfmaddsub213pd/ps /r (66),(VEX) +a7: vfmsubadd213pd/ps /r (66),(VEX) +a8: vfmadd213pd/ps /r (66),(VEX) +a9: vfmadd213sd/ss /r (66),(VEX),(o128) +aa: vfmsub213pd/ps /r (66),(VEX) +ab: vfmsub213sd/ss /r (66),(VEX),(o128) +ac: vfnmadd213pd/ps /r (66),(VEX) +ad: vfnmadd213sd/ss /r (66),(VEX),(o128) +ae: vfnmsub213pd/ps /r (66),(VEX) +af: vfnmsub213sd/ss /r (66),(VEX),(o128) +b6: vfmaddsub231pd/ps /r (66),(VEX) +b7: vfmsubadd231pd/ps /r (66),(VEX) +b8: vfmadd231pd/ps /r (66),(VEX) +b9: vfmadd231sd/ss /r (66),(VEX),(o128) +ba: vfmsub231pd/ps /r (66),(VEX) +bb: vfmsub231sd/ss /r (66),(VEX),(o128) +bc: vfnmadd231pd/ps /r (66),(VEX) +bd: vfnmadd231sd/ss /r (66),(VEX),(o128) +be: vfnmsub231pd/ps /r (66),(VEX) +bf: vfnmsub231sd/ss /r (66),(VEX),(o128) +# 0x0f 0x38 0xc0-0xff +db: aesimc Vdq,Wdq (66),(VEX),(o128) +dc: aesenc Vdq,Wdq (66),(VEX),(o128) +dd: aesenclast Vdq,Wdq (66),(VEX),(o128) +de: aesdec Vdq,Wdq (66),(VEX),(o128) +df: aesdeclast Vdq,Wdq (66),(VEX),(o128) +f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2) +f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2) +EndTable + +Table: 3-byte opcode 2 (0x0f 0x3a) +Referrer: 3-byte escape 2 +AVXcode: 3 +# 0x0f 0x3a 0x00-0xff +04: vpermilps /r,Ib (66),(oVEX) +05: vpermilpd /r,Ib (66),(oVEX) +06: vperm2f128 /r,Ib (66),(oVEX),(o256) +08: roundps Vdq,Wdq,Ib (66),(VEX) +09: roundpd Vdq,Wdq,Ib (66),(VEX) +0a: roundss Vss,Wss,Ib (66),(VEX),(o128) +0b: roundsd Vsd,Wsd,Ib (66),(VEX),(o128) +0c: blendps Vdq,Wdq,Ib (66),(VEX) +0d: blendpd Vdq,Wdq,Ib (66),(VEX) +0e: pblendw Vdq,Wdq,Ib (66),(VEX),(o128) +0f: palignr Pq,Qq,Ib | palignr Vdq,Wdq,Ib (66),(VEX),(o128) +14: pextrb Rd/Mb,Vdq,Ib (66),(VEX),(o128) +15: pextrw Rd/Mw,Vdq,Ib (66),(VEX),(o128) +16: pextrd/pextrq Ed/q,Vdq,Ib (66),(VEX),(o128) +17: extractps Ed,Vdq,Ib (66),(VEX),(o128) +18: vinsertf128 /r,Ib (66),(oVEX),(o256) +19: vextractf128 /r,Ib (66),(oVEX),(o256) +20: pinsrb Vdq,Rd/q/Mb,Ib (66),(VEX),(o128) +21: insertps Vdq,Udq/Md,Ib (66),(VEX),(o128) +22: pinsrd/pinsrq Vdq,Ed/q,Ib (66),(VEX),(o128) +40: dpps Vdq,Wdq,Ib (66),(VEX) +41: dppd Vdq,Wdq,Ib (66),(VEX),(o128) +42: mpsadbw Vdq,Wdq,Ib (66),(VEX),(o128) +44: pclmulq Vdq,Wdq,Ib (66),(VEX),(o128) +4a: vblendvps /r,Ib (66),(oVEX) +4b: vblendvpd /r,Ib (66),(oVEX) +4c: vpblendvb /r,Ib (66),(oVEX),(o128) +60: pcmpestrm Vdq,Wdq,Ib (66),(VEX),(o128) +61: pcmpestri Vdq,Wdq,Ib (66),(VEX),(o128) +62: pcmpistrm Vdq,Wdq,Ib (66),(VEX),(o128) +63: pcmpistri Vdq,Wdq,Ib (66),(VEX),(o128) +df: aeskeygenassist Vdq,Wdq,Ib (66),(VEX),(o128) +EndTable + +GrpTable: Grp1 +0: ADD +1: OR +2: ADC +3: SBB +4: AND +5: SUB +6: XOR +7: CMP +EndTable + +GrpTable: Grp1A +0: POP +EndTable + +GrpTable: Grp2 +0: ROL +1: ROR +2: RCL +3: RCR +4: SHL/SAL +5: SHR +6: +7: SAR +EndTable + +GrpTable: Grp3_1 +0: TEST Eb,Ib +1: +2: NOT Eb +3: NEG Eb +4: MUL AL,Eb +5: IMUL AL,Eb +6: DIV AL,Eb +7: IDIV AL,Eb +EndTable + +GrpTable: Grp3_2 +0: TEST Ev,Iz +1: +2: NOT Ev +3: NEG Ev +4: MUL rAX,Ev +5: IMUL rAX,Ev +6: DIV rAX,Ev +7: IDIV rAX,Ev +EndTable + +GrpTable: Grp4 +0: INC Eb +1: DEC Eb +EndTable + +GrpTable: Grp5 +0: INC Ev +1: DEC Ev +2: CALLN Ev (f64) +3: CALLF Ep +4: JMPN Ev (f64) +5: JMPF Ep +6: PUSH Ev (d64) +7: +EndTable + +GrpTable: Grp6 +0: SLDT Rv/Mw +1: STR Rv/Mw +2: LLDT Ew +3: LTR Ew +4: VERR Ew +5: VERW Ew +EndTable + +GrpTable: Grp7 +0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) +1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001) +2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) +3: LIDT Ms +4: SMSW Mw/Rv +5: +6: LMSW Ew +7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B) +EndTable + +GrpTable: Grp8 +4: BT +5: BTS +6: BTR +7: BTC +EndTable + +GrpTable: Grp9 +1: CMPXCHG8B/16B Mq/Mdq +6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) +7: VMPTRST Mq +EndTable + +GrpTable: Grp10 +EndTable + +GrpTable: Grp11 +0: MOV +EndTable + +GrpTable: Grp12 +2: psrlw Nq,Ib (11B) | psrlw Udq,Ib (66),(11B),(VEX),(o128) +4: psraw Nq,Ib (11B) | psraw Udq,Ib (66),(11B),(VEX),(o128) +6: psllw Nq,Ib (11B) | psllw Udq,Ib (66),(11B),(VEX),(o128) +EndTable + +GrpTable: Grp13 +2: psrld Nq,Ib (11B) | psrld Udq,Ib (66),(11B),(VEX),(o128) +4: psrad Nq,Ib (11B) | psrad Udq,Ib (66),(11B),(VEX),(o128) +6: pslld Nq,Ib (11B) | pslld Udq,Ib (66),(11B),(VEX),(o128) +EndTable + +GrpTable: Grp14 +2: psrlq Nq,Ib (11B) | psrlq Udq,Ib (66),(11B),(VEX),(o128) +3: psrldq Udq,Ib (66),(11B),(VEX),(o128) +6: psllq Nq,Ib (11B) | psllq Udq,Ib (66),(11B),(VEX),(o128) +7: pslldq Udq,Ib (66),(11B),(VEX),(o128) +EndTable + +GrpTable: Grp15 +0: fxsave +1: fxstor +2: ldmxcsr (VEX) +3: stmxcsr (VEX) +4: XSAVE +5: XRSTOR | lfence (11B) +6: mfence (11B) +7: clflush | sfence (11B) +EndTable + +GrpTable: Grp16 +0: prefetch NTA +1: prefetch T0 +2: prefetch T1 +3: prefetch T2 +EndTable + +# AMD's Prefetch Group +GrpTable: GrpP +0: PREFETCH +1: PREFETCHW +EndTable + +GrpTable: GrpPDLK +0: MONTMUL +1: XSHA1 +2: XSHA2 +EndTable + +GrpTable: GrpRNG +0: xstore-rng +1: xcrypt-ecb +2: xcrypt-cbc +4: xcrypt-cfb +5: xcrypt-ofb +EndTable diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index eefdeee8a87..06630d26e56 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,5 +1,10 @@ obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o gup.o + pat.o pgtable.o physaddr.o gup.o setup_nx.o + +# Make sure __phys_addr has no stackprotector +nostackp := $(call cc-option, -fno-stack-protector) +CFLAGS_physaddr.o := $(nostackp) +CFLAGS_setup_nx.o := $(nostackp) obj-$(CONFIG_SMP) += tlb.o diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 61b41ca3b5a..d0474ad2a6e 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -35,34 +35,3 @@ int fixup_exception(struct pt_regs *regs) return 0; } - -#ifdef CONFIG_X86_64 -/* - * Need to defined our own search_extable on X86_64 to work around - * a B stepping K8 bug. - */ -const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, - unsigned long value) -{ - /* B stepping K8 bug */ - if ((value >> 32) == 0) - value |= 0xffffffffUL << 32; - - while (first <= last) { - const struct exception_table_entry *mid; - long diff; - - mid = (last - first) / 2 + first; - diff = mid->insn - value; - if (diff == 0) - return mid; - else if (diff < 0) - first = mid+1; - else - last = mid-1; - } - return NULL; -} -#endif diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 775a020990a..8f4e2ac9392 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -10,7 +10,7 @@ #include <linux/bootmem.h> /* max_low_pfn */ #include <linux/kprobes.h> /* __kprobes, ... */ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ -#include <linux/perf_counter.h> /* perf_swcounter_event */ +#include <linux/perf_event.h> /* perf_sw_event */ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/pgalloc.h> /* pgd_*(), ... */ @@ -38,7 +38,8 @@ enum x86_pf_error_code { * Returns 0 if mmiotrace is disabled, or if the fault is not * handled by mmiotrace: */ -static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) +static inline int __kprobes +kmmio_fault(struct pt_regs *regs, unsigned long addr) { if (unlikely(is_kmmio_active())) if (kmmio_handler(regs, addr) == 1) @@ -46,7 +47,7 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) return 0; } -static inline int notify_page_fault(struct pt_regs *regs) +static inline int __kprobes notify_page_fault(struct pt_regs *regs) { int ret = 0; @@ -167,6 +168,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, info.si_errno = 0; info.si_code = si_code; info.si_addr = (void __user *)address; + info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0; force_sig_info(si_signo, &info, tsk); } @@ -239,7 +241,7 @@ void vmalloc_sync_all(void) * * Handle a fault on the vmalloc or module mapping area */ -static noinline int vmalloc_fault(unsigned long address) +static noinline __kprobes int vmalloc_fault(unsigned long address) { unsigned long pgd_paddr; pmd_t *pmd_k; @@ -356,7 +358,7 @@ void vmalloc_sync_all(void) * * This assumes no large pages in there. */ -static noinline int vmalloc_fault(unsigned long address) +static noinline __kprobes int vmalloc_fault(unsigned long address) { pgd_t *pgd, *pgd_ref; pud_t *pud, *pud_ref; @@ -790,10 +792,12 @@ out_of_memory(struct pt_regs *regs, unsigned long error_code, } static void -do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) +do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, + unsigned int fault) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; + int code = BUS_ADRERR; up_read(&mm->mmap_sem); @@ -809,7 +813,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) tsk->thread.error_code = error_code; tsk->thread.trap_no = 14; - force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); +#ifdef CONFIG_MEMORY_FAILURE + if (fault & VM_FAULT_HWPOISON) { + printk(KERN_ERR + "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", + tsk->comm, tsk->pid, address); + code = BUS_MCEERR_AR; + } +#endif + force_sig_info_fault(SIGBUS, code, address, tsk); } static noinline void @@ -819,8 +831,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, if (fault & VM_FAULT_OOM) { out_of_memory(regs, error_code, address); } else { - if (fault & VM_FAULT_SIGBUS) - do_sigbus(regs, error_code, address); + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) + do_sigbus(regs, error_code, address, fault); else BUG(); } @@ -849,7 +861,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. */ -static noinline int +static noinline __kprobes int spurious_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; @@ -1017,7 +1029,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -1114,11 +1126,11 @@ good_area: if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address); } else { tsk->min_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 1617958a380..63a6ba66cbe 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -104,6 +104,7 @@ EXPORT_SYMBOL(kunmap); EXPORT_SYMBOL(kmap_atomic); EXPORT_SYMBOL(kunmap_atomic); EXPORT_SYMBOL(kmap_atomic_prot); +EXPORT_SYMBOL(kmap_atomic_to_page); void __init set_highmem_pages_init(void) { diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 0607119cef9..73ffd5536f6 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -28,69 +28,6 @@ int direct_gbpages #endif ; -int nx_enabled; - -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) -static int disable_nx __cpuinitdata; - -/* - * noexec = on|off - * - * Control non-executable mappings for processes. - * - * on Enable - * off Disable - */ -static int __init noexec_setup(char *str) -{ - if (!str) - return -EINVAL; - if (!strncmp(str, "on", 2)) { - __supported_pte_mask |= _PAGE_NX; - disable_nx = 0; - } else if (!strncmp(str, "off", 3)) { - disable_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } - return 0; -} -early_param("noexec", noexec_setup); -#endif - -#ifdef CONFIG_X86_PAE -static void __init set_nx(void) -{ - unsigned int v[4], l, h; - - if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { - cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); - - if ((v[3] & (1 << 20)) && !disable_nx) { - rdmsr(MSR_EFER, l, h); - l |= EFER_NX; - wrmsr(MSR_EFER, l, h); - nx_enabled = 1; - __supported_pte_mask |= _PAGE_NX; - } - } -} -#else -static inline void set_nx(void) -{ -} -#endif - -#ifdef CONFIG_X86_64 -void __cpuinit check_efer(void) -{ - unsigned long efer; - - rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || disable_nx) - __supported_pte_mask &= ~_PAGE_NX; -} -#endif - static void __init find_early_table_space(unsigned long end, int use_pse, int use_gbpages) { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 3cd7711bb94..30938c1d8d5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -84,7 +84,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) #ifdef CONFIG_X86_PAE if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { if (after_bootmem) - pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); + pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE); else pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); @@ -116,7 +116,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) #endif if (!page_table) page_table = - (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); + (pte_t *)alloc_bootmem_pages(PAGE_SIZE); } else page_table = (pte_t *)alloc_low_page(); @@ -857,8 +857,6 @@ static void __init test_wp_bit(void) } } -static struct kcore_list kcore_mem, kcore_vmalloc; - void __init mem_init(void) { int codesize, reservedpages, datasize, initsize; @@ -886,13 +884,9 @@ void __init mem_init(void) datasize = (unsigned long) &_edata - (unsigned long) &_etext; initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, - VMALLOC_END-VMALLOC_START); - printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " "%dk reserved, %dk data, %dk init, %ldk highmem)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + nr_free_pages() << (PAGE_SHIFT-10), num_physpages << (PAGE_SHIFT-10), codesize >> 10, reservedpages << (PAGE_SHIFT-10), diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ea56b8cbb6a..5a4398a6006 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -647,8 +647,7 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif /* CONFIG_MEMORY_HOTPLUG */ -static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, - kcore_modules, kcore_vsyscall; +static struct kcore_list kcore_vsyscall; void __init mem_init(void) { @@ -677,17 +676,12 @@ void __init mem_init(void) initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; /* Register memory areas for /proc/kcore */ - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, - VMALLOC_END-VMALLOC_START); - kclist_add(&kcore_kernel, &_stext, _end - _stext); - kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, - VSYSCALL_END - VSYSCALL_START); + VSYSCALL_END - VSYSCALL_START, KCORE_OTHER); printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + nr_free_pages() << (PAGE_SHIFT-10), max_pfn << (PAGE_SHIFT-10), codesize >> 10, absent_pages << (PAGE_SHIFT-10), diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index fe6f84ca121..84e236ce76b 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -21,7 +21,7 @@ #include <linux/module.h> #include <linux/highmem.h> -int is_io_mapping_possible(resource_size_t base, unsigned long size) +static int is_io_mapping_possible(resource_size_t base, unsigned long size) { #if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT) /* There is no way to map greater than 1 << 32 address without PAE */ @@ -30,7 +30,30 @@ int is_io_mapping_possible(resource_size_t base, unsigned long size) #endif return 1; } -EXPORT_SYMBOL_GPL(is_io_mapping_possible); + +int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) +{ + unsigned long flag = _PAGE_CACHE_WC; + int ret; + + if (!is_io_mapping_possible(base, size)) + return -EINVAL; + + ret = io_reserve_memtype(base, base + size, &flag); + if (ret) + return ret; + + *prot = __pgprot(__PAGE_KERNEL | flag); + return 0; +} +EXPORT_SYMBOL_GPL(iomap_create_wc); + +void +iomap_free(resource_size_t base, unsigned long size) +{ + io_free_memtype(base, base + size); +} +EXPORT_SYMBOL_GPL(iomap_free); void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) { diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 8a450930834..2feb9bdedaa 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -22,77 +22,7 @@ #include <asm/pgalloc.h> #include <asm/pat.h> -static inline int phys_addr_valid(resource_size_t addr) -{ -#ifdef CONFIG_PHYS_ADDR_T_64BIT - return !(addr >> boot_cpu_data.x86_phys_bits); -#else - return 1; -#endif -} - -#ifdef CONFIG_X86_64 - -unsigned long __phys_addr(unsigned long x) -{ - if (x >= __START_KERNEL_map) { - x -= __START_KERNEL_map; - VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE); - x += phys_base; - } else { - VIRTUAL_BUG_ON(x < PAGE_OFFSET); - x -= PAGE_OFFSET; - VIRTUAL_BUG_ON(!phys_addr_valid(x)); - } - return x; -} -EXPORT_SYMBOL(__phys_addr); - -bool __virt_addr_valid(unsigned long x) -{ - if (x >= __START_KERNEL_map) { - x -= __START_KERNEL_map; - if (x >= KERNEL_IMAGE_SIZE) - return false; - x += phys_base; - } else { - if (x < PAGE_OFFSET) - return false; - x -= PAGE_OFFSET; - if (!phys_addr_valid(x)) - return false; - } - - return pfn_valid(x >> PAGE_SHIFT); -} -EXPORT_SYMBOL(__virt_addr_valid); - -#else - -#ifdef CONFIG_DEBUG_VIRTUAL -unsigned long __phys_addr(unsigned long x) -{ - /* VMALLOC_* aren't constants */ - VIRTUAL_BUG_ON(x < PAGE_OFFSET); - VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); - return x - PAGE_OFFSET; -} -EXPORT_SYMBOL(__phys_addr); -#endif - -bool __virt_addr_valid(unsigned long x) -{ - if (x < PAGE_OFFSET) - return false; - if (__vmalloc_start_set && is_vmalloc_addr((void *) x)) - return false; - if (x >= FIXADDR_START) - return false; - return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); -} -EXPORT_SYMBOL(__virt_addr_valid); - -#endif +#include "physaddr.h" int page_is_ram(unsigned long pagenr) { @@ -228,30 +158,19 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, retval = reserve_memtype(phys_addr, (u64)phys_addr + size, prot_val, &new_prot_val); if (retval) { - pr_debug("Warning: reserve_memtype returned %d\n", retval); + printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval); return NULL; } if (prot_val != new_prot_val) { - /* - * Do not fallback to certain memory types with certain - * requested type: - * - request is uc-, return cannot be write-back - * - request is uc-, return cannot be write-combine - * - request is write-combine, return cannot be write-back - */ - if ((prot_val == _PAGE_CACHE_UC_MINUS && - (new_prot_val == _PAGE_CACHE_WB || - new_prot_val == _PAGE_CACHE_WC)) || - (prot_val == _PAGE_CACHE_WC && - new_prot_val == _PAGE_CACHE_WB)) { - pr_debug( + if (!is_new_memtype_allowed(phys_addr, size, + prot_val, new_prot_val)) { + printk(KERN_ERR "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n", (unsigned long long)phys_addr, (unsigned long long)(phys_addr + size), prot_val, new_prot_val); - free_memtype(phys_addr, phys_addr + size); - return NULL; + goto err_free_memtype; } prot_val = new_prot_val; } @@ -277,26 +196,25 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, */ area = get_vm_area_caller(size, VM_IOREMAP, caller); if (!area) - return NULL; + goto err_free_memtype; area->phys_addr = phys_addr; vaddr = (unsigned long) area->addr; - if (kernel_map_sync_memtype(phys_addr, size, prot_val)) { - free_memtype(phys_addr, phys_addr + size); - free_vm_area(area); - return NULL; - } + if (kernel_map_sync_memtype(phys_addr, size, prot_val)) + goto err_free_area; - if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) { - free_memtype(phys_addr, phys_addr + size); - free_vm_area(area); - return NULL; - } + if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) + goto err_free_area; ret_addr = (void __iomem *) (vaddr + offset); mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); return ret_addr; +err_free_area: + free_vm_area(area); +err_free_memtype: + free_memtype(phys_addr, phys_addr + size); + return NULL; } /** diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index 528bf954eb7..8cc18334414 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -225,9 +225,6 @@ void kmemcheck_hide(struct pt_regs *regs) BUG_ON(!irqs_disabled()); - if (data->balance == 0) - return; - if (unlikely(data->balance != 1)) { kmemcheck_show_all(); kmemcheck_error_save_bug(regs); diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c index e773b6bd007..3f66b82076a 100644 --- a/arch/x86/mm/kmemcheck/shadow.c +++ b/arch/x86/mm/kmemcheck/shadow.c @@ -1,7 +1,6 @@ #include <linux/kmemcheck.h> #include <linux/module.h> #include <linux/mm.h> -#include <linux/module.h> #include <asm/page.h> #include <asm/pgtable.h> diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 16ccbd77917..11a4ad4d625 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -540,8 +540,14 @@ kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) struct die_args *arg = args; if (val == DIE_DEBUG && (arg->err & DR_STEP)) - if (post_kmmio_handler(arg->err, arg->regs) == 1) + if (post_kmmio_handler(arg->err, arg->regs) == 1) { + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(arg->err)) &= ~DR_STEP; return NOTIFY_STOP; + } return NOTIFY_DONE; } diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 16582960056..c8191defc38 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -29,13 +29,26 @@ #include <linux/random.h> #include <linux/limits.h> #include <linux/sched.h> +#include <asm/elf.h> + +static unsigned int stack_maxrandom_size(void) +{ + unsigned int max = 0; + if ((current->flags & PF_RANDOMIZE) && + !(current->personality & ADDR_NO_RANDOMIZE)) { + max = ((-1U) & STACK_RND_MASK) << PAGE_SHIFT; + } + + return max; +} + /* * Top of mmap area (just below the process stack). * - * Leave an at least ~128 MB hole. + * Leave an at least ~128 MB hole with possible stack randomization. */ -#define MIN_GAP (128*1024*1024) +#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size()) #define MAX_GAP (TASK_SIZE/6*5) /* diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 7e600c1962d..dd38bfbefd1 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -12,6 +12,7 @@ #include <linux/seq_file.h> #include <linux/debugfs.h> #include <linux/pfn.h> +#include <linux/percpu.h> #include <asm/e820.h> #include <asm/processor.h> @@ -143,6 +144,7 @@ void clflush_cache_range(void *vaddr, unsigned int size) mb(); } +EXPORT_SYMBOL_GPL(clflush_cache_range); static void __cpa_flush_all(void *arg) { @@ -686,7 +688,7 @@ static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); - unsigned long vaddr, remapped; + unsigned long vaddr; int ret; if (cpa->pfn >= max_pfn_mapped) @@ -744,24 +746,6 @@ static int cpa_process_alias(struct cpa_data *cpa) } #endif - /* - * If the PMD page was partially used for per-cpu remapping, - * the recycled area needs to be split and modified. Because - * the area is always proper subset of a PMD page - * cpa->numpages is guaranteed to be 1 for these areas, so - * there's no need to loop over and check for further remaps. - */ - remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr); - if (remapped) { - WARN_ON(cpa->numpages > 1); - alias_cpa = *cpa; - alias_cpa.vaddr = &remapped; - alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); - ret = __change_page_attr_set_clr(&alias_cpa, 0); - if (ret) - return ret; - } - return 0; } @@ -822,6 +806,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, { struct cpa_data cpa; int ret, cache, checkalias; + unsigned long baddr = 0; /* * Check, if we are requested to change a not supported @@ -853,6 +838,11 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, */ WARN_ON_ONCE(1); } + /* + * Save address for cache flush. *addr is modified in the call + * to __change_page_attr_set_clr() below. + */ + baddr = *addr; } /* Must avoid aliasing mappings in the highmem code */ @@ -900,7 +890,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, cpa_flush_array(addr, numpages, cache, cpa.flags, pages); } else - cpa_flush_range(*addr, numpages, cache); + cpa_flush_range(baddr, numpages, cache); } else cpa_flush_all(cache); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index b2f7d3e59b8..e78cd0ec2bc 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -15,6 +15,7 @@ #include <linux/gfp.h> #include <linux/mm.h> #include <linux/fs.h> +#include <linux/rbtree.h> #include <asm/cacheflush.h> #include <asm/processor.h> @@ -80,6 +81,7 @@ enum { void pat_init(void) { u64 pat; + bool boot_cpu = !boot_pat_state; if (!pat_enabled) return; @@ -121,8 +123,10 @@ void pat_init(void) rdmsrl(MSR_IA32_CR_PAT, boot_pat_state); wrmsrl(MSR_IA32_CR_PAT, pat); - printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n", - smp_processor_id(), boot_pat_state, pat); + + if (boot_cpu) + printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n", + smp_processor_id(), boot_pat_state, pat); } #undef PAT @@ -148,11 +152,10 @@ static char *cattr_name(unsigned long flags) * areas). All the aliases have the same cache attributes of course. * Zero attributes are represented as holes. * - * Currently the data structure is a list because the number of mappings - * are expected to be relatively small. If this should be a problem - * it could be changed to a rbtree or similar. + * The data structure is a list that is also organized as an rbtree + * sorted on the start address of memtype range. * - * memtype_lock protects the whole list. + * memtype_lock protects both the linear list and rbtree. */ struct memtype { @@ -160,11 +163,53 @@ struct memtype { u64 end; unsigned long type; struct list_head nd; + struct rb_node rb; }; +static struct rb_root memtype_rbroot = RB_ROOT; static LIST_HEAD(memtype_list); static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ +static struct memtype *memtype_rb_search(struct rb_root *root, u64 start) +{ + struct rb_node *node = root->rb_node; + struct memtype *last_lower = NULL; + + while (node) { + struct memtype *data = container_of(node, struct memtype, rb); + + if (data->start < start) { + last_lower = data; + node = node->rb_right; + } else if (data->start > start) { + node = node->rb_left; + } else + return data; + } + + /* Will return NULL if there is no entry with its start <= start */ + return last_lower; +} + +static void memtype_rb_insert(struct rb_root *root, struct memtype *data) +{ + struct rb_node **new = &(root->rb_node); + struct rb_node *parent = NULL; + + while (*new) { + struct memtype *this = container_of(*new, struct memtype, rb); + + parent = *new; + if (data->start <= this->start) + new = &((*new)->rb_left); + else if (data->start > this->start) + new = &((*new)->rb_right); + } + + rb_link_node(&data->rb, parent, new); + rb_insert_color(&data->rb, root); +} + /* * Does intersection of PAT memory type and MTRR memory type and returns * the resulting memory type as PAT understands it. @@ -218,9 +263,6 @@ chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type) return -EBUSY; } -static struct memtype *cached_entry; -static u64 cached_start; - static int pat_pagerange_is_ram(unsigned long start, unsigned long end) { int ram_page = 0, not_rampage = 0; @@ -249,63 +291,61 @@ static int pat_pagerange_is_ram(unsigned long start, unsigned long end) } /* - * For RAM pages, mark the pages as non WB memory type using - * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or - * set_memory_wc() on a RAM page at a time before marking it as WB again. - * This is ok, because only one driver will be owning the page and - * doing set_memory_*() calls. + * For RAM pages, we use page flags to mark the pages with appropriate type. + * Here we do two pass: + * - Find the memtype of all the pages in the range, look for any conflicts + * - In case of no conflicts, set the new memtype for pages in the range * - * For now, we use PageNonWB to track that the RAM page is being mapped - * as non WB. In future, we will have to use one more flag - * (or some other mechanism in page_struct) to distinguish between - * UC and WC mapping. + * Caller must hold memtype_lock for atomicity. */ static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, unsigned long *new_type) { struct page *page; - u64 pfn, end_pfn; + u64 pfn; + + if (req_type == _PAGE_CACHE_UC) { + /* We do not support strong UC */ + WARN_ON_ONCE(1); + req_type = _PAGE_CACHE_UC_MINUS; + } for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { - page = pfn_to_page(pfn); - if (page_mapped(page) || PageNonWB(page)) - goto out; + unsigned long type; - SetPageNonWB(page); + page = pfn_to_page(pfn); + type = get_page_memtype(page); + if (type != -1) { + printk(KERN_INFO "reserve_ram_pages_type failed " + "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n", + start, end, type, req_type); + if (new_type) + *new_type = type; + + return -EBUSY; + } } - return 0; -out: - end_pfn = pfn; - for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { + if (new_type) + *new_type = req_type; + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { page = pfn_to_page(pfn); - ClearPageNonWB(page); + set_page_memtype(page, req_type); } - - return -EINVAL; + return 0; } static int free_ram_pages_type(u64 start, u64 end) { struct page *page; - u64 pfn, end_pfn; + u64 pfn; for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { page = pfn_to_page(pfn); - if (page_mapped(page) || !PageNonWB(page)) - goto out; - - ClearPageNonWB(page); + set_page_memtype(page, -1); } return 0; - -out: - end_pfn = pfn; - for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { - page = pfn_to_page(pfn); - SetPageNonWB(page); - } - return -EINVAL; } /* @@ -339,6 +379,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (new_type) { if (req_type == -1) *new_type = _PAGE_CACHE_WB; + else if (req_type == _PAGE_CACHE_WC) + *new_type = _PAGE_CACHE_UC_MINUS; else *new_type = req_type & _PAGE_CACHE_MASK; } @@ -364,11 +406,16 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, *new_type = actual_type; is_range_ram = pat_pagerange_is_ram(start, end); - if (is_range_ram == 1) - return reserve_ram_pages_type(start, end, req_type, - new_type); - else if (is_range_ram < 0) + if (is_range_ram == 1) { + + spin_lock(&memtype_lock); + err = reserve_ram_pages_type(start, end, req_type, new_type); + spin_unlock(&memtype_lock); + + return err; + } else if (is_range_ram < 0) { return -EINVAL; + } new = kmalloc(sizeof(struct memtype), GFP_KERNEL); if (!new) @@ -380,17 +427,11 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, spin_lock(&memtype_lock); - if (cached_entry && start >= cached_start) - entry = cached_entry; - else - entry = list_entry(&memtype_list, struct memtype, nd); - /* Search for existing mapping that overlaps the current range */ where = NULL; - list_for_each_entry_continue(entry, &memtype_list, nd) { + list_for_each_entry(entry, &memtype_list, nd) { if (end <= entry->start) { where = entry->nd.prev; - cached_entry = list_entry(where, struct memtype, nd); break; } else if (start <= entry->start) { /* end > entry->start */ err = chk_conflict(new, entry, new_type); @@ -398,8 +439,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, dprintk("Overlap at 0x%Lx-0x%Lx\n", entry->start, entry->end); where = entry->nd.prev; - cached_entry = list_entry(where, - struct memtype, nd); } break; } else if (start < entry->end) { /* start > entry->start */ @@ -407,8 +446,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (!err) { dprintk("Overlap at 0x%Lx-0x%Lx\n", entry->start, entry->end); - cached_entry = list_entry(entry->nd.prev, - struct memtype, nd); /* * Move to right position in the linked @@ -436,13 +473,13 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, return err; } - cached_start = start; - if (where) list_add(&new->nd, where); else list_add_tail(&new->nd, &memtype_list); + memtype_rb_insert(&memtype_rbroot, new); + spin_unlock(&memtype_lock); dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", @@ -454,7 +491,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, int free_memtype(u64 start, u64 end) { - struct memtype *entry; + struct memtype *entry, *saved_entry; int err = -EINVAL; int is_range_ram; @@ -466,23 +503,58 @@ int free_memtype(u64 start, u64 end) return 0; is_range_ram = pat_pagerange_is_ram(start, end); - if (is_range_ram == 1) - return free_ram_pages_type(start, end); - else if (is_range_ram < 0) + if (is_range_ram == 1) { + + spin_lock(&memtype_lock); + err = free_ram_pages_type(start, end); + spin_unlock(&memtype_lock); + + return err; + } else if (is_range_ram < 0) { return -EINVAL; + } spin_lock(&memtype_lock); - list_for_each_entry(entry, &memtype_list, nd) { + + entry = memtype_rb_search(&memtype_rbroot, start); + if (unlikely(entry == NULL)) + goto unlock_ret; + + /* + * Saved entry points to an entry with start same or less than what + * we searched for. Now go through the list in both directions to look + * for the entry that matches with both start and end, with list stored + * in sorted start address + */ + saved_entry = entry; + list_for_each_entry_from(entry, &memtype_list, nd) { if (entry->start == start && entry->end == end) { - if (cached_entry == entry || cached_start == start) - cached_entry = NULL; + rb_erase(&entry->rb, &memtype_rbroot); + list_del(&entry->nd); + kfree(entry); + err = 0; + break; + } else if (entry->start > start) { + break; + } + } + if (!err) + goto unlock_ret; + + entry = saved_entry; + list_for_each_entry_reverse(entry, &memtype_list, nd) { + if (entry->start == start && entry->end == end) { + rb_erase(&entry->rb, &memtype_rbroot); list_del(&entry->nd); kfree(entry); err = 0; break; + } else if (entry->start < start) { + break; } } +unlock_ret: spin_unlock(&memtype_lock); if (err) { @@ -496,6 +568,101 @@ int free_memtype(u64 start, u64 end) } +/** + * lookup_memtype - Looksup the memory type for a physical address + * @paddr: physical address of which memory type needs to be looked up + * + * Only to be called when PAT is enabled + * + * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or + * _PAGE_CACHE_UC + */ +static unsigned long lookup_memtype(u64 paddr) +{ + int rettype = _PAGE_CACHE_WB; + struct memtype *entry; + + if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1)) + return rettype; + + if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { + struct page *page; + spin_lock(&memtype_lock); + page = pfn_to_page(paddr >> PAGE_SHIFT); + rettype = get_page_memtype(page); + spin_unlock(&memtype_lock); + /* + * -1 from get_page_memtype() implies RAM page is in its + * default state and not reserved, and hence of type WB + */ + if (rettype == -1) + rettype = _PAGE_CACHE_WB; + + return rettype; + } + + spin_lock(&memtype_lock); + + entry = memtype_rb_search(&memtype_rbroot, paddr); + if (entry != NULL) + rettype = entry->type; + else + rettype = _PAGE_CACHE_UC_MINUS; + + spin_unlock(&memtype_lock); + return rettype; +} + +/** + * io_reserve_memtype - Request a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + * @type: A pointer to memtype, with requested type. On success, requested + * or any other compatible type that was available for the region is returned + * + * On success, returns 0 + * On failure, returns non-zero + */ +int io_reserve_memtype(resource_size_t start, resource_size_t end, + unsigned long *type) +{ + resource_size_t size = end - start; + unsigned long req_type = *type; + unsigned long new_type; + int ret; + + WARN_ON_ONCE(iomem_map_sanity_check(start, size)); + + ret = reserve_memtype(start, end, req_type, &new_type); + if (ret) + goto out_err; + + if (!is_new_memtype_allowed(start, size, req_type, new_type)) + goto out_free; + + if (kernel_map_sync_memtype(start, size, new_type) < 0) + goto out_free; + + *type = new_type; + return 0; + +out_free: + free_memtype(start, end); + ret = -EBUSY; +out_err: + return ret; +} + +/** + * io_free_memtype - Release a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + */ +void io_free_memtype(resource_size_t start, resource_size_t end) +{ + free_memtype(start, end); +} + pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { @@ -577,7 +744,7 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) { unsigned long id_sz; - if (!pat_enabled || base >= __pa(high_memory)) + if (base >= __pa(high_memory)) return 0; id_sz = (__pa(high_memory) < base + size) ? @@ -612,11 +779,29 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, is_ram = pat_pagerange_is_ram(paddr, paddr + size); /* - * reserve_pfn_range() doesn't support RAM pages. Maintain the current - * behavior with RAM pages by returning success. + * reserve_pfn_range() for RAM pages. We do not refcount to keep + * track of number of mappings of RAM pages. We can assert that + * the type requested matches the type of first page in the range. */ - if (is_ram != 0) + if (is_ram) { + if (!pat_enabled) + return 0; + + flags = lookup_memtype(paddr); + if (want_flags != flags) { + printk(KERN_WARNING + "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size), + cattr_name(flags)); + *vma_prot = __pgprot((pgprot_val(*vma_prot) & + (~_PAGE_CACHE_MASK)) | + flags); + } return 0; + } ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); if (ret) @@ -678,14 +863,6 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; - if (!pat_enabled) - return 0; - - /* - * For now, only handle remap_pfn_range() vmas where - * is_linear_pfn_mapping() == TRUE. Handling of - * vm_insert_pfn() is TBD. - */ if (is_linear_pfn_mapping(vma)) { /* * reserve the whole chunk covered by vma. We need the @@ -713,23 +890,24 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long size) { + unsigned long flags; resource_size_t paddr; unsigned long vma_size = vma->vm_end - vma->vm_start; - if (!pat_enabled) - return 0; - - /* - * For now, only handle remap_pfn_range() vmas where - * is_linear_pfn_mapping() == TRUE. Handling of - * vm_insert_pfn() is TBD. - */ if (is_linear_pfn_mapping(vma)) { /* reserve the whole chunk starting from vm_pgoff */ paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; return reserve_pfn_range(paddr, vma_size, prot, 0); } + if (!pat_enabled) + return 0; + + /* for vm_insert_pfn and friends, we set prot based on lookup */ + flags = lookup_memtype(pfn << PAGE_SHIFT); + *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + flags); + return 0; } @@ -744,14 +922,6 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, resource_size_t paddr; unsigned long vma_size = vma->vm_end - vma->vm_start; - if (!pat_enabled) - return; - - /* - * For now, only handle remap_pfn_range() vmas where - * is_linear_pfn_mapping() == TRUE. Handling of - * vm_insert_pfn() is TBD. - */ if (is_linear_pfn_mapping(vma)) { /* free the whole chunk starting from vm_pgoff */ paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c new file mode 100644 index 00000000000..d2e2735327b --- /dev/null +++ b/arch/x86/mm/physaddr.c @@ -0,0 +1,70 @@ +#include <linux/mmdebug.h> +#include <linux/module.h> +#include <linux/mm.h> + +#include <asm/page.h> + +#include "physaddr.h" + +#ifdef CONFIG_X86_64 + +unsigned long __phys_addr(unsigned long x) +{ + if (x >= __START_KERNEL_map) { + x -= __START_KERNEL_map; + VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE); + x += phys_base; + } else { + VIRTUAL_BUG_ON(x < PAGE_OFFSET); + x -= PAGE_OFFSET; + VIRTUAL_BUG_ON(!phys_addr_valid(x)); + } + return x; +} +EXPORT_SYMBOL(__phys_addr); + +bool __virt_addr_valid(unsigned long x) +{ + if (x >= __START_KERNEL_map) { + x -= __START_KERNEL_map; + if (x >= KERNEL_IMAGE_SIZE) + return false; + x += phys_base; + } else { + if (x < PAGE_OFFSET) + return false; + x -= PAGE_OFFSET; + if (!phys_addr_valid(x)) + return false; + } + + return pfn_valid(x >> PAGE_SHIFT); +} +EXPORT_SYMBOL(__virt_addr_valid); + +#else + +#ifdef CONFIG_DEBUG_VIRTUAL +unsigned long __phys_addr(unsigned long x) +{ + /* VMALLOC_* aren't constants */ + VIRTUAL_BUG_ON(x < PAGE_OFFSET); + VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); + return x - PAGE_OFFSET; +} +EXPORT_SYMBOL(__phys_addr); +#endif + +bool __virt_addr_valid(unsigned long x) +{ + if (x < PAGE_OFFSET) + return false; + if (__vmalloc_start_set && is_vmalloc_addr((void *) x)) + return false; + if (x >= FIXADDR_START) + return false; + return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); +} +EXPORT_SYMBOL(__virt_addr_valid); + +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/mm/physaddr.h b/arch/x86/mm/physaddr.h new file mode 100644 index 00000000000..a3cd5a0c97b --- /dev/null +++ b/arch/x86/mm/physaddr.h @@ -0,0 +1,10 @@ +#include <asm/processor.h> + +static inline int phys_addr_valid(resource_size_t addr) +{ +#ifdef CONFIG_PHYS_ADDR_T_64BIT + return !(addr >> boot_cpu_data.x86_phys_bits); +#else + return 1; +#endif +} diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c new file mode 100644 index 00000000000..513d8ed5d2e --- /dev/null +++ b/arch/x86/mm/setup_nx.c @@ -0,0 +1,69 @@ +#include <linux/spinlock.h> +#include <linux/errno.h> +#include <linux/init.h> + +#include <asm/pgtable.h> + +int nx_enabled; + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +static int disable_nx __cpuinitdata; + +/* + * noexec = on|off + * + * Control non-executable mappings for processes. + * + * on Enable + * off Disable + */ +static int __init noexec_setup(char *str) +{ + if (!str) + return -EINVAL; + if (!strncmp(str, "on", 2)) { + __supported_pte_mask |= _PAGE_NX; + disable_nx = 0; + } else if (!strncmp(str, "off", 3)) { + disable_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } + return 0; +} +early_param("noexec", noexec_setup); +#endif + +#ifdef CONFIG_X86_PAE +void __init set_nx(void) +{ + unsigned int v[4], l, h; + + if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { + cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); + + if ((v[3] & (1 << 20)) && !disable_nx) { + rdmsr(MSR_EFER, l, h); + l |= EFER_NX; + wrmsr(MSR_EFER, l, h); + nx_enabled = 1; + __supported_pte_mask |= _PAGE_NX; + } + } +} +#else +void set_nx(void) +{ +} +#endif + +#ifdef CONFIG_X86_64 +void __cpuinit check_efer(void) +{ + unsigned long efer; + + rdmsrl(MSR_EFER, efer); + if (!(efer & EFER_NX) || disable_nx) + __supported_pte_mask &= ~_PAGE_NX; +} +#endif + diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index dbb5381f7b3..9d7ce96e5a5 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -136,7 +136,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) apicid_to_node[apic_id] = node; node_set(node, cpu_nodes_parsed); acpi_numa = 1; - printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", pxm, apic_id, node); } @@ -170,7 +170,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) apicid_to_node[apic_id] = node; node_set(node, cpu_nodes_parsed); acpi_numa = 1; - printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", pxm, apic_id, node); } diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index 427fd1b56df..8565d944f7c 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -1,12 +1,13 @@ /* * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi> */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/io.h> #include <linux/mmiotrace.h> -#define MODULE_NAME "testmmiotrace" - static unsigned long mmio_address; module_param(mmio_address, ulong, 0); MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB " @@ -30,7 +31,7 @@ static unsigned v32(unsigned i) static void do_write_test(void __iomem *p) { unsigned int i; - pr_info(MODULE_NAME ": write test.\n"); + pr_info("write test.\n"); mmiotrace_printk("Write test.\n"); for (i = 0; i < 256; i++) @@ -47,7 +48,7 @@ static void do_read_test(void __iomem *p) { unsigned int i; unsigned errs[3] = { 0 }; - pr_info(MODULE_NAME ": read test.\n"); + pr_info("read test.\n"); mmiotrace_printk("Read test.\n"); for (i = 0; i < 256; i++) @@ -68,7 +69,7 @@ static void do_read_test(void __iomem *p) static void do_read_far_test(void __iomem *p) { - pr_info(MODULE_NAME ": read far test.\n"); + pr_info("read far test.\n"); mmiotrace_printk("Read far test.\n"); ioread32(p + read_far); @@ -78,7 +79,7 @@ static void do_test(unsigned long size) { void __iomem *p = ioremap_nocache(mmio_address, size); if (!p) { - pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); + pr_err("could not ioremap, aborting.\n"); return; } mmiotrace_printk("ioremap returned %p.\n", p); @@ -94,24 +95,22 @@ static int __init init(void) unsigned long size = (read_far) ? (8 << 20) : (16 << 10); if (mmio_address == 0) { - pr_err(MODULE_NAME ": you have to use the module argument " - "mmio_address.\n"); - pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS" - " YOU REALLY KNOW WHAT YOU ARE DOING!\n"); + pr_err("you have to use the module argument mmio_address.\n"); + pr_err("DO NOT LOAD THIS MODULE UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!\n"); return -ENXIO; } - pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI " - "address space, and writing 16 kB of rubbish in there.\n", - size >> 10, mmio_address); + pr_warning("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, " + "and writing 16 kB of rubbish in there.\n", + size >> 10, mmio_address); do_test(size); - pr_info(MODULE_NAME ": All done.\n"); + pr_info("All done.\n"); return 0; } static void __exit cleanup(void) { - pr_debug(MODULE_NAME ": unloaded.\n"); + pr_debug("unloaded.\n"); } module_init(init); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index c814e144a3f..36fe08eeb5c 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -59,7 +59,8 @@ void leave_mm(int cpu) { if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) BUG(); - cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask); + cpumask_clear_cpu(cpu, + mm_cpumask(percpu_read(cpu_tlbstate.active_mm))); load_cr3(swapper_pg_dir); } EXPORT_SYMBOL_GPL(leave_mm); @@ -234,8 +235,8 @@ void flush_tlb_current_task(void) preempt_disable(); local_flush_tlb(); - if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); + if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); preempt_enable(); } @@ -249,8 +250,8 @@ void flush_tlb_mm(struct mm_struct *mm) else leave_mm(smp_processor_id()); } - if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); + if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); preempt_enable(); } @@ -268,8 +269,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) leave_mm(smp_processor_id()); } - if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(&mm->cpu_vm_mask, mm, va); + if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), mm, va); preempt_enable(); } diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 4899215999d..8eb05878554 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -234,11 +234,11 @@ static void arch_perfmon_setup_counters(void) if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 15) { eax.split.version_id = 2; - eax.split.num_counters = 2; + eax.split.num_events = 2; eax.split.bit_width = 40; } - num_counters = eax.split.num_counters; + num_counters = eax.split.num_events; op_arch_perfmon_spec.num_counters = num_counters; op_arch_perfmon_spec.num_controls = num_counters; diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index b83776180c7..7b8e75d1608 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -13,7 +13,7 @@ #define OP_X86_MODEL_H #include <asm/types.h> -#include <asm/perf_counter.h> +#include <asm/perf_event.h> struct op_msr { unsigned long addr; diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 3ffa10df20b..572ee9782f2 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -15,63 +15,6 @@ * also get peer root bus resource for io,mmio */ -#ifdef CONFIG_NUMA - -#define BUS_NR 256 - -#ifdef CONFIG_X86_64 - -static int mp_bus_to_node[BUS_NR]; - -void set_mp_bus_to_node(int busnum, int node) -{ - if (busnum >= 0 && busnum < BUS_NR) - mp_bus_to_node[busnum] = node; -} - -int get_mp_bus_to_node(int busnum) -{ - int node = -1; - - if (busnum < 0 || busnum > (BUS_NR - 1)) - return node; - - node = mp_bus_to_node[busnum]; - - /* - * let numa_node_id to decide it later in dma_alloc_pages - * if there is no ram on that node - */ - if (node != -1 && !node_online(node)) - node = -1; - - return node; -} - -#else /* CONFIG_X86_32 */ - -static unsigned char mp_bus_to_node[BUS_NR]; - -void set_mp_bus_to_node(int busnum, int node) -{ - if (busnum >= 0 && busnum < BUS_NR) - mp_bus_to_node[busnum] = (unsigned char) node; -} - -int get_mp_bus_to_node(int busnum) -{ - int node; - - if (busnum < 0 || busnum > (BUS_NR - 1)) - return 0; - node = mp_bus_to_node[busnum]; - return node; -} - -#endif /* CONFIG_X86_32 */ - -#endif /* CONFIG_NUMA */ - #ifdef CONFIG_X86_64 /* @@ -301,11 +244,6 @@ static int __init early_fill_mp_bus_info(void) u64 val; u32 address; -#ifdef CONFIG_NUMA - for (i = 0; i < BUS_NR; i++) - mp_bus_to_node[i] = -1; -#endif - if (!early_pci_allowed()) return -1; @@ -346,7 +284,7 @@ static int __init early_fill_mp_bus_info(void) node = (reg >> 4) & 0x07; #ifdef CONFIG_NUMA for (j = min_bus; j <= max_bus; j++) - mp_bus_to_node[j] = (unsigned char) node; + set_mp_bus_to_node(j, node); #endif link = (reg >> 8) & 0x03; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 2202b6257b8..1331fcf2614 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -600,3 +600,72 @@ struct pci_bus * __devinit pci_scan_bus_with_sysdata(int busno) { return pci_scan_bus_on_node(busno, &pci_root_ops, -1); } + +/* + * NUMA info for PCI busses + * + * Early arch code is responsible for filling in reasonable values here. + * A node id of "-1" means "use current node". In other words, if a bus + * has a -1 node id, it's not tightly coupled to any particular chunk + * of memory (as is the case on some Nehalem systems). + */ +#ifdef CONFIG_NUMA + +#define BUS_NR 256 + +#ifdef CONFIG_X86_64 + +static int mp_bus_to_node[BUS_NR] = { + [0 ... BUS_NR - 1] = -1 +}; + +void set_mp_bus_to_node(int busnum, int node) +{ + if (busnum >= 0 && busnum < BUS_NR) + mp_bus_to_node[busnum] = node; +} + +int get_mp_bus_to_node(int busnum) +{ + int node = -1; + + if (busnum < 0 || busnum > (BUS_NR - 1)) + return node; + + node = mp_bus_to_node[busnum]; + + /* + * let numa_node_id to decide it later in dma_alloc_pages + * if there is no ram on that node + */ + if (node != -1 && !node_online(node)) + node = -1; + + return node; +} + +#else /* CONFIG_X86_32 */ + +static int mp_bus_to_node[BUS_NR] = { + [0 ... BUS_NR - 1] = -1 +}; + +void set_mp_bus_to_node(int busnum, int node) +{ + if (busnum >= 0 && busnum < BUS_NR) + mp_bus_to_node[busnum] = (unsigned char) node; +} + +int get_mp_bus_to_node(int busnum) +{ + int node; + + if (busnum < 0 || busnum > (BUS_NR - 1)) + return 0; + node = mp_bus_to_node[busnum]; + return node; +} + +#endif /* CONFIG_X86_32 */ + +#endif /* CONFIG_NUMA */ diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 52e62e57fed..b22d13b0c71 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -266,7 +266,7 @@ void pcibios_set_master(struct pci_dev *dev) pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); } -static struct vm_operations_struct pci_mmap_ops = { +static const struct vm_operations_struct pci_mmap_ops = { .access = generic_access_phys, }; diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 712443ec6d4..602c172d3bd 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -13,10 +13,14 @@ #include <linux/pci.h> #include <linux/init.h> #include <linux/acpi.h> +#include <linux/sfi_acpi.h> #include <linux/bitmap.h> #include <linux/sort.h> #include <asm/e820.h> #include <asm/pci_x86.h> +#include <asm/acpi.h> + +#define PREFIX "PCI: " /* aperture is up to 256MB but BIOS may reserve less */ #define MMCONFIG_APER_MIN (2 * 1024*1024) @@ -491,7 +495,7 @@ static void __init pci_mmcfg_reject_broken(int early) (unsigned int)cfg->start_bus_number, (unsigned int)cfg->end_bus_number); - if (!early) + if (!early && !acpi_disabled) valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0); if (valid) @@ -606,7 +610,7 @@ static void __init __pci_mmcfg_init(int early) } if (!known_bridge) - acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); + acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); pci_mmcfg_reject_broken(early); diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index 8b2d561046a..f10a7e94a84 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -11,9 +11,9 @@ #include <linux/pci.h> #include <linux/init.h> -#include <linux/acpi.h> #include <asm/e820.h> #include <asm/pci_x86.h> +#include <acpi/acpi.h> /* Assume systems with more busses have correct MCFG */ #define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG)) diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index b3d20b9cac6..0a979f3e5b8 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -18,6 +18,7 @@ #include <asm/mce.h> #include <asm/xcr.h> #include <asm/suspend.h> +#include <asm/debugreg.h> #ifdef CONFIG_X86_32 static struct saved_context saved_context; @@ -142,31 +143,6 @@ static void fix_processor_context(void) #endif load_TR_desc(); /* This does ltr */ load_LDT(¤t->active_mm->context); /* This does lldt */ - - /* - * Now maybe reload the debug registers - */ - if (current->thread.debugreg7) { -#ifdef CONFIG_X86_32 - set_debugreg(current->thread.debugreg0, 0); - set_debugreg(current->thread.debugreg1, 1); - set_debugreg(current->thread.debugreg2, 2); - set_debugreg(current->thread.debugreg3, 3); - /* no 4 and 5 */ - set_debugreg(current->thread.debugreg6, 6); - set_debugreg(current->thread.debugreg7, 7); -#else - /* CONFIG_X86_64 */ - loaddebug(¤t->thread, 0); - loaddebug(¤t->thread, 1); - loaddebug(¤t->thread, 2); - loaddebug(¤t->thread, 3); - /* no 4 and 5 */ - loaddebug(¤t->thread, 6); - loaddebug(¤t->thread, 7); -#endif - } - } /** @@ -242,11 +218,7 @@ static void __restore_processor_state(struct saved_context *ctxt) fix_processor_context(); do_fpu_end(); - mtrr_ap_init(); - -#ifdef CONFIG_X86_OLD_MCE - mcheck_init(&boot_cpu_data); -#endif + mtrr_bp_restore(); } /* Needed by apm.c */ diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile new file mode 100644 index 00000000000..f8208267733 --- /dev/null +++ b/arch/x86/tools/Makefile @@ -0,0 +1,31 @@ +PHONY += posttest + +ifeq ($(KBUILD_VERBOSE),1) + posttest_verbose = -v +else + posttest_verbose = +endif + +ifeq ($(CONFIG_64BIT),y) + posttest_64bit = -y +else + posttest_64bit = -n +endif + +distill_awk = $(srctree)/arch/x86/tools/distill.awk +chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk + +quiet_cmd_posttest = TEST $@ + cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose) + +posttest: $(obj)/test_get_len vmlinux + $(call cmd,posttest) + +hostprogs-y := test_get_len + +# -I needed for generated C source and C source which in the kernel tree. +HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/ + +# Dependencies are also needed. +$(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c + diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk new file mode 100644 index 00000000000..0d13cd9fdcf --- /dev/null +++ b/arch/x86/tools/chkobjdump.awk @@ -0,0 +1,23 @@ +# GNU objdump version checker +# +# Usage: +# objdump -v | awk -f chkobjdump.awk +BEGIN { + # objdump version 2.19 or later is OK for the test. + od_ver = 2; + od_sver = 19; +} + +/^GNU/ { + split($4, ver, "."); + if (ver[1] > od_ver || + (ver[1] == od_ver && ver[2] >= od_sver)) { + exit 1; + } else { + printf("Warning: objdump version %s is older than %d.%d\n", + $4, od_ver, od_sver); + print("Warning: Skipping posttest."); + # Logic is inverted, because we just skip test without error. + exit 0; + } +} diff --git a/arch/x86/tools/distill.awk b/arch/x86/tools/distill.awk new file mode 100644 index 00000000000..c13c0ee48ab --- /dev/null +++ b/arch/x86/tools/distill.awk @@ -0,0 +1,47 @@ +#!/bin/awk -f +# Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len +# Distills the disassembly as follows: +# - Removes all lines except the disassembled instructions. +# - For instructions that exceed 1 line (7 bytes), crams all the hex bytes +# into a single line. +# - Remove bad(or prefix only) instructions + +BEGIN { + prev_addr = "" + prev_hex = "" + prev_mnemonic = "" + bad_expr = "(\\(bad\\)|^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))" + fwait_expr = "^9b " + fwait_str="9b\tfwait" +} + +/^ *[0-9a-f]+ <[^>]*>:/ { + # Symbol entry + printf("%s%s\n", $2, $1) +} + +/^ *[0-9a-f]+:/ { + if (split($0, field, "\t") < 3) { + # This is a continuation of the same insn. + prev_hex = prev_hex field[2] + } else { + # Skip bad instructions + if (match(prev_mnemonic, bad_expr)) + prev_addr = "" + # Split fwait from other f* instructions + if (match(prev_hex, fwait_expr) && prev_mnemonic != "fwait") { + printf "%s\t%s\n", prev_addr, fwait_str + sub(fwait_expr, "", prev_hex) + } + if (prev_addr != "") + printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic + prev_addr = field[1] + prev_hex = field[2] + prev_mnemonic = field[3] + } +} + +END { + if (prev_addr != "") + printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic +} diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk new file mode 100644 index 00000000000..e34e92a28eb --- /dev/null +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -0,0 +1,380 @@ +#!/bin/awk -f +# gen-insn-attr-x86.awk: Instruction attribute table generator +# Written by Masami Hiramatsu <mhiramat@redhat.com> +# +# Usage: awk -f gen-insn-attr-x86.awk x86-opcode-map.txt > inat-tables.c + +# Awk implementation sanity check +function check_awk_implement() { + if (!match("abc", "[[:lower:]]+")) + return "Your awk doesn't support charactor-class." + if (sprintf("%x", 0) != "0") + return "Your awk has a printf-format problem." + return "" +} + +# Clear working vars +function clear_vars() { + delete table + delete lptable2 + delete lptable1 + delete lptable3 + eid = -1 # escape id + gid = -1 # group id + aid = -1 # AVX id + tname = "" +} + +BEGIN { + # Implementation error checking + awkchecked = check_awk_implement() + if (awkchecked != "") { + print "Error: " awkchecked > "/dev/stderr" + print "Please try to use gawk." > "/dev/stderr" + exit 1 + } + + # Setup generating tables + print "/* x86 opcode map generated from x86-opcode-map.txt */" + print "/* Do not change this code. */\n" + ggid = 1 + geid = 1 + gaid = 0 + delete etable + delete gtable + delete atable + + opnd_expr = "^[[:alpha:]/]" + ext_expr = "^\\(" + sep_expr = "^\\|$" + group_expr = "^Grp[[:alnum:]]+" + + imm_expr = "^[IJAO][[:lower:]]" + imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)" + imm_flag["Id"] = "INAT_MAKE_IMM(INAT_IMM_DWORD)" + imm_flag["Iq"] = "INAT_MAKE_IMM(INAT_IMM_QWORD)" + imm_flag["Ap"] = "INAT_MAKE_IMM(INAT_IMM_PTR)" + imm_flag["Iz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)" + imm_flag["Jz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)" + imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)" + imm_flag["Ob"] = "INAT_MOFFSET" + imm_flag["Ov"] = "INAT_MOFFSET" + + modrm_expr = "^([CDEGMNPQRSUVW/][[:lower:]]+|NTA|T[012])" + force64_expr = "\\([df]64\\)" + rex_expr = "^REX(\\.[XRWB]+)*" + fpu_expr = "^ESC" # TODO + + lprefix1_expr = "\\(66\\)" + lprefix2_expr = "\\(F3\\)" + lprefix3_expr = "\\(F2\\)" + max_lprefix = 4 + + vexok_expr = "\\(VEX\\)" + vexonly_expr = "\\(oVEX\\)" + + prefix_expr = "\\(Prefix\\)" + prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" + prefix_num["REPNE"] = "INAT_PFX_REPNE" + prefix_num["REP/REPE"] = "INAT_PFX_REPE" + prefix_num["LOCK"] = "INAT_PFX_LOCK" + prefix_num["SEG=CS"] = "INAT_PFX_CS" + prefix_num["SEG=DS"] = "INAT_PFX_DS" + prefix_num["SEG=ES"] = "INAT_PFX_ES" + prefix_num["SEG=FS"] = "INAT_PFX_FS" + prefix_num["SEG=GS"] = "INAT_PFX_GS" + prefix_num["SEG=SS"] = "INAT_PFX_SS" + prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ" + prefix_num["2bytes-VEX"] = "INAT_PFX_VEX2" + prefix_num["3bytes-VEX"] = "INAT_PFX_VEX3" + + clear_vars() +} + +function semantic_error(msg) { + print "Semantic error at " NR ": " msg > "/dev/stderr" + exit 1 +} + +function debug(msg) { + print "DEBUG: " msg +} + +function array_size(arr, i,c) { + c = 0 + for (i in arr) + c++ + return c +} + +/^Table:/ { + print "/* " $0 " */" + if (tname != "") + semantic_error("Hit Table: before EndTable:."); +} + +/^Referrer:/ { + if (NF != 1) { + # escape opcode table + ref = "" + for (i = 2; i <= NF; i++) + ref = ref $i + eid = escape[ref] + tname = sprintf("inat_escape_table_%d", eid) + } +} + +/^AVXcode:/ { + if (NF != 1) { + # AVX/escape opcode table + aid = $2 + if (gaid <= aid) + gaid = aid + 1 + if (tname == "") # AVX only opcode table + tname = sprintf("inat_avx_table_%d", $2) + } + if (aid == -1 && eid == -1) # primary opcode table + tname = "inat_primary_table" +} + +/^GrpTable:/ { + print "/* " $0 " */" + if (!($2 in group)) + semantic_error("No group: " $2 ) + gid = group[$2] + tname = "inat_group_table_" gid +} + +function print_table(tbl,name,fmt,n) +{ + print "const insn_attr_t " name " = {" + for (i = 0; i < n; i++) { + id = sprintf(fmt, i) + if (tbl[id]) + print " [" id "] = " tbl[id] "," + } + print "};" +} + +/^EndTable/ { + if (gid != -1) { + # print group tables + if (array_size(table) != 0) { + print_table(table, tname "[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,0] = tname + } + if (array_size(lptable1) != 0) { + print_table(lptable1, tname "_1[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,1] = tname "_1" + } + if (array_size(lptable2) != 0) { + print_table(lptable2, tname "_2[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,2] = tname "_2" + } + if (array_size(lptable3) != 0) { + print_table(lptable3, tname "_3[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,3] = tname "_3" + } + } else { + # print primary/escaped tables + if (array_size(table) != 0) { + print_table(table, tname "[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,0] = tname + if (aid >= 0) + atable[aid,0] = tname + } + if (array_size(lptable1) != 0) { + print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,1] = tname "_1" + if (aid >= 0) + atable[aid,1] = tname "_1" + } + if (array_size(lptable2) != 0) { + print_table(lptable2,tname "_2[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,2] = tname "_2" + if (aid >= 0) + atable[aid,2] = tname "_2" + } + if (array_size(lptable3) != 0) { + print_table(lptable3,tname "_3[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,3] = tname "_3" + if (aid >= 0) + atable[aid,3] = tname "_3" + } + } + print "" + clear_vars() +} + +function add_flags(old,new) { + if (old && new) + return old " | " new + else if (old) + return old + else + return new +} + +# convert operands to flags. +function convert_operands(opnd, i,imm,mod) +{ + imm = null + mod = null + for (i in opnd) { + i = opnd[i] + if (match(i, imm_expr) == 1) { + if (!imm_flag[i]) + semantic_error("Unknown imm opnd: " i) + if (imm) { + if (i != "Ib") + semantic_error("Second IMM error") + imm = add_flags(imm, "INAT_SCNDIMM") + } else + imm = imm_flag[i] + } else if (match(i, modrm_expr)) + mod = "INAT_MODRM" + } + return add_flags(imm, mod) +} + +/^[0-9a-f]+\:/ { + if (NR == 1) + next + # get index + idx = "0x" substr($1, 1, index($1,":") - 1) + if (idx in table) + semantic_error("Redefine " idx " in " tname) + + # check if escaped opcode + if ("escape" == $2) { + if ($3 != "#") + semantic_error("No escaped name") + ref = "" + for (i = 4; i <= NF; i++) + ref = ref $i + if (ref in escape) + semantic_error("Redefine escape (" ref ")") + escape[ref] = geid + geid++ + table[idx] = "INAT_MAKE_ESCAPE(" escape[ref] ")" + next + } + + variant = null + # converts + i = 2 + while (i <= NF) { + opcode = $(i++) + delete opnds + ext = null + flags = null + opnd = null + # parse one opcode + if (match($i, opnd_expr)) { + opnd = $i + split($(i++), opnds, ",") + flags = convert_operands(opnds) + } + if (match($i, ext_expr)) + ext = $(i++) + if (match($i, sep_expr)) + i++ + else if (i < NF) + semantic_error($i " is not a separator") + + # check if group opcode + if (match(opcode, group_expr)) { + if (!(opcode in group)) { + group[opcode] = ggid + ggid++ + } + flags = add_flags(flags, "INAT_MAKE_GROUP(" group[opcode] ")") + } + # check force(or default) 64bit + if (match(ext, force64_expr)) + flags = add_flags(flags, "INAT_FORCE64") + + # check REX prefix + if (match(opcode, rex_expr)) + flags = add_flags(flags, "INAT_MAKE_PREFIX(INAT_PFX_REX)") + + # check coprocessor escape : TODO + if (match(opcode, fpu_expr)) + flags = add_flags(flags, "INAT_MODRM") + + # check VEX only code + if (match(ext, vexonly_expr)) + flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY") + + # check VEX only code + if (match(ext, vexok_expr)) + flags = add_flags(flags, "INAT_VEXOK") + + # check prefixes + if (match(ext, prefix_expr)) { + if (!prefix_num[opcode]) + semantic_error("Unknown prefix: " opcode) + flags = add_flags(flags, "INAT_MAKE_PREFIX(" prefix_num[opcode] ")") + } + if (length(flags) == 0) + continue + # check if last prefix + if (match(ext, lprefix1_expr)) { + lptable1[idx] = add_flags(lptable1[idx],flags) + variant = "INAT_VARIANT" + } else if (match(ext, lprefix2_expr)) { + lptable2[idx] = add_flags(lptable2[idx],flags) + variant = "INAT_VARIANT" + } else if (match(ext, lprefix3_expr)) { + lptable3[idx] = add_flags(lptable3[idx],flags) + variant = "INAT_VARIANT" + } else { + table[idx] = add_flags(table[idx],flags) + } + } + if (variant) + table[idx] = add_flags(table[idx],variant) +} + +END { + if (awkchecked != "") + exit 1 + # print escape opcode map's array + print "/* Escape opcode map array */" + print "const insn_attr_t const *inat_escape_tables[INAT_ESC_MAX + 1]" \ + "[INAT_LSTPFX_MAX + 1] = {" + for (i = 0; i < geid; i++) + for (j = 0; j < max_lprefix; j++) + if (etable[i,j]) + print " ["i"]["j"] = "etable[i,j]"," + print "};\n" + # print group opcode map's array + print "/* Group opcode map array */" + print "const insn_attr_t const *inat_group_tables[INAT_GRP_MAX + 1]"\ + "[INAT_LSTPFX_MAX + 1] = {" + for (i = 0; i < ggid; i++) + for (j = 0; j < max_lprefix; j++) + if (gtable[i,j]) + print " ["i"]["j"] = "gtable[i,j]"," + print "};\n" + # print AVX opcode map's array + print "/* AVX opcode map array */" + print "const insn_attr_t const *inat_avx_tables[X86_VEX_M_MAX + 1]"\ + "[INAT_LSTPFX_MAX + 1] = {" + for (i = 0; i < gaid; i++) + for (j = 0; j < max_lprefix; j++) + if (atable[i,j]) + print " ["i"]["j"] = "atable[i,j]"," + print "};" +} + diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c new file mode 100644 index 00000000000..d8214dc03fa --- /dev/null +++ b/arch/x86/tools/test_get_len.c @@ -0,0 +1,173 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2009 + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include <unistd.h> + +#define unlikely(cond) (cond) + +#include <asm/insn.h> +#include <inat.c> +#include <insn.c> + +/* + * Test of instruction analysis in general and insn_get_length() in + * particular. See if insn_get_length() and the disassembler agree + * on the length of each instruction in an elf disassembly. + * + * Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len + */ + +const char *prog; +static int verbose; +static int x86_64; + +static void usage(void) +{ + fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |" + " %s [-y|-n] [-v] \n", prog); + fprintf(stderr, "\t-y 64bit mode\n"); + fprintf(stderr, "\t-n 32bit mode\n"); + fprintf(stderr, "\t-v verbose mode\n"); + exit(1); +} + +static void malformed_line(const char *line, int line_nr) +{ + fprintf(stderr, "%s: malformed line %d:\n%s", prog, line_nr, line); + exit(3); +} + +static void dump_field(FILE *fp, const char *name, const char *indent, + struct insn_field *field) +{ + fprintf(fp, "%s.%s = {\n", indent, name); + fprintf(fp, "%s\t.value = %d, bytes[] = {%x, %x, %x, %x},\n", + indent, field->value, field->bytes[0], field->bytes[1], + field->bytes[2], field->bytes[3]); + fprintf(fp, "%s\t.got = %d, .nbytes = %d},\n", indent, + field->got, field->nbytes); +} + +static void dump_insn(FILE *fp, struct insn *insn) +{ + fprintf(fp, "Instruction = { \n"); + dump_field(fp, "prefixes", "\t", &insn->prefixes); + dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix); + dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix); + dump_field(fp, "opcode", "\t", &insn->opcode); + dump_field(fp, "modrm", "\t", &insn->modrm); + dump_field(fp, "sib", "\t", &insn->sib); + dump_field(fp, "displacement", "\t", &insn->displacement); + dump_field(fp, "immediate1", "\t", &insn->immediate1); + dump_field(fp, "immediate2", "\t", &insn->immediate2); + fprintf(fp, "\t.attr = %x, .opnd_bytes = %d, .addr_bytes = %d,\n", + insn->attr, insn->opnd_bytes, insn->addr_bytes); + fprintf(fp, "\t.length = %d, .x86_64 = %d, .kaddr = %p}\n", + insn->length, insn->x86_64, insn->kaddr); +} + +static void parse_args(int argc, char **argv) +{ + int c; + prog = argv[0]; + while ((c = getopt(argc, argv, "ynv")) != -1) { + switch (c) { + case 'y': + x86_64 = 1; + break; + case 'n': + x86_64 = 0; + break; + case 'v': + verbose = 1; + break; + default: + usage(); + } + } +} + +#define BUFSIZE 256 + +int main(int argc, char **argv) +{ + char line[BUFSIZE], sym[BUFSIZE] = "<unknown>"; + unsigned char insn_buf[16]; + struct insn insn; + int insns = 0, c; + int warnings = 0; + + parse_args(argc, argv); + + while (fgets(line, BUFSIZE, stdin)) { + char copy[BUFSIZE], *s, *tab1, *tab2; + int nb = 0; + unsigned int b; + + if (line[0] == '<') { + /* Symbol line */ + strcpy(sym, line); + continue; + } + + insns++; + memset(insn_buf, 0, 16); + strcpy(copy, line); + tab1 = strchr(copy, '\t'); + if (!tab1) + malformed_line(line, insns); + s = tab1 + 1; + s += strspn(s, " "); + tab2 = strchr(s, '\t'); + if (!tab2) + malformed_line(line, insns); + *tab2 = '\0'; /* Characters beyond tab2 aren't examined */ + while (s < tab2) { + if (sscanf(s, "%x", &b) == 1) { + insn_buf[nb++] = (unsigned char) b; + s += 3; + } else + break; + } + /* Decode an instruction */ + insn_init(&insn, insn_buf, x86_64); + insn_get_length(&insn); + if (insn.length != nb) { + warnings++; + fprintf(stderr, "Warning: %s found difference at %s\n", + prog, sym); + fprintf(stderr, "Warning: %s", line); + fprintf(stderr, "Warning: objdump says %d bytes, but " + "insn_get_length() says %d\n", nb, + insn.length); + if (verbose) + dump_insn(stderr, &insn); + } + } + if (warnings) + fprintf(stderr, "Warning: decoded and checked %d" + " instructions with %d warnings\n", insns, warnings); + else + fprintf(stderr, "Succeed: decoded and checked %d" + " instructions\n", insns); + return 0; +} diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 88112b49f02..6b4ffedb93c 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -122,7 +122,7 @@ quiet_cmd_vdso = VDSO $@ $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) -VDSO_LDFLAGS = -fPIC -shared $(call ld-option, -Wl$(comma)--hash-style=sysv) +VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) GCOV_PROFILE := n # diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 6a40b78b46a..ee55754cc3c 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -86,14 +86,47 @@ notrace static noinline int do_monotonic(struct timespec *ts) return 0; } +notrace static noinline int do_realtime_coarse(struct timespec *ts) +{ + unsigned long seq; + do { + seq = read_seqbegin(>od->lock); + ts->tv_sec = gtod->wall_time_coarse.tv_sec; + ts->tv_nsec = gtod->wall_time_coarse.tv_nsec; + } while (unlikely(read_seqretry(>od->lock, seq))); + return 0; +} + +notrace static noinline int do_monotonic_coarse(struct timespec *ts) +{ + unsigned long seq, ns, secs; + do { + seq = read_seqbegin(>od->lock); + secs = gtod->wall_time_coarse.tv_sec; + ns = gtod->wall_time_coarse.tv_nsec; + secs += gtod->wall_to_monotonic.tv_sec; + ns += gtod->wall_to_monotonic.tv_nsec; + } while (unlikely(read_seqretry(>od->lock, seq))); + vset_normalized_timespec(ts, secs, ns); + return 0; +} + notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { - if (likely(gtod->sysctl_enabled && gtod->clock.vread)) + if (likely(gtod->sysctl_enabled)) switch (clock) { case CLOCK_REALTIME: - return do_realtime(ts); + if (likely(gtod->clock.vread)) + return do_realtime(ts); + break; case CLOCK_MONOTONIC: - return do_monotonic(ts); + if (likely(gtod->clock.vread)) + return do_monotonic(ts); + break; + case CLOCK_REALTIME_COARSE: + return do_realtime_coarse(ts); + case CLOCK_MONOTONIC_COARSE: + return do_monotonic_coarse(ts); } return vdso_fallback_gettime(clock, ts); } diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 7410640db17..3bb4fc21f4f 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -8,6 +8,7 @@ endif # Make sure early boot has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_enlighten.o := $(nostackp) +CFLAGS_mmu.o := $(nostackp) obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ @@ -16,3 +17,4 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o + diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c index b53225d2cac..e133ce25e29 100644 --- a/arch/x86/xen/debugfs.c +++ b/arch/x86/xen/debugfs.c @@ -100,7 +100,7 @@ static int xen_array_release(struct inode *inode, struct file *file) return 0; } -static struct file_operations u32_array_fops = { +static const struct file_operations u32_array_fops = { .owner = THIS_MODULE, .open = u32_array_open, .release= xen_array_release, diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b62ccb840cf..dfbf70e6586 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -51,6 +51,7 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/reboot.h> +#include <asm/stackprotector.h> #include "xen-ops.h" #include "mmu.h" @@ -177,6 +178,7 @@ static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0; static void xen_cpuid(unsigned int *ax, unsigned int *bx, unsigned int *cx, unsigned int *dx) { + unsigned maskebx = ~0; unsigned maskecx = ~0; unsigned maskedx = ~0; @@ -184,9 +186,16 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, * Mask out inconvenient features, to try and disable as many * unsupported kernel subsystems as possible. */ - if (*ax == 1) { + switch (*ax) { + case 1: maskecx = cpuid_leaf1_ecx_mask; maskedx = cpuid_leaf1_edx_mask; + break; + + case 0xb: + /* Suppress extended topology stuff */ + maskebx = 0; + break; } asm(XEN_EMULATE_PREFIX "cpuid" @@ -196,6 +205,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, "=d" (*dx) : "0" (*ax), "2" (*cx)); + *bx &= maskebx; *cx &= maskecx; *dx &= maskedx; } @@ -330,18 +340,28 @@ static void xen_load_gdt(const struct desc_ptr *dtr) unsigned long frames[pages]; int f; - /* A GDT can be up to 64k in size, which corresponds to 8192 - 8-byte entries, or 16 4k pages.. */ + /* + * A GDT can be up to 64k in size, which corresponds to 8192 + * 8-byte entries, or 16 4k pages.. + */ BUG_ON(size > 65536); BUG_ON(va & ~PAGE_MASK); for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { int level; - pte_t *ptep = lookup_address(va, &level); + pte_t *ptep; unsigned long pfn, mfn; void *virt; + /* + * The GDT is per-cpu and is in the percpu data area. + * That can be virtually mapped, so we need to do a + * page-walk to get the underlying MFN for the + * hypercall. The page can also be in the kernel's + * linear range, so we need to RO that mapping too. + */ + ptep = lookup_address(va, &level); BUG_ON(ptep == NULL); pfn = pte_pfn(*ptep); @@ -358,6 +378,44 @@ static void xen_load_gdt(const struct desc_ptr *dtr) BUG(); } +/* + * load_gdt for early boot, when the gdt is only mapped once + */ +static __init void xen_load_gdt_boot(const struct desc_ptr *dtr) +{ + unsigned long va = dtr->address; + unsigned int size = dtr->size + 1; + unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + unsigned long frames[pages]; + int f; + + /* + * A GDT can be up to 64k in size, which corresponds to 8192 + * 8-byte entries, or 16 4k pages.. + */ + + BUG_ON(size > 65536); + BUG_ON(va & ~PAGE_MASK); + + for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { + pte_t pte; + unsigned long pfn, mfn; + + pfn = virt_to_pfn(va); + mfn = pfn_to_mfn(pfn); + + pte = pfn_pte(pfn, PAGE_KERNEL_RO); + + if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0)) + BUG(); + + frames[f] = mfn; + } + + if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct))) + BUG(); +} + static void load_TLS_descriptor(struct thread_struct *t, unsigned int cpu, unsigned int i) { @@ -581,6 +639,29 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, preempt_enable(); } +/* + * Version of write_gdt_entry for use at early boot-time needed to + * update an entry as simply as possible. + */ +static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, + const void *desc, int type) +{ + switch (type) { + case DESC_LDT: + case DESC_TSS: + /* ignore */ + break; + + default: { + xmaddr_t maddr = virt_to_machine(&dt[entry]); + + if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) + dt[entry] = *(struct desc_struct *)desc; + } + + } +} + static void xen_load_sp0(struct tss_struct *tss, struct thread_struct *thread) { @@ -840,19 +921,9 @@ static const struct pv_info xen_info __initdata = { static const struct pv_init_ops xen_init_ops __initdata = { .patch = xen_patch, - - .banner = xen_banner, - .memory_setup = xen_memory_setup, - .arch_setup = xen_arch_setup, - .post_allocator_init = xen_post_allocator_init, }; static const struct pv_time_ops xen_time_ops __initdata = { - .time_init = xen_time_init, - - .set_wallclock = xen_set_wallclock, - .get_wallclock = xen_get_wallclock, - .get_tsc_khz = xen_tsc_khz, .sched_clock = xen_sched_clock, }; @@ -918,8 +989,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC - .setup_boot_clock = paravirt_nop, - .setup_secondary_clock = paravirt_nop, .startup_ipi_hook = paravirt_nop, #endif }; @@ -965,6 +1034,23 @@ static const struct machine_ops __initdata xen_machine_ops = { .emergency_restart = xen_emergency_restart, }; +/* + * Set up the GDT and segment registers for -fstack-protector. Until + * we do this, we have to be careful not to call any stack-protected + * function, which is most of the kernel. + */ +static void __init xen_setup_stackprotector(void) +{ + pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot; + pv_cpu_ops.load_gdt = xen_load_gdt_boot; + + setup_stack_canary_segment(0); + switch_to_new_gdt(0); + + pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry; + pv_cpu_ops.load_gdt = xen_load_gdt; +} + /* First C function to be called on Xen boot */ asmlinkage void __init xen_start_kernel(void) { @@ -981,16 +1067,49 @@ asmlinkage void __init xen_start_kernel(void) pv_time_ops = xen_time_ops; pv_cpu_ops = xen_cpu_ops; pv_apic_ops = xen_apic_ops; - pv_mmu_ops = xen_mmu_ops; -#ifdef CONFIG_X86_64 + x86_init.resources.memory_setup = xen_memory_setup; + x86_init.oem.arch_setup = xen_arch_setup; + x86_init.oem.banner = xen_banner; + + x86_init.timers.timer_init = xen_time_init; + x86_init.timers.setup_percpu_clockev = x86_init_noop; + x86_cpuinit.setup_percpu_clockev = x86_init_noop; + + x86_platform.calibrate_tsc = xen_tsc_khz; + x86_platform.get_wallclock = xen_get_wallclock; + x86_platform.set_wallclock = xen_set_wallclock; + /* - * Setup percpu state. We only need to do this for 64-bit - * because 32-bit already has %fs set properly. + * Set up some pagetable state before starting to set any ptes. */ - load_percpu_segment(0); + + xen_init_mmu_ops(); + + /* Prevent unwanted bits from being set in PTEs. */ + __supported_pte_mask &= ~_PAGE_GLOBAL; + if (!xen_initial_domain()) + __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); + + __supported_pte_mask |= _PAGE_IOMAP; + +#ifdef CONFIG_X86_64 + /* Work out if we support NX */ + check_efer(); #endif + xen_setup_features(); + + /* Get mfn list */ + if (!xen_feature(XENFEAT_auto_translated_physmap)) + xen_build_dynamic_phys_to_machine(); + + /* + * Set up kernel GDT and segment registers, mainly so that + * -fstack-protector code can be executed. + */ + xen_setup_stackprotector(); + xen_init_irq_ops(); xen_init_cpuid_mask(); @@ -1001,8 +1120,6 @@ asmlinkage void __init xen_start_kernel(void) set_xen_basic_apic_ops(); #endif - xen_setup_features(); - if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; @@ -1019,22 +1136,8 @@ asmlinkage void __init xen_start_kernel(void) xen_smp_init(); - /* Get mfn list */ - if (!xen_feature(XENFEAT_auto_translated_physmap)) - xen_build_dynamic_phys_to_machine(); - pgd = (pgd_t *)xen_start_info->pt_base; - /* Prevent unwanted bits from being set in PTEs. */ - __supported_pte_mask &= ~_PAGE_GLOBAL; - if (!xen_initial_domain()) - __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); - -#ifdef CONFIG_X86_64 - /* Work out if we support NX */ - check_efer(); -#endif - /* Don't do the full vcpu_info placement stuff until we have a possible map and a non-dummy shared_info. */ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index cfd17799bd6..9d30105a0c4 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -1,5 +1,7 @@ #include <linux/hardirq.h> +#include <asm/x86_init.h> + #include <xen/interface/xen.h> #include <xen/interface/sched.h> #include <xen/interface/vcpu.h> @@ -112,8 +114,6 @@ static void xen_halt(void) } static const struct pv_irq_ops xen_irq_ops __initdata = { - .init_IRQ = xen_init_IRQ, - .save_fl = PV_CALLEE_SAVE(xen_save_fl), .restore_fl = PV_CALLEE_SAVE(xen_restore_fl), .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), @@ -129,4 +129,5 @@ static const struct pv_irq_ops xen_irq_ops __initdata = { void __init xen_init_irq_ops() { pv_irq_ops = xen_irq_ops; + x86_init.irqs.intr_init = xen_init_IRQ; } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 4ceb2858165..3bf7b1d250c 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1165,14 +1165,14 @@ static void xen_drop_mm_ref(struct mm_struct *mm) /* Get the "official" set of cpus referring to our pagetable. */ if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { for_each_online_cpu(cpu) { - if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask) + if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) continue; smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); } return; } - cpumask_copy(mask, &mm->cpu_vm_mask); + cpumask_copy(mask, mm_cpumask(mm)); /* It's possible that a vcpu may have a stale reference to our cr3, because its in lazy mode, and it hasn't yet flushed @@ -1229,9 +1229,12 @@ static __init void xen_pagetable_setup_start(pgd_t *base) { } +static void xen_post_allocator_init(void); + static __init void xen_pagetable_setup_done(pgd_t *base) { xen_setup_shared_info(); + xen_post_allocator_init(); } static void xen_write_cr2(unsigned long cr2) @@ -1841,7 +1844,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #endif } -__init void xen_post_allocator_init(void) +static __init void xen_post_allocator_init(void) { pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; @@ -1875,10 +1878,7 @@ static void xen_leave_lazy_mmu(void) preempt_enable(); } -const struct pv_mmu_ops xen_mmu_ops __initdata = { - .pagetable_setup_start = xen_pagetable_setup_start, - .pagetable_setup_done = xen_pagetable_setup_done, - +static const struct pv_mmu_ops xen_mmu_ops __initdata = { .read_cr2 = xen_read_cr2, .write_cr2 = xen_write_cr2, @@ -1954,6 +1954,12 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = { .set_fixmap = xen_set_fixmap, }; +void __init xen_init_mmu_ops(void) +{ + x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; + x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; + pv_mmu_ops = xen_mmu_ops; +} #ifdef CONFIG_XEN_DEBUG_FS diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index da730262489..5fe6bc7f5ec 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -59,5 +59,5 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, unsigned long xen_read_cr2_direct(void); -extern const struct pv_mmu_ops xen_mmu_ops; +extern void xen_init_mmu_ops(void); #endif /* _XEN_MMU_H */ diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 429834ec168..fe03eeed7b4 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -236,6 +236,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->user_regs.ss = __KERNEL_DS; #ifdef CONFIG_X86_32 ctxt->user_regs.fs = __KERNEL_PERCPU; + ctxt->user_regs.gs = __KERNEL_STACK_CANARY; #else ctxt->gs_base_kernel = per_cpu_offset(cpu); #endif diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 5601506f2dd..36a5141108d 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -187,7 +187,6 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl struct xen_spinlock *prev; int irq = __get_cpu_var(lock_kicker_irq); int ret; - unsigned long flags; u64 start; /* If kicker interrupts not initialized yet, just spin */ @@ -199,16 +198,12 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl /* announce we're spinning */ prev = spinning_lock(xl); - flags = __raw_local_save_flags(); - if (irq_enable) { - ADD_STATS(taken_slow_irqenable, 1); - raw_local_irq_enable(); - } - ADD_STATS(taken_slow, 1); ADD_STATS(taken_slow_nested, prev != NULL); do { + unsigned long flags; + /* clear pending */ xen_clear_irq_pending(irq); @@ -228,6 +223,12 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl goto out; } + flags = __raw_local_save_flags(); + if (irq_enable) { + ADD_STATS(taken_slow_irqenable, 1); + raw_local_irq_enable(); + } + /* * Block until irq becomes pending. If we're * interrupted at this point (after the trylock but @@ -238,13 +239,15 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl * pending. */ xen_poll_irq(irq); + + raw_local_irq_restore(flags); + ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); out: - raw_local_irq_restore(flags); unspinning_lock(xl, prev); spin_time_accum_blocked(start); @@ -323,8 +326,13 @@ static void xen_spin_unlock(struct raw_spinlock *lock) smp_wmb(); /* make sure no writes get moved after unlock */ xl->lock = 0; /* release lock */ - /* make sure unlock happens before kick */ - barrier(); + /* + * Make sure unlock happens before checking for waiting + * spinners. We need a strong barrier to enforce the + * write-read ordering to different memory locations, as the + * CPU makes no implied guarantees about their ordering. + */ + mb(); if (unlikely(xl->spinners)) xen_spin_unlock_slow(xl); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 22494fd4c9b..355fa6b99c9 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -30,8 +30,6 @@ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_ident_map_ISA(void); void xen_reserve_top(void); -void xen_post_allocator_init(void); - char * __init xen_memory_setup(void); void __init xen_arch_setup(void); void __init xen_init_IRQ(void); |