diff options
Diffstat (limited to 'arch/x86')
125 files changed, 2452 insertions, 1508 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1b2182b4d5c..32a1918e1b8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1332,7 +1332,9 @@ config MATH_EMULATION kernel, it won't hurt. config MTRR - bool "MTRR (Memory Type Range Register) support" + bool + default y + prompt "MTRR (Memory Type Range Register) support" if EMBEDDED ---help--- On Intel P6 family processors (Pentium Pro, Pentium II and later) the Memory Type Range Registers (MTRRs) may be used to control @@ -1398,7 +1400,8 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT config X86_PAT bool - prompt "x86 PAT support" + default y + prompt "x86 PAT support" if EMBEDDED depends on MTRR ---help--- Use PAT attributes to setup page level cache control. @@ -1604,7 +1607,7 @@ config COMPAT_VDSO depends on X86_32 || IA32_EMULATION ---help--- Map the 32-bit VDSO to the predictable old-style address too. - ---help--- + Say N here if you are running a sufficiently recent glibc version (2.3.3 or later), to remove the high-mapped VDSO mapping and to exclusively use the randomized VDSO. diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 5e99762eb5c..08e442bc3ab 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -301,15 +301,11 @@ config X86_CPU # # Define implied options from the CPU selection here -config X86_L1_CACHE_BYTES +config X86_INTERNODE_CACHE_SHIFT int - default "128" if MPSC - default "64" if GENERIC_CPU || MK8 || MCORE2 || MATOM || X86_32 - -config X86_INTERNODE_CACHE_BYTES - int - default "4096" if X86_VSMP - default X86_L1_CACHE_BYTES if !X86_VSMP + default "12" if X86_VSMP + default "7" if NUMA + default X86_L1_CACHE_SHIFT config X86_CMPXCHG def_bool X86_64 || (X86_32 && !M386) @@ -317,9 +313,9 @@ config X86_CMPXCHG config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC + default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX - default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU config X86_XADD def_bool y diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index cbf0776dbec..1255d953c65 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -46,6 +46,13 @@ cflags-$(CONFIG_MGEODE_LX) += $(call cc-option,-march=geode,-march=pentium-mmx) # cpu entries cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) +# Work around the pentium-mmx code generator madness of gcc4.4.x which +# does stack alignment by generating horrible code _before_ the mcount +# prologue (push %ebp, mov %esp, %ebp) which breaks the function graph +# tracer assumptions. For i686, generic, core2 this is set by the +# compiler anyway +cflags-$(CONFIG_FUNCTION_GRAPH_TRACER) += $(call cc-option,-maccumulate-outgoing-args) + # Bug fix for binutils: this option is required in order to keep # binutils from generating NOPL instructions against our will. ifneq ($(CONFIG_X86_P6_NOP),y) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 077e1b69198..faff0dc9c06 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -107,8 +107,7 @@ ENTRY(startup_32) lgdt gdt(%ebp) /* Enable PAE mode */ - xorl %eax, %eax - orl $(X86_CR4_PAE), %eax + movl $(X86_CR4_PAE), %eax movl %eax, %cr4 /* diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index f4193bb4878..a6f1a59a5b0 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -4,6 +4,7 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) #undef i386 +#include <asm/cache.h> #include <asm/page_types.h> #ifdef CONFIG_X86_64 @@ -46,7 +47,7 @@ SECTIONS *(.data.*) _edata = . ; } - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + . = ALIGN(L1_CACHE_BYTES); .bss : { _bss = . ; *(.bss) diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index d42da380249..f767164cd5d 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -27,6 +27,12 @@ static void store_cursor_position(void) boot_params.screen_info.orig_x = oreg.dl; boot_params.screen_info.orig_y = oreg.dh; + + if (oreg.ch & 0x20) + boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR; + + if ((oreg.ch & 0x1f) > (oreg.cl & 0x1f)) + boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR; } static void store_video_mode(void) diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index cfb0010fa94..1a58ad89fdf 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o +obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o @@ -24,3 +25,5 @@ twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o + +ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index eb0566e8331..20bb0e1ac68 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -16,6 +16,7 @@ */ #include <linux/linkage.h> +#include <asm/inst.h> .text @@ -122,103 +123,72 @@ ENTRY(aesni_set_key) movups 0x10(%rsi), %xmm2 # other user key movaps %xmm2, (%rcx) add $0x10, %rcx - # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01 + AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 call _key_expansion_256a - # aeskeygenassist $0x1, %xmm0, %xmm1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01 + AESKEYGENASSIST 0x1 %xmm0 %xmm1 call _key_expansion_256b - # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02 + AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 call _key_expansion_256a - # aeskeygenassist $0x2, %xmm0, %xmm1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02 + AESKEYGENASSIST 0x2 %xmm0 %xmm1 call _key_expansion_256b - # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04 + AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 call _key_expansion_256a - # aeskeygenassist $0x4, %xmm0, %xmm1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04 + AESKEYGENASSIST 0x4 %xmm0 %xmm1 call _key_expansion_256b - # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08 + AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 call _key_expansion_256a - # aeskeygenassist $0x8, %xmm0, %xmm1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08 + AESKEYGENASSIST 0x8 %xmm0 %xmm1 call _key_expansion_256b - # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10 + AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 call _key_expansion_256a - # aeskeygenassist $0x10, %xmm0, %xmm1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10 + AESKEYGENASSIST 0x10 %xmm0 %xmm1 call _key_expansion_256b - # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20 + AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 call _key_expansion_256a - # aeskeygenassist $0x20, %xmm0, %xmm1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20 + AESKEYGENASSIST 0x20 %xmm0 %xmm1 call _key_expansion_256b - # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40 + AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 call _key_expansion_256a jmp .Ldec_key .Lenc_key192: movq 0x10(%rsi), %xmm2 # other user key - # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01 + AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 call _key_expansion_192a - # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02 + AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 call _key_expansion_192b - # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04 + AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 call _key_expansion_192a - # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08 + AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 call _key_expansion_192b - # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10 + AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 call _key_expansion_192a - # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20 + AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 call _key_expansion_192b - # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40 + AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 call _key_expansion_192a - # aeskeygenassist $0x80, %xmm2, %xmm1 # round 8 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x80 + AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8 call _key_expansion_192b jmp .Ldec_key .Lenc_key128: - # aeskeygenassist $0x1, %xmm0, %xmm1 # round 1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01 + AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1 call _key_expansion_128 - # aeskeygenassist $0x2, %xmm0, %xmm1 # round 2 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02 + AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2 call _key_expansion_128 - # aeskeygenassist $0x4, %xmm0, %xmm1 # round 3 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04 + AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3 call _key_expansion_128 - # aeskeygenassist $0x8, %xmm0, %xmm1 # round 4 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08 + AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4 call _key_expansion_128 - # aeskeygenassist $0x10, %xmm0, %xmm1 # round 5 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10 + AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5 call _key_expansion_128 - # aeskeygenassist $0x20, %xmm0, %xmm1 # round 6 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20 + AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6 call _key_expansion_128 - # aeskeygenassist $0x40, %xmm0, %xmm1 # round 7 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x40 + AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7 call _key_expansion_128 - # aeskeygenassist $0x80, %xmm0, %xmm1 # round 8 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x80 + AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8 call _key_expansion_128 - # aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x1b + AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9 call _key_expansion_128 - # aeskeygenassist $0x36, %xmm0, %xmm1 # round 10 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x36 + AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10 call _key_expansion_128 .Ldec_key: sub $0x10, %rcx @@ -231,8 +201,7 @@ ENTRY(aesni_set_key) .align 4 .Ldec_key_loop: movaps (%rdi), %xmm0 - # aesimc %xmm0, %xmm1 - .byte 0x66, 0x0f, 0x38, 0xdb, 0xc8 + AESIMC %xmm0 %xmm1 movaps %xmm1, (%rsi) add $0x10, %rdi sub $0x10, %rsi @@ -274,51 +243,37 @@ _aesni_enc1: je .Lenc192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps -0x50(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE .align 4 .Lenc192: movaps -0x40(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps -0x30(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE .align 4 .Lenc128: movaps -0x20(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps -0x10(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps (TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps 0x10(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps 0x20(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps 0x30(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps 0x40(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps 0x50(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps 0x60(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps 0x70(TKEYP), KEY - # aesenclast KEY, STATE # last round - .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2 + AESENCLAST KEY STATE ret /* @@ -353,135 +308,79 @@ _aesni_enc4: je .L4enc192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps -0x50(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 #.align 4 .L4enc192: movaps -0x40(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps -0x30(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 #.align 4 .L4enc128: movaps -0x20(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps -0x10(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps (TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps 0x10(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps 0x20(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps 0x30(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps 0x40(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps 0x50(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps 0x60(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps 0x70(TKEYP), KEY - # aesenclast KEY, STATE1 # last round - .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2 - # aesenclast KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdd, 0xe2 - # aesenclast KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdd, 0xea - # aesenclast KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdd, 0xf2 + AESENCLAST KEY STATE1 # last round + AESENCLAST KEY STATE2 + AESENCLAST KEY STATE3 + AESENCLAST KEY STATE4 ret /* @@ -518,51 +417,37 @@ _aesni_dec1: je .Ldec192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps -0x50(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE .align 4 .Ldec192: movaps -0x40(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps -0x30(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE .align 4 .Ldec128: movaps -0x20(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps -0x10(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps (TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps 0x10(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps 0x20(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps 0x30(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps 0x40(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps 0x50(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps 0x60(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps 0x70(TKEYP), KEY - # aesdeclast KEY, STATE # last round - .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2 + AESDECLAST KEY STATE ret /* @@ -597,135 +482,79 @@ _aesni_dec4: je .L4dec192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps -0x50(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 .align 4 .L4dec192: movaps -0x40(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps -0x30(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 .align 4 .L4dec128: movaps -0x20(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps -0x10(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps (TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps 0x10(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps 0x20(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps 0x30(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps 0x40(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps 0x50(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps 0x60(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps 0x70(TKEYP), KEY - # aesdeclast KEY, STATE1 # last round - .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2 - # aesdeclast KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdf, 0xe2 - # aesdeclast KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdf, 0xea - # aesdeclast KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdf, 0xf2 + AESDECLAST KEY STATE1 # last round + AESDECLAST KEY STATE2 + AESDECLAST KEY STATE3 + AESDECLAST KEY STATE4 ret /* diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S new file mode 100644 index 00000000000..1eb7f90cb7b --- /dev/null +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -0,0 +1,157 @@ +/* + * Accelerated GHASH implementation with Intel PCLMULQDQ-NI + * instructions. This file contains accelerated part of ghash + * implementation. More information about PCLMULQDQ can be found at: + * + * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/ + * + * Copyright (c) 2009 Intel Corp. + * Author: Huang Ying <ying.huang@intel.com> + * Vinodh Gopal + * Erdinc Ozturk + * Deniz Karakoyunlu + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/inst.h> + +.data + +.align 16 +.Lbswap_mask: + .octa 0x000102030405060708090a0b0c0d0e0f +.Lpoly: + .octa 0xc2000000000000000000000000000001 +.Ltwo_one: + .octa 0x00000001000000000000000000000001 + +#define DATA %xmm0 +#define SHASH %xmm1 +#define T1 %xmm2 +#define T2 %xmm3 +#define T3 %xmm4 +#define BSWAP %xmm5 +#define IN1 %xmm6 + +.text + +/* + * __clmul_gf128mul_ble: internal ABI + * input: + * DATA: operand1 + * SHASH: operand2, hash_key << 1 mod poly + * output: + * DATA: operand1 * operand2 mod poly + * changed: + * T1 + * T2 + * T3 + */ +__clmul_gf128mul_ble: + movaps DATA, T1 + pshufd $0b01001110, DATA, T2 + pshufd $0b01001110, SHASH, T3 + pxor DATA, T2 + pxor SHASH, T3 + + PCLMULQDQ 0x00 SHASH DATA # DATA = a0 * b0 + PCLMULQDQ 0x11 SHASH T1 # T1 = a1 * b1 + PCLMULQDQ 0x00 T3 T2 # T2 = (a1 + a0) * (b1 + b0) + pxor DATA, T2 + pxor T1, T2 # T2 = a0 * b1 + a1 * b0 + + movaps T2, T3 + pslldq $8, T3 + psrldq $8, T2 + pxor T3, DATA + pxor T2, T1 # <T1:DATA> is result of + # carry-less multiplication + + # first phase of the reduction + movaps DATA, T3 + psllq $1, T3 + pxor DATA, T3 + psllq $5, T3 + pxor DATA, T3 + psllq $57, T3 + movaps T3, T2 + pslldq $8, T2 + psrldq $8, T3 + pxor T2, DATA + pxor T3, T1 + + # second phase of the reduction + movaps DATA, T2 + psrlq $5, T2 + pxor DATA, T2 + psrlq $1, T2 + pxor DATA, T2 + psrlq $1, T2 + pxor T2, T1 + pxor T1, DATA + ret + +/* void clmul_ghash_mul(char *dst, const be128 *shash) */ +ENTRY(clmul_ghash_mul) + movups (%rdi), DATA + movups (%rsi), SHASH + movaps .Lbswap_mask, BSWAP + PSHUFB_XMM BSWAP DATA + call __clmul_gf128mul_ble + PSHUFB_XMM BSWAP DATA + movups DATA, (%rdi) + ret + +/* + * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, + * const be128 *shash); + */ +ENTRY(clmul_ghash_update) + cmp $16, %rdx + jb .Lupdate_just_ret # check length + movaps .Lbswap_mask, BSWAP + movups (%rdi), DATA + movups (%rcx), SHASH + PSHUFB_XMM BSWAP DATA +.align 4 +.Lupdate_loop: + movups (%rsi), IN1 + PSHUFB_XMM BSWAP IN1 + pxor IN1, DATA + call __clmul_gf128mul_ble + sub $16, %rdx + add $16, %rsi + cmp $16, %rdx + jge .Lupdate_loop + PSHUFB_XMM BSWAP DATA + movups DATA, (%rdi) +.Lupdate_just_ret: + ret + +/* + * void clmul_ghash_setkey(be128 *shash, const u8 *key); + * + * Calculate hash_key << 1 mod poly + */ +ENTRY(clmul_ghash_setkey) + movaps .Lbswap_mask, BSWAP + movups (%rsi), %xmm0 + PSHUFB_XMM BSWAP %xmm0 + movaps %xmm0, %xmm1 + psllq $1, %xmm0 + psrlq $63, %xmm1 + movaps %xmm1, %xmm2 + pslldq $8, %xmm1 + psrldq $8, %xmm2 + por %xmm1, %xmm0 + # reduction + pshufd $0b00100100, %xmm2, %xmm1 + pcmpeqd .Ltwo_one, %xmm1 + pand .Lpoly, %xmm1 + pxor %xmm1, %xmm0 + movups %xmm0, (%rdi) + ret diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c new file mode 100644 index 00000000000..cbcc8d8ea93 --- /dev/null +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -0,0 +1,333 @@ +/* + * Accelerated GHASH implementation with Intel PCLMULQDQ-NI + * instructions. This file contains glue code. + * + * Copyright (c) 2009 Intel Corp. + * Author: Huang Ying <ying.huang@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/crypto.h> +#include <crypto/algapi.h> +#include <crypto/cryptd.h> +#include <crypto/gf128mul.h> +#include <crypto/internal/hash.h> +#include <asm/i387.h> + +#define GHASH_BLOCK_SIZE 16 +#define GHASH_DIGEST_SIZE 16 + +void clmul_ghash_mul(char *dst, const be128 *shash); + +void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, + const be128 *shash); + +void clmul_ghash_setkey(be128 *shash, const u8 *key); + +struct ghash_async_ctx { + struct cryptd_ahash *cryptd_tfm; +}; + +struct ghash_ctx { + be128 shash; +}; + +struct ghash_desc_ctx { + u8 buffer[GHASH_BLOCK_SIZE]; + u32 bytes; +}; + +static int ghash_init(struct shash_desc *desc) +{ + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + + memset(dctx, 0, sizeof(*dctx)); + + return 0; +} + +static int ghash_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen) +{ + struct ghash_ctx *ctx = crypto_shash_ctx(tfm); + + if (keylen != GHASH_BLOCK_SIZE) { + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + clmul_ghash_setkey(&ctx->shash, key); + + return 0; +} + +static int ghash_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); + u8 *dst = dctx->buffer; + + kernel_fpu_begin(); + if (dctx->bytes) { + int n = min(srclen, dctx->bytes); + u8 *pos = dst + (GHASH_BLOCK_SIZE - dctx->bytes); + + dctx->bytes -= n; + srclen -= n; + + while (n--) + *pos++ ^= *src++; + + if (!dctx->bytes) + clmul_ghash_mul(dst, &ctx->shash); + } + + clmul_ghash_update(dst, src, srclen, &ctx->shash); + kernel_fpu_end(); + + if (srclen & 0xf) { + src += srclen - (srclen & 0xf); + srclen &= 0xf; + dctx->bytes = GHASH_BLOCK_SIZE - srclen; + while (srclen--) + *dst++ ^= *src++; + } + + return 0; +} + +static void ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx) +{ + u8 *dst = dctx->buffer; + + if (dctx->bytes) { + u8 *tmp = dst + (GHASH_BLOCK_SIZE - dctx->bytes); + + while (dctx->bytes--) + *tmp++ ^= 0; + + kernel_fpu_begin(); + clmul_ghash_mul(dst, &ctx->shash); + kernel_fpu_end(); + } + + dctx->bytes = 0; +} + +static int ghash_final(struct shash_desc *desc, u8 *dst) +{ + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); + u8 *buf = dctx->buffer; + + ghash_flush(ctx, dctx); + memcpy(dst, buf, GHASH_BLOCK_SIZE); + + return 0; +} + +static struct shash_alg ghash_alg = { + .digestsize = GHASH_DIGEST_SIZE, + .init = ghash_init, + .update = ghash_update, + .final = ghash_final, + .setkey = ghash_setkey, + .descsize = sizeof(struct ghash_desc_ctx), + .base = { + .cra_name = "__ghash", + .cra_driver_name = "__ghash-pclmulqdqni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = GHASH_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct ghash_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ghash_alg.base.cra_list), + }, +}; + +static int ghash_async_init(struct ahash_request *req) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct ahash_request *cryptd_req = ahash_request_ctx(req); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + + if (!irq_fpu_usable()) { + memcpy(cryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + return crypto_ahash_init(cryptd_req); + } else { + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); + + desc->tfm = child; + desc->flags = req->base.flags; + return crypto_shash_init(desc); + } +} + +static int ghash_async_update(struct ahash_request *req) +{ + struct ahash_request *cryptd_req = ahash_request_ctx(req); + + if (!irq_fpu_usable()) { + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + + memcpy(cryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + return crypto_ahash_update(cryptd_req); + } else { + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + return shash_ahash_update(req, desc); + } +} + +static int ghash_async_final(struct ahash_request *req) +{ + struct ahash_request *cryptd_req = ahash_request_ctx(req); + + if (!irq_fpu_usable()) { + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + + memcpy(cryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + return crypto_ahash_final(cryptd_req); + } else { + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + return crypto_shash_final(desc, req->result); + } +} + +static int ghash_async_digest(struct ahash_request *req) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct ahash_request *cryptd_req = ahash_request_ctx(req); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + + if (!irq_fpu_usable()) { + memcpy(cryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + return crypto_ahash_digest(cryptd_req); + } else { + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); + + desc->tfm = child; + desc->flags = req->base.flags; + return shash_ahash_digest(req, desc); + } +} + +static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key, + unsigned int keylen) +{ + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct crypto_ahash *child = &ctx->cryptd_tfm->base; + int err; + + crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm) + & CRYPTO_TFM_REQ_MASK); + err = crypto_ahash_setkey(child, key, keylen); + crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child) + & CRYPTO_TFM_RES_MASK); + + return 0; +} + +static int ghash_async_init_tfm(struct crypto_tfm *tfm) +{ + struct cryptd_ahash *cryptd_tfm; + struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm); + + cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ctx->cryptd_tfm = cryptd_tfm; + crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), + sizeof(struct ahash_request) + + crypto_ahash_reqsize(&cryptd_tfm->base)); + + return 0; +} + +static void ghash_async_exit_tfm(struct crypto_tfm *tfm) +{ + struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm); + + cryptd_free_ahash(ctx->cryptd_tfm); +} + +static struct ahash_alg ghash_async_alg = { + .init = ghash_async_init, + .update = ghash_async_update, + .final = ghash_async_final, + .setkey = ghash_async_setkey, + .digest = ghash_async_digest, + .halg = { + .digestsize = GHASH_DIGEST_SIZE, + .base = { + .cra_name = "ghash", + .cra_driver_name = "ghash-clmulni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC, + .cra_blocksize = GHASH_BLOCK_SIZE, + .cra_type = &crypto_ahash_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ghash_async_alg.halg.base.cra_list), + .cra_init = ghash_async_init_tfm, + .cra_exit = ghash_async_exit_tfm, + }, + }, +}; + +static int __init ghash_pclmulqdqni_mod_init(void) +{ + int err; + + if (!cpu_has_pclmulqdq) { + printk(KERN_INFO "Intel PCLMULQDQ-NI instructions are not" + " detected.\n"); + return -ENODEV; + } + + err = crypto_register_shash(&ghash_alg); + if (err) + goto err_out; + err = crypto_register_ahash(&ghash_async_alg); + if (err) + goto err_shash; + + return 0; + +err_shash: + crypto_unregister_shash(&ghash_alg); +err_out: + return err; +} + +static void __exit ghash_pclmulqdqni_mod_exit(void) +{ + crypto_unregister_ahash(&ghash_async_alg); + crypto_unregister_shash(&ghash_alg); +} + +module_init(ghash_pclmulqdqni_mod_init); +module_exit(ghash_pclmulqdqni_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("GHASH Message Digest Algorithm, " + "acclerated by PCLMULQDQ-NI"); +MODULE_ALIAS("ghash"); diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 4eefdca9832..53147ad85b9 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -696,7 +696,7 @@ ia32_sys_call_table: .quad quiet_ni_syscall /* streams2 */ .quad stub32_vfork /* 190 */ .quad compat_sys_getrlimit - .quad sys32_mmap2 + .quad sys_mmap_pgoff .quad sys32_truncate64 .quad sys32_ftruncate64 .quad sys32_stat64 /* 195 */ diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index df82c0e48de..422572c7792 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -155,9 +155,6 @@ struct mmap_arg_struct { asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg) { struct mmap_arg_struct a; - struct file *file = NULL; - unsigned long retval; - struct mm_struct *mm ; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; @@ -165,22 +162,8 @@ asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg) if (a.offset & ~PAGE_MASK) return -EINVAL; - if (!(a.flags & MAP_ANONYMOUS)) { - file = fget(a.fd); - if (!file) - return -EBADF; - } - - mm = current->mm; - down_write(&mm->mmap_sem); - retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, + return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset>>PAGE_SHIFT); - if (file) - fput(file); - - up_write(&mm->mmap_sem); - - return retval; } asmlinkage long sys32_mprotect(unsigned long start, size_t len, @@ -483,30 +466,6 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd, return ret; } -asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - struct mm_struct *mm = current->mm; - unsigned long error; - struct file *file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - return -EBADF; - } - - down_write(&mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&mm->mmap_sem); - - if (file) - fput(file); - return error; -} - asmlinkage long sys32_olduname(struct oldold_utsname __user *name) { char *arch = "x86_64"; diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 4518dc50090..60d2b2db0bc 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -118,7 +118,7 @@ extern void acpi_restore_state_mem(void); extern unsigned long acpi_wakeup_address; /* early initialization routine */ -extern void acpi_reserve_bootmem(void); +extern void acpi_reserve_wakeup_memory(void); /* * Check if the CPU can handle C2 and deeper @@ -158,6 +158,7 @@ struct bootnode; #ifdef CONFIG_ACPI_NUMA extern int acpi_numa; +extern int acpi_get_nodes(struct bootnode *physnodes); extern int acpi_scan_nodes(unsigned long start, unsigned long end); #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) extern void acpi_fake_nodes(const struct bootnode *fake_nodes, diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h index 549860d3be8..2f9047cfaac 100644 --- a/arch/x86/include/asm/cache.h +++ b/arch/x86/include/asm/cache.h @@ -9,12 +9,13 @@ #define __read_mostly __attribute__((__section__(".data.read_mostly"))) +#define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT +#define INTERNODE_CACHE_BYTES (1 << INTERNODE_CACHE_SHIFT) + #ifdef CONFIG_X86_VSMP -/* vSMP Internode cacheline shift */ -#define INTERNODE_CACHE_SHIFT (12) #ifdef CONFIG_SMP #define __cacheline_aligned_in_smp \ - __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \ + __attribute__((__aligned__(INTERNODE_CACHE_BYTES))) \ __page_aligned_data #endif #endif diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 9076add593a..634c40a739a 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -177,6 +177,7 @@ void clflush_cache_range(void *addr, unsigned int size); #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void); extern const int rodata_test_data; +extern int kernel_set_to_readonly; void set_kernel_text_rw(void); void set_kernel_text_ro(void); #else diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 9cfc88b9774..613700f27a4 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -248,6 +248,7 @@ extern const char * const x86_power_flags[32]; #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) +#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) # define cpu_has_invlpg 1 diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index 9d6684849fd..278441f3985 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -12,9 +12,9 @@ #include <linux/types.h> /* - * FIXME: Acessing the desc_struct through its fields is more elegant, + * FIXME: Accessing the desc_struct through its fields is more elegant, * and should be the one valid thing to do. However, a lot of open code - * still touches the a and b acessors, and doing this allow us to do it + * still touches the a and b accessors, and doing this allow us to do it * incrementally. We keep the signature as a struct, rather than an union, * so we can get rid of it transparently in the future -- glommer */ diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 40b4e614fe7..761249e396f 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -61,6 +61,12 @@ struct e820map { struct e820entry map[E820_X_MAX]; }; +#define ISA_START_ADDRESS 0xa0000 +#define ISA_END_ADDRESS 0x100000 + +#define BIOS_BEGIN 0x000a0000 +#define BIOS_END 0x00100000 + #ifdef __KERNEL__ /* see comment in arch/x86/kernel/e820.c */ extern struct e820map e820; @@ -126,15 +132,18 @@ extern void e820_reserve_resources(void); extern void e820_reserve_resources_late(void); extern void setup_memory_map(void); extern char *default_machine_specific_memory_setup(void); -#endif /* __KERNEL__ */ -#endif /* __ASSEMBLY__ */ -#define ISA_START_ADDRESS 0xa0000 -#define ISA_END_ADDRESS 0x100000 -#define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS) +/* + * Returns true iff the specified range [s,e) is completely contained inside + * the ISA region. + */ +static inline bool is_ISA_range(u64 s, u64 e) +{ + return s >= ISA_START_ADDRESS && e <= ISA_END_ADDRESS; +} -#define BIOS_BEGIN 0x000a0000 -#define BIOS_END 0x00100000 +#endif /* __KERNEL__ */ +#endif /* __ASSEMBLY__ */ #ifdef __KERNEL__ #include <linux/ioport.h> diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 456a304b817..8a024babe5e 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -157,19 +157,6 @@ do { \ #define compat_elf_check_arch(x) elf_check_arch_ia32(x) -static inline void start_ia32_thread(struct pt_regs *regs, u32 ip, u32 sp) -{ - loadsegment(fs, 0); - loadsegment(ds, __USER32_DS); - loadsegment(es, __USER32_DS); - load_gs_index(0); - regs->ip = ip; - regs->sp = sp; - regs->flags = X86_EFLAGS_IF; - regs->cs = __USER32_CS; - regs->ss = __USER32_DS; -} - static inline void elf_common_init(struct thread_struct *t, struct pt_regs *regs, const u16 ds) { @@ -191,11 +178,8 @@ do { \ #define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ elf_common_init(¤t->thread, regs, __USER_DS) -#define compat_start_thread(regs, ip, sp) \ -do { \ - start_ia32_thread(regs, ip, sp); \ - set_fs(USER_DS); \ -} while (0) +void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp); +#define compat_start_thread start_thread_ia32 #define COMPAT_SET_PERSONALITY(ex) \ do { \ diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index f5693c81a1d..8e8ec663a98 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -34,7 +34,7 @@ BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7, smp_invalidate_interrupt) #endif -BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR) +BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) /* * every pentium local APIC has two 'local interrupts', with a diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 108eb6fd1ae..0f8576427cf 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -12,7 +12,7 @@ typedef struct { unsigned int apic_timer_irqs; /* arch dependent */ unsigned int irq_spurious_count; #endif - unsigned int generic_irqs; /* arch dependent */ + unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; unsigned int apic_pending_irqs; #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 1c22cb05ad6..5d89fd2a369 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -65,11 +65,12 @@ /* hpet memory map physical address */ extern unsigned long hpet_address; extern unsigned long force_hpet_address; +extern u8 hpet_blockid; extern int hpet_force_user; extern int is_hpet_enabled(void); extern int hpet_enable(void); extern void hpet_disable(void); -extern unsigned long hpet_readl(unsigned long a); +extern unsigned int hpet_readl(unsigned int a); extern void force_hpet_resume(void); extern void hpet_msi_unmask(unsigned int irq); @@ -78,9 +79,9 @@ extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg); extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg); #ifdef CONFIG_PCI_MSI -extern int arch_setup_hpet_msi(unsigned int irq); +extern int arch_setup_hpet_msi(unsigned int irq, unsigned int id); #else -static inline int arch_setup_hpet_msi(unsigned int irq) +static inline int arch_setup_hpet_msi(unsigned int irq, unsigned int id) { return -EINVAL; } diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 6e124269fd4..08c48a81841 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -27,7 +27,7 @@ /* Interrupt handlers registered during init_IRQ */ extern void apic_timer_interrupt(void); -extern void generic_interrupt(void); +extern void x86_platform_ipi(void); extern void error_interrupt(void); extern void perf_pending_interrupt(void); @@ -119,7 +119,7 @@ extern void eisa_set_level_irq(unsigned int irq); /* SMP */ extern void smp_apic_timer_interrupt(struct pt_regs *); extern void smp_spurious_interrupt(struct pt_regs *); -extern void smp_generic_interrupt(struct pt_regs *); +extern void smp_x86_platform_ipi(struct pt_regs *); extern void smp_error_interrupt(struct pt_regs *); #ifdef CONFIG_X86_IO_APIC extern asmlinkage void smp_irq_move_cleanup_interrupt(void); diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 0b20bbb758f..ebfb8a9e11f 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -10,6 +10,8 @@ #ifndef _ASM_X86_I387_H #define _ASM_X86_I387_H +#ifndef __ASSEMBLY__ + #include <linux/sched.h> #include <linux/kernel_stat.h> #include <linux/regset.h> @@ -411,4 +413,9 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) } } +#endif /* __ASSEMBLY__ */ + +#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 +#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5 + #endif /* _ASM_X86_I387_H */ diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h new file mode 100644 index 00000000000..14cf526091f --- /dev/null +++ b/arch/x86/include/asm/inst.h @@ -0,0 +1,150 @@ +/* + * Generate .byte code for some instructions not supported by old + * binutils. + */ +#ifndef X86_ASM_INST_H +#define X86_ASM_INST_H + +#ifdef __ASSEMBLY__ + + .macro XMM_NUM opd xmm + .ifc \xmm,%xmm0 + \opd = 0 + .endif + .ifc \xmm,%xmm1 + \opd = 1 + .endif + .ifc \xmm,%xmm2 + \opd = 2 + .endif + .ifc \xmm,%xmm3 + \opd = 3 + .endif + .ifc \xmm,%xmm4 + \opd = 4 + .endif + .ifc \xmm,%xmm5 + \opd = 5 + .endif + .ifc \xmm,%xmm6 + \opd = 6 + .endif + .ifc \xmm,%xmm7 + \opd = 7 + .endif + .ifc \xmm,%xmm8 + \opd = 8 + .endif + .ifc \xmm,%xmm9 + \opd = 9 + .endif + .ifc \xmm,%xmm10 + \opd = 10 + .endif + .ifc \xmm,%xmm11 + \opd = 11 + .endif + .ifc \xmm,%xmm12 + \opd = 12 + .endif + .ifc \xmm,%xmm13 + \opd = 13 + .endif + .ifc \xmm,%xmm14 + \opd = 14 + .endif + .ifc \xmm,%xmm15 + \opd = 15 + .endif + .endm + + .macro PFX_OPD_SIZE + .byte 0x66 + .endm + + .macro PFX_REX opd1 opd2 + .if (\opd1 | \opd2) & 8 + .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) + .endif + .endm + + .macro MODRM mod opd1 opd2 + .byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3) + .endm + + .macro PSHUFB_XMM xmm1 xmm2 + XMM_NUM pshufb_opd1 \xmm1 + XMM_NUM pshufb_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX pshufb_opd1 pshufb_opd2 + .byte 0x0f, 0x38, 0x00 + MODRM 0xc0 pshufb_opd1 pshufb_opd2 + .endm + + .macro PCLMULQDQ imm8 xmm1 xmm2 + XMM_NUM clmul_opd1 \xmm1 + XMM_NUM clmul_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX clmul_opd1 clmul_opd2 + .byte 0x0f, 0x3a, 0x44 + MODRM 0xc0 clmul_opd1 clmul_opd2 + .byte \imm8 + .endm + + .macro AESKEYGENASSIST rcon xmm1 xmm2 + XMM_NUM aeskeygen_opd1 \xmm1 + XMM_NUM aeskeygen_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aeskeygen_opd1 aeskeygen_opd2 + .byte 0x0f, 0x3a, 0xdf + MODRM 0xc0 aeskeygen_opd1 aeskeygen_opd2 + .byte \rcon + .endm + + .macro AESIMC xmm1 xmm2 + XMM_NUM aesimc_opd1 \xmm1 + XMM_NUM aesimc_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesimc_opd1 aesimc_opd2 + .byte 0x0f, 0x38, 0xdb + MODRM 0xc0 aesimc_opd1 aesimc_opd2 + .endm + + .macro AESENC xmm1 xmm2 + XMM_NUM aesenc_opd1 \xmm1 + XMM_NUM aesenc_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesenc_opd1 aesenc_opd2 + .byte 0x0f, 0x38, 0xdc + MODRM 0xc0 aesenc_opd1 aesenc_opd2 + .endm + + .macro AESENCLAST xmm1 xmm2 + XMM_NUM aesenclast_opd1 \xmm1 + XMM_NUM aesenclast_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesenclast_opd1 aesenclast_opd2 + .byte 0x0f, 0x38, 0xdd + MODRM 0xc0 aesenclast_opd1 aesenclast_opd2 + .endm + + .macro AESDEC xmm1 xmm2 + XMM_NUM aesdec_opd1 \xmm1 + XMM_NUM aesdec_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesdec_opd1 aesdec_opd2 + .byte 0x0f, 0x38, 0xde + MODRM 0xc0 aesdec_opd1 aesdec_opd2 + .endm + + .macro AESDECLAST xmm1 xmm2 + XMM_NUM aesdeclast_opd1 \xmm1 + XMM_NUM aesdeclast_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesdeclast_opd1 aesdeclast_opd2 + .byte 0x0f, 0x38, 0xdf + MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2 + .endm +#endif + +#endif diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index ffd700ff5dc..5458380b6ef 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -37,7 +37,7 @@ extern void fixup_irqs(void); extern void irq_force_complete_move(int); #endif -extern void (*generic_interrupt_extension)(void); +extern void (*x86_platform_ipi_callback)(void); extern void native_init_IRQ(void); extern bool handle_irq(unsigned irq, struct pt_regs *regs); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 5b21f0ec3df..6a635bd3986 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -106,7 +106,7 @@ /* * Generic system vector for platform specific use */ -#define GENERIC_INTERRUPT_VECTOR 0xed +#define X86_PLATFORM_IPI_VECTOR 0xed /* * Performance monitoring pending work vector: diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h index c2d1f3b58e5..f70e60071fe 100644 --- a/arch/x86/include/asm/k8.h +++ b/arch/x86/include/asm/k8.h @@ -4,13 +4,16 @@ #include <linux/pci.h> extern struct pci_device_id k8_nb_ids[]; +struct bootnode; extern int early_is_k8_nb(u32 value); extern struct pci_dev **k8_northbridges; extern int num_k8_northbridges; extern int cache_k8_northbridges(void); extern void k8_flush_garts(void); -extern int k8_scan_nodes(unsigned long start, unsigned long end); +extern int k8_get_nodes(struct bootnode *nodes); +extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn); +extern int k8_scan_nodes(void); #ifdef CONFIG_K8_NB static inline struct pci_dev *node_to_k8_nb_misc(int node) diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index ef51b501e22..c24ca9a5645 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -12,6 +12,8 @@ struct device; enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; struct microcode_ops { + void (*init)(struct device *device); + void (*fini)(void); enum ucode_state (*request_microcode_user) (int cpu, const void __user *buf, size_t size); diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index ede6998bd92..91df7c51806 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -47,7 +47,7 @@ static inline void resume_map_numa_kva(pgd_t *pgd) {} /* * generic node memory support, the following assumptions apply: * - * 1) memory comes in 64Mb contigious chunks which are either present or not + * 1) memory comes in 64Mb contiguous chunks which are either present or not * 2) we will not have more than 64Gb in total * * for now assume that 64Gb is max amount of RAM for whole system diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 61d90b1331c..d8bf23a88d0 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -71,12 +71,7 @@ static inline void early_get_smp_config(void) static inline void find_smp_config(void) { - x86_init.mpparse.find_smp_config(1); -} - -static inline void early_find_smp_config(void) -{ - x86_init.mpparse.find_smp_config(0); + x86_init.mpparse.find_smp_config(); } #ifdef CONFIG_X86_MPPARSE @@ -89,7 +84,7 @@ extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str); # else # define default_mpc_oem_bus_info NULL # endif -extern void default_find_smp_config(unsigned int reserve); +extern void default_find_smp_config(void); extern void default_get_smp_config(unsigned int early); #else static inline void early_reserve_e820_mpc_new(void) { } @@ -97,7 +92,7 @@ static inline void early_reserve_e820_mpc_new(void) { } #define default_mpc_apic_id NULL #define default_smp_read_mpc_oem NULL #define default_mpc_oem_bus_info NULL -#define default_find_smp_config x86_init_uint_noop +#define default_find_smp_config x86_init_noop #define default_get_smp_config x86_init_uint_noop #endif diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 6473f5ccff8..642fe34b36a 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -49,7 +49,8 @@ extern unsigned long max_pfn_mapped; extern unsigned long init_memory_mapping(unsigned long start, unsigned long end); -extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn); +extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn, + int acpi, int k8); extern void free_initmem(void); #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index b399988eee3..b4bf9a942ed 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -118,11 +118,27 @@ extern int __init pcibios_init(void); /* pci-mmconfig.c */ +/* "PCI MMCONFIG %04x [bus %02x-%02x]" */ +#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2) + +struct pci_mmcfg_region { + struct list_head list; + struct resource res; + u64 address; + char __iomem *virt; + u16 segment; + u8 start_bus; + u8 end_bus; + char name[PCI_MMCFG_RESOURCE_NAME_LEN]; +}; + extern int __init pci_mmcfg_arch_init(void); extern void __init pci_mmcfg_arch_free(void); +extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus); + +extern struct list_head pci_mmcfg_list; -extern struct acpi_mcfg_allocation *pci_mmcfg_config; -extern int pci_mmcfg_config_num; +#define PCI_MMCFG_BUS_OFFSET(bus) ((bus) << 20) /* * AMD Fam10h CPUs are buggy, and cannot access MMIO config space diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index af6fd360ab3..a34c785c5a6 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -16,6 +16,8 @@ #ifndef __ASSEMBLY__ +#include <asm/x86_init.h> + /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. @@ -270,9 +272,9 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, unsigned long new_flags) { /* - * PAT type is always WB for ISA. So no need to check. + * PAT type is always WB for untracked ranges, so no need to check. */ - if (is_ISA_range(paddr, paddr + size - 1)) + if (x86_platform.is_untracked_pat_range(paddr, paddr + size)) return 1; /* diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 621f56d7312..4009f6534f5 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -5,18 +5,19 @@ /* misc architecture specific prototypes */ -extern void early_idt_handler(void); +void early_idt_handler(void); -extern void system_call(void); -extern void syscall_init(void); +void system_call(void); +void syscall_init(void); -extern void ia32_syscall(void); -extern void ia32_cstar_target(void); -extern void ia32_sysenter_target(void); +void ia32_syscall(void); +void ia32_cstar_target(void); +void ia32_sysenter_target(void); -extern void syscall32_cpu_init(void); +void syscall32_cpu_init(void); -extern void check_efer(void); +void x86_configure_nx(void); +void x86_report_nx(void); extern int reboot_force; diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index 1b7ee5d673c..0a524242865 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -2,7 +2,13 @@ #define _ASM_X86_SECTIONS_H #include <asm-generic/sections.h> +#include <asm/uaccess.h> extern char __brk_base[], __brk_limit[]; +extern struct exception_table_entry __stop___ex_table[]; + +#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +extern char __end_rodata_hpage_align[]; +#endif #endif /* _ASM_X86_SECTIONS_H */ diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h index 72e5a449166..04459d25e66 100644 --- a/arch/x86/include/asm/sigcontext.h +++ b/arch/x86/include/asm/sigcontext.h @@ -124,7 +124,7 @@ struct sigcontext { * fpstate is really (struct _fpstate *) or (struct _xstate *) * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end - * of extended memory layout. See comments at the defintion of + * of extended memory layout. See comments at the definition of * (struct _fpx_sw_bytes) */ void __user *fpstate; /* zero when no FPU/extended context */ @@ -219,7 +219,7 @@ struct sigcontext { * fpstate is really (struct _fpstate *) or (struct _xstate *) * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end - * of extended memory layout. See comments at the defintion of + * of extended memory layout. See comments at the definition of * (struct _fpx_sw_bytes) */ void __user *fpstate; /* zero when no FPU/extended context */ diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index 9af9decb38c..4a5a089e1c6 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -57,9 +57,6 @@ asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32); asmlinkage long sys32_personality(unsigned long); asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); -asmlinkage long sys32_mmap2(unsigned long, unsigned long, unsigned long, - unsigned long, unsigned long, unsigned long); - struct oldold_utsname; struct old_utsname; asmlinkage long sys32_olduname(struct oldold_utsname __user *); diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 372b76edd63..1bb6e395881 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -55,8 +55,6 @@ struct sel_arg_struct; struct oldold_utsname; struct old_utsname; -asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, - unsigned long, unsigned long, unsigned long); asmlinkage int old_mmap(struct mmap_arg_struct __user *); asmlinkage int old_select(struct sel_arg_struct __user *); asmlinkage int sys_ipc(uint, int, int, int, void __user *, long); diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 80e2984f521..b414d2b401f 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -55,7 +55,7 @@ #define DESC_STATUS_SOURCE_TIMEOUT 3 /* - * source side threshholds at which message retries print a warning + * source side thresholds at which message retries print a warning */ #define SOURCE_TIMEOUT_LIMIT 20 #define DESTINATION_TIMEOUT_LIMIT 20 diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index d8e71459f02..ea0e8ea15e1 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -26,7 +26,7 @@ struct x86_init_mpparse { void (*smp_read_mpc_oem)(struct mpc_table *mpc); void (*mpc_oem_pci_bus)(struct mpc_bus *m); void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); - void (*find_smp_config)(unsigned int reserve); + void (*find_smp_config)(void); void (*get_smp_config)(unsigned int early); }; @@ -125,12 +125,14 @@ struct x86_cpuinit_ops { * @calibrate_tsc: calibrate TSC * @get_wallclock: get time from HW clock like RTC etc. * @set_wallclock: set time back to HW clock + * @is_untracked_pat_range exclude from PAT logic */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); unsigned long (*get_wallclock)(void); int (*set_wallclock)(unsigned long nowtime); void (*iommu_shutdown)(void); + bool (*is_untracked_pat_range)(u64 start, u64 end); }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index d5b7e90c0ed..396ff4cc8ed 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -37,31 +37,4 @@ extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; -enum xen_domain_type { - XEN_NATIVE, /* running on bare hardware */ - XEN_PV_DOMAIN, /* running in a PV domain */ - XEN_HVM_DOMAIN, /* running in a Xen hvm domain */ -}; - -#ifdef CONFIG_XEN -extern enum xen_domain_type xen_domain_type; -#else -#define xen_domain_type XEN_NATIVE -#endif - -#define xen_domain() (xen_domain_type != XEN_NATIVE) -#define xen_pv_domain() (xen_domain() && \ - xen_domain_type == XEN_PV_DOMAIN) -#define xen_hvm_domain() (xen_domain() && \ - xen_domain_type == XEN_HVM_DOMAIN) - -#ifdef CONFIG_XEN_DOM0 -#include <xen/interface/xen.h> - -#define xen_initial_domain() (xen_pv_domain() && \ - xen_start_info->flags & SIF_INITDOMAIN) -#else /* !CONFIG_XEN_DOM0 */ -#define xen_initial_domain() (0) -#endif /* CONFIG_XEN_DOM0 */ - #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 67e929b8987..fb1035cd9a6 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -624,6 +624,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) } hpet_address = hpet_tbl->address.address; + hpet_blockid = hpet_tbl->sequence; /* * Some broken BIOSes advertise HPET at 0x0. We really do not @@ -1122,7 +1123,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) if (!acpi_sci_override_gsi) acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0); - /* Fill in identity legacy mapings where no override */ + /* Fill in identity legacy mappings where no override */ mp_config_acpi_legacy_irqs(); count = diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index ca93638ba43..82e508677b9 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -78,12 +78,9 @@ int acpi_save_state_mem(void) #ifndef CONFIG_64BIT store_gdt((struct desc_ptr *)&header->pmode_gdt); - header->pmode_efer_low = nx_enabled; - if (header->pmode_efer_low & 1) { - /* This is strange, why not save efer, always? */ - rdmsr(MSR_EFER, header->pmode_efer_low, - header->pmode_efer_high); - } + if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low, + &header->pmode_efer_high)) + header->pmode_efer_low = header->pmode_efer_high = 0; #endif /* !CONFIG_64BIT */ header->pmode_cr0 = read_cr0(); @@ -119,29 +116,32 @@ void acpi_restore_state_mem(void) /** - * acpi_reserve_bootmem - do _very_ early ACPI initialisation + * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation * * We allocate a page from the first 1MB of memory for the wakeup * routine for when we come back from a sleep state. The * runtime allocator allows specification of <16MB pages, but not * <1MB pages. */ -void __init acpi_reserve_bootmem(void) +void __init acpi_reserve_wakeup_memory(void) { + unsigned long mem; + if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) { printk(KERN_ERR "ACPI: Wakeup code way too big, S3 disabled.\n"); return; } - acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE); + mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE); - if (!acpi_realmode) { + if (mem == -1L) { printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); return; } - - acpi_wakeup_address = virt_to_phys((void *)acpi_realmode); + acpi_realmode = (unsigned long) phys_to_virt(mem); + acpi_wakeup_address = mem; + reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP"); } diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 32fb09102a1..1c0fb4d4ad5 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1783,7 +1783,7 @@ retry: goto out; /* - * aperture was sucessfully enlarged by 128 MB, try + * aperture was successfully enlarged by 128 MB, try * allocation again */ goto retry; @@ -2490,7 +2490,7 @@ int __init amd_iommu_init_passthrough(void) struct pci_dev *dev = NULL; u16 devid; - /* allocate passthroug domain */ + /* allocate passthrough domain */ pt_domain = protection_domain_alloc(); if (!pt_domain) return -ENOMEM; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 7ffc3996523..9c4a6f74755 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1336,6 +1336,9 @@ void __init amd_iommu_detect(void) iommu_detected = 1; amd_iommu_detected = 1; x86_init.iommu.iommu_init = amd_iommu_init; + + /* Make sure ACS will be enabled */ + pci_request_acs(); } } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index ad8c75b9e45..efb2b9cd132 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -647,7 +647,7 @@ static int __init calibrate_APIC_clock(void) calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); - apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); + apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult); apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", calibration_result); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c0b4468683f..d5d498fbee4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3267,7 +3267,8 @@ void destroy_irq(unsigned int irq) * MSI message composition */ #ifdef CONFIG_PCI_MSI -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, + struct msi_msg *msg, u8 hpet_id) { struct irq_cfg *cfg; int err; @@ -3301,7 +3302,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms irte.dest_id = IRTE_DEST(dest); /* Set source-id of interrupt request */ - set_msi_sid(&irte, pdev); + if (pdev) + set_msi_sid(&irte, pdev); + else + set_hpet_sid(&irte, hpet_id); modify_irte(irq, &irte); @@ -3466,7 +3470,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) int ret; struct msi_msg msg; - ret = msi_compose_msg(dev, irq, &msg); + ret = msi_compose_msg(dev, irq, &msg, -1); if (ret < 0) return ret; @@ -3599,7 +3603,7 @@ int arch_setup_dmar_msi(unsigned int irq) int ret; struct msi_msg msg; - ret = msi_compose_msg(NULL, irq, &msg); + ret = msi_compose_msg(NULL, irq, &msg, -1); if (ret < 0) return ret; dmar_msi_write(irq, &msg); @@ -3639,6 +3643,19 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) #endif /* CONFIG_SMP */ +static struct irq_chip ir_hpet_msi_type = { + .name = "IR-HPET_MSI", + .unmask = hpet_msi_unmask, + .mask = hpet_msi_mask, +#ifdef CONFIG_INTR_REMAP + .ack = ir_ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = ir_set_msi_irq_affinity, +#endif +#endif + .retrigger = ioapic_retrigger_irq, +}; + static struct irq_chip hpet_msi_type = { .name = "HPET_MSI", .unmask = hpet_msi_unmask, @@ -3650,20 +3667,36 @@ static struct irq_chip hpet_msi_type = { .retrigger = ioapic_retrigger_irq, }; -int arch_setup_hpet_msi(unsigned int irq) +int arch_setup_hpet_msi(unsigned int irq, unsigned int id) { int ret; struct msi_msg msg; struct irq_desc *desc = irq_to_desc(irq); - ret = msi_compose_msg(NULL, irq, &msg); + if (intr_remapping_enabled) { + struct intel_iommu *iommu = map_hpet_to_ir(id); + int index; + + if (!iommu) + return -1; + + index = alloc_irte(iommu, irq, 1); + if (index < 0) + return -1; + } + + ret = msi_compose_msg(NULL, irq, &msg, id); if (ret < 0) return ret; hpet_msi_write(irq, &msg); desc->status |= IRQ_MOVE_PCNTXT; - set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, - "edge"); + if (irq_remapped(irq)) + set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, + handle_edge_irq, "edge"); + else + set_irq_chip_and_handler_name(irq, &hpet_msi_type, + handle_edge_irq, "edge"); return 0; } diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 07cdbdcd7a9..98c4665f251 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -264,11 +264,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc) static __init void early_check_numaq(void) { /* - * Find possible boot-time SMP configuration: - */ - early_find_smp_config(); - - /* * get boot-time SMP configuration: */ if (smp_found_config) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 130c4b93487..b684bb303cb 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -30,10 +30,22 @@ #include <asm/apic.h> #include <asm/ipi.h> #include <asm/smp.h> +#include <asm/x86_init.h> DEFINE_PER_CPU(int, x2apic_extra_bits); static enum uv_system_type uv_system_type; +static u64 gru_start_paddr, gru_end_paddr; + +static inline bool is_GRU_range(u64 start, u64 end) +{ + return start >= gru_start_paddr && end <= gru_end_paddr; +} + +static bool uv_is_untracked_pat_range(u64 start, u64 end) +{ + return is_ISA_range(start, end) || is_GRU_range(start, end); +} static int early_get_nodeid(void) { @@ -49,6 +61,7 @@ static int early_get_nodeid(void) static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { if (!strcmp(oem_id, "SGI")) { + x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; if (!strcmp(oem_table_id, "UVL")) uv_system_type = UV_LEGACY_APIC; else if (!strcmp(oem_table_id, "UVX")) @@ -385,8 +398,12 @@ static __init void map_gru_high(int max_pnode) int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); - if (gru.s.enable) + if (gru.s.enable) { map_high("GRU", gru.s.base, shift, max_pnode, map_wb); + gru_start_paddr = ((u64)gru.s.base << shift); + gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1); + + } } static __init void map_mmr_high(int max_pnode) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a4ec8b64754..c1afa990a6c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1136,7 +1136,7 @@ void __cpuinit cpu_init(void) wrmsrl(MSR_KERNEL_GS_BASE, 0); barrier(); - check_efer(); + x86_configure_nx(); if (cpu != 0) enable_x2apic(); diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 8b581d3905c..d2e7c77c1ea 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -764,14 +764,15 @@ static struct freq_attr *acpi_cpufreq_attr[] = { }; static struct cpufreq_driver acpi_cpufreq_driver = { - .verify = acpi_cpufreq_verify, - .target = acpi_cpufreq_target, - .init = acpi_cpufreq_cpu_init, - .exit = acpi_cpufreq_cpu_exit, - .resume = acpi_cpufreq_resume, - .name = "acpi-cpufreq", - .owner = THIS_MODULE, - .attr = acpi_cpufreq_attr, + .verify = acpi_cpufreq_verify, + .target = acpi_cpufreq_target, + .bios_limit = acpi_processor_get_bios_limit, + .init = acpi_cpufreq_cpu_init, + .exit = acpi_cpufreq_cpu_exit, + .resume = acpi_cpufreq_resume, + .name = "acpi-cpufreq", + .owner = THIS_MODULE, + .attr = acpi_cpufreq_attr, }; static int __init acpi_cpufreq_init(void) diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index cabd2fa3fc9..7e7eea4f826 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -885,7 +885,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) /* Find ACPI data for processor */ acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, - ACPI_UINT32_MAX, &longhaul_walk_callback, + ACPI_UINT32_MAX, &longhaul_walk_callback, NULL, NULL, (void *)&pr); /* Check ACPI support for C3 state */ diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c index f10dea409f4..cb01dac267d 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c @@ -164,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy) } /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + policy->cpuinfo.transition_latency = 200000; policy->cur = busfreq * max_multiplier; result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index d47c775eb0a..9a97116f89e 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -714,14 +714,17 @@ static struct freq_attr *powernow_table_attr[] = { }; static struct cpufreq_driver powernow_driver = { - .verify = powernow_verify, - .target = powernow_target, - .get = powernow_get, - .init = powernow_cpu_init, - .exit = powernow_cpu_exit, - .name = "powernow-k7", - .owner = THIS_MODULE, - .attr = powernow_table_attr, + .verify = powernow_verify, + .target = powernow_target, + .get = powernow_get, +#ifdef CONFIG_X86_POWERNOW_K7_ACPI + .bios_limit = acpi_processor_get_bios_limit, +#endif + .init = powernow_cpu_init, + .exit = powernow_cpu_exit, + .name = "powernow-k7", + .owner = THIS_MODULE, + .attr = powernow_table_attr, }; static int __init powernow_init(void) diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 3f12dabeab5..a9df9441a9a 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1118,7 +1118,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) { - cpumask_t oldmask; + cpumask_var_t oldmask; struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); u32 checkfid; u32 checkvid; @@ -1131,9 +1131,13 @@ static int powernowk8_target(struct cpufreq_policy *pol, checkfid = data->currfid; checkvid = data->currvid; - /* only run on specific CPU from here on */ - oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); + /* only run on specific CPU from here on. */ + /* This is poor form: use a workqueue or smp_call_function_single */ + if (!alloc_cpumask_var(&oldmask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_copy(oldmask, tsk_cpumask(current)); + set_cpus_allowed_ptr(current, cpumask_of(pol->cpu)); if (smp_processor_id() != pol->cpu) { printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); @@ -1193,7 +1197,8 @@ static int powernowk8_target(struct cpufreq_policy *pol, ret = 0; err_out: - set_cpus_allowed_ptr(current, &oldmask); + set_cpus_allowed_ptr(current, oldmask); + free_cpumask_var(oldmask); return ret; } @@ -1393,14 +1398,15 @@ static struct freq_attr *powernow_k8_attr[] = { }; static struct cpufreq_driver cpufreq_amd64_driver = { - .verify = powernowk8_verify, - .target = powernowk8_target, - .init = powernowk8_cpu_init, - .exit = __devexit_p(powernowk8_cpu_exit), - .get = powernowk8_get, - .name = "powernow-k8", - .owner = THIS_MODULE, - .attr = powernow_k8_attr, + .verify = powernowk8_verify, + .target = powernowk8_target, + .bios_limit = acpi_processor_get_bios_limit, + .init = powernowk8_cpu_init, + .exit = __devexit_p(powernowk8_cpu_exit), + .get = powernowk8_get, + .name = "powernow-k8", + .owner = THIS_MODULE, + .attr = powernow_k8_attr, }; /* driver entry point for init */ diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 3ae5a7a3a50..2ce8e0b5cc5 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -39,7 +39,7 @@ static struct pci_dev *speedstep_chipset_dev; /* speedstep_processor */ -static unsigned int speedstep_processor; +static enum speedstep_processor speedstep_processor; static u32 pmbase; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c index f4c290b8482..ad0083abfa2 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c @@ -34,7 +34,7 @@ static int relaxed_check; * GET PROCESSOR CORE SPEED IN KHZ * *********************************************************************/ -static unsigned int pentium3_get_frequency(unsigned int processor) +static unsigned int pentium3_get_frequency(enum speedstep_processor processor) { /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */ struct { @@ -227,7 +227,7 @@ static unsigned int pentium4_get_frequency(void) /* Warning: may get called from smp_call_function_single. */ -unsigned int speedstep_get_frequency(unsigned int processor) +unsigned int speedstep_get_frequency(enum speedstep_processor processor) { switch (processor) { case SPEEDSTEP_CPU_PCORE: @@ -380,7 +380,7 @@ EXPORT_SYMBOL_GPL(speedstep_detect_processor); * DETECT SPEEDSTEP SPEEDS * *********************************************************************/ -unsigned int speedstep_get_freqs(unsigned int processor, +unsigned int speedstep_get_freqs(enum speedstep_processor processor, unsigned int *low_speed, unsigned int *high_speed, unsigned int *transition_latency, diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h index 2b6c04e5a30..70d9cea1219 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h @@ -11,18 +11,18 @@ /* processors */ - -#define SPEEDSTEP_CPU_PIII_C_EARLY 0x00000001 /* Coppermine core */ -#define SPEEDSTEP_CPU_PIII_C 0x00000002 /* Coppermine core */ -#define SPEEDSTEP_CPU_PIII_T 0x00000003 /* Tualatin core */ -#define SPEEDSTEP_CPU_P4M 0x00000004 /* P4-M */ - +enum speedstep_processor { + SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */ + SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */ + SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */ + SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */ /* the following processors are not speedstep-capable and are not auto-detected * in speedstep_detect_processor(). However, their speed can be detected using * the speedstep_get_frequency() call. */ -#define SPEEDSTEP_CPU_PM 0xFFFFFF03 /* Pentium M */ -#define SPEEDSTEP_CPU_P4D 0xFFFFFF04 /* desktop P4 */ -#define SPEEDSTEP_CPU_PCORE 0xFFFFFF05 /* Core */ + SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */ + SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */ + SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */ +}; /* speedstep states -- only two of them */ @@ -31,10 +31,10 @@ /* detect a speedstep-capable processor */ -extern unsigned int speedstep_detect_processor (void); +extern enum speedstep_processor speedstep_detect_processor(void); /* detect the current speed (in khz) of the processor */ -extern unsigned int speedstep_get_frequency(unsigned int processor); +extern unsigned int speedstep_get_frequency(enum speedstep_processor processor); /* detect the low and high speeds of the processor. The callback @@ -42,7 +42,7 @@ extern unsigned int speedstep_get_frequency(unsigned int processor); * SPEEDSTEP_LOW; the second argument is zero so that no * cpufreq_notify_transition calls are initiated. */ -extern unsigned int speedstep_get_freqs(unsigned int processor, +extern unsigned int speedstep_get_freqs(enum speedstep_processor processor, unsigned int *low_speed, unsigned int *high_speed, unsigned int *transition_latency, diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c index befea088e4f..04d73c114e4 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c @@ -35,7 +35,7 @@ static int smi_cmd; static unsigned int smi_sig; /* info about the processor */ -static unsigned int speedstep_processor; +static enum speedstep_processor speedstep_processor; /* * There are only two frequency states for each processor. Values diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 40e1835b35e..c900b73f922 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -263,8 +263,12 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) /* Don't do the funky fallback heuristics the AMD version employs for now. */ node = apicid_to_node[apicid]; - if (node == NUMA_NO_NODE || !node_online(node)) + if (node == NUMA_NO_NODE) node = first_node(node_online_map); + else if (!node_online(node)) { + /* reuse the value from init_cpu_to_node() */ + node = cpu_to_node(cpu); + } numa_set_node(cpu, node); printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 0df4c2b7107..6c40f6b5b34 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -94,7 +94,7 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */ { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */ { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */ - { 0xd7, LVL_3, 2038 }, /* 8-way set assoc, 64 byte line size */ + { 0xd7, LVL_3, 2048 }, /* 8-way set assoc, 64 byte line size */ { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */ { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ @@ -102,6 +102,9 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */ { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ + { 0xea, LVL_3, 12288 }, /* 24-way set assoc, 64 byte line size */ + { 0xeb, LVL_3, 18432 }, /* 24-way set assoc, 64 byte line size */ + { 0xec, LVL_3, 24576 }, /* 24-way set assoc, 64 byte line size */ { 0x00, 0, 0} }; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 0bcaa387586..d7ebf25d10e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2016,9 +2016,11 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - t->expires = round_jiffies(jiffies + + if (!mce_ignore_ce && check_interval) { + t->expires = round_jiffies(jiffies + __get_cpu_var(mce_next_interval)); - add_timer_on(t, cpu); + add_timer_on(t, cpu); + } smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); break; case CPU_POST_DEAD: diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 73c86db5acb..09b1698e046 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -170,6 +170,41 @@ static int __init cmp_range(const void *x1, const void *x2) return start1 - start2; } +static int __init clean_sort_range(struct res_range *range, int az) +{ + int i, j, k = az - 1, nr_range = 0; + + for (i = 0; i < k; i++) { + if (range[i].end) + continue; + for (j = k; j > i; j--) { + if (range[j].end) { + k = j; + break; + } + } + if (j == i) + break; + range[i].start = range[k].start; + range[i].end = range[k].end; + range[k].start = 0; + range[k].end = 0; + k--; + } + /* count it */ + for (i = 0; i < az; i++) { + if (!range[i].end) { + nr_range = i; + break; + } + } + + /* sort them */ + sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + + return nr_range; +} + #define BIOS_BUG_MSG KERN_WARNING \ "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" @@ -223,22 +258,18 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, subtract_range(range, extra_remove_base, extra_remove_base + extra_remove_size - 1); - /* get new range num */ - nr_range = 0; - for (i = 0; i < RANGE_NUM; i++) { - if (!range[i].end) - continue; - nr_range++; - } if (debug_print) { printk(KERN_DEBUG "After UC checking\n"); - for (i = 0; i < nr_range; i++) + for (i = 0; i < RANGE_NUM; i++) { + if (!range[i].end) + continue; printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1); + } } /* sort the ranges */ - sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + nr_range = clean_sort_range(range, RANGE_NUM); if (debug_print) { printk(KERN_DEBUG "After sorting\n"); for (i = 0; i < nr_range; i++) @@ -689,8 +720,6 @@ static int __init mtrr_need_cleanup(void) continue; if (!size) type = MTRR_NUM_TYPES; - if (type == MTRR_TYPE_WRPROT) - type = MTRR_TYPE_UNCACHABLE; num[type]++; } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c1bbed1021d..ab1a8a89b98 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1286,7 +1286,7 @@ x86_perf_event_set_period(struct perf_event *event, return 0; /* - * If we are way outside a reasoable range then just skip forward: + * If we are way outside a reasonable range then just skip forward: */ if (unlikely(left <= -period)) { left = period; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 4deb8fc849d..63bca794c8f 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -977,8 +977,8 @@ apicinterrupt UV_BAU_MESSAGE \ #endif apicinterrupt LOCAL_TIMER_VECTOR \ apic_timer_interrupt smp_apic_timer_interrupt -apicinterrupt GENERIC_INTERRUPT_VECTOR \ - generic_interrupt smp_generic_interrupt +apicinterrupt X86_PLATFORM_IPI_VECTOR \ + x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_SMP apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 5a1b9758fd6..30968924543 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -189,9 +189,26 @@ static void wait_for_nmi(void) nmi_wait_count++; } +static inline int +within(unsigned long addr, unsigned long start, unsigned long end) +{ + return addr >= start && addr < end; +} + static int do_ftrace_mod_code(unsigned long ip, void *new_code) { + /* + * On x86_64, kernel text mappings are mapped read-only with + * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead + * of the kernel text mapping to modify the kernel text. + * + * For 32bit kernels, these mappings are same and we can use + * kernel identity mapping to modify code. + */ + if (within(ip, (unsigned long)_text, (unsigned long)_etext)) + ip = (unsigned long)__va(__pa(ip)); + mod_code_ip = (void *)ip; mod_code_newcode = new_code; diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 050c278481b..7fd318bac59 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -18,6 +18,8 @@ #include <asm/asm-offsets.h> #include <asm/setup.h> #include <asm/processor-flags.h> +#include <asm/msr-index.h> +#include <asm/cpufeature.h> #include <asm/percpu.h> /* Physical address */ @@ -297,25 +299,27 @@ ENTRY(startup_32_smp) orl %edx,%eax movl %eax,%cr4 - btl $5, %eax # check if PAE is enabled - jnc 6f + testb $X86_CR4_PAE, %al # check if PAE is enabled + jz 6f /* Check if extended functions are implemented */ movl $0x80000000, %eax cpuid - cmpl $0x80000000, %eax - jbe 6f + /* Value must be in the range 0x80000001 to 0x8000ffff */ + subl $0x80000001, %eax + cmpl $(0x8000ffff-0x80000001), %eax + ja 6f mov $0x80000001, %eax cpuid /* Execute Disable bit supported? */ - btl $20, %edx + btl $(X86_FEATURE_NX & 31), %edx jnc 6f /* Setup EFER (Extended Feature Enable Register) */ - movl $0xc0000080, %ecx + movl $MSR_EFER, %ecx rdmsr - btsl $11, %eax + btsl $_EFER_NX, %eax /* Make changes effective */ wrmsr diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 22db86a3764..2d8b5035371 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -262,11 +262,11 @@ ENTRY(secondary_startup_64) .quad x86_64_start_kernel ENTRY(initial_gs) .quad INIT_PER_CPU_VAR(irq_stack_union) - __FINITDATA ENTRY(stack_start) .quad init_thread_union+THREAD_SIZE-8 .word 0 + __FINITDATA bad_address: jmp bad_address @@ -340,6 +340,7 @@ ENTRY(name) i = i + 1 ; \ .endr + .data /* * This default setting generates an ident mapping at address 0x100000 * and a mapping for the kernel that precisely maps virtual address diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index dedc2bddf7a..ba6e6588460 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -33,6 +33,7 @@ * HPET address is set in acpi/boot.c, when an ACPI entry exists */ unsigned long hpet_address; +u8 hpet_blockid; /* OS timer block num */ #ifdef CONFIG_PCI_MSI static unsigned long hpet_num_timers; #endif @@ -47,12 +48,12 @@ struct hpet_dev { char name[10]; }; -unsigned long hpet_readl(unsigned long a) +inline unsigned int hpet_readl(unsigned int a) { return readl(hpet_virt_address + a); } -static inline void hpet_writel(unsigned long d, unsigned long a) +static inline void hpet_writel(unsigned int d, unsigned int a) { writel(d, hpet_virt_address + a); } @@ -167,7 +168,7 @@ do { \ static void hpet_reserve_msi_timers(struct hpet_data *hd); -static void hpet_reserve_platform_timers(unsigned long id) +static void hpet_reserve_platform_timers(unsigned int id) { struct hpet __iomem *hpet = hpet_virt_address; struct hpet_timer __iomem *timer = &hpet->hpet_timers[2]; @@ -205,7 +206,7 @@ static void hpet_reserve_platform_timers(unsigned long id) } #else -static void hpet_reserve_platform_timers(unsigned long id) { } +static void hpet_reserve_platform_timers(unsigned int id) { } #endif /* @@ -246,7 +247,7 @@ static void hpet_reset_counter(void) static void hpet_start_counter(void) { - unsigned long cfg = hpet_readl(HPET_CFG); + unsigned int cfg = hpet_readl(HPET_CFG); cfg |= HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); } @@ -271,7 +272,7 @@ static void hpet_resume_counter(void) static void hpet_enable_legacy_int(void) { - unsigned long cfg = hpet_readl(HPET_CFG); + unsigned int cfg = hpet_readl(HPET_CFG); cfg |= HPET_CFG_LEGACY; hpet_writel(cfg, HPET_CFG); @@ -314,7 +315,7 @@ static int hpet_setup_msi_irq(unsigned int irq); static void hpet_set_mode(enum clock_event_mode mode, struct clock_event_device *evt, int timer) { - unsigned long cfg, cmp, now; + unsigned int cfg, cmp, now; uint64_t delta; switch (mode) { @@ -323,7 +324,7 @@ static void hpet_set_mode(enum clock_event_mode mode, delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; delta >>= evt->shift; now = hpet_readl(HPET_COUNTER); - cmp = now + (unsigned long) delta; + cmp = now + (unsigned int) delta; cfg = hpet_readl(HPET_Tn_CFG(timer)); /* Make sure we use edge triggered interrupts */ cfg &= ~HPET_TN_LEVEL; @@ -339,7 +340,7 @@ static void hpet_set_mode(enum clock_event_mode mode, * (See AMD-8111 HyperTransport I/O Hub Data Sheet, * Publication # 24674) */ - hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); + hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer)); hpet_start_counter(); hpet_print_config(); break; @@ -383,13 +384,24 @@ static int hpet_next_event(unsigned long delta, hpet_writel(cnt, HPET_Tn_CMP(timer)); /* - * We need to read back the CMP register to make sure that - * what we wrote hit the chip before we compare it to the - * counter. + * We need to read back the CMP register on certain HPET + * implementations (ATI chipsets) which seem to delay the + * transfer of the compare register into the internal compare + * logic. With small deltas this might actually be too late as + * the counter could already be higher than the compare value + * at that point and we would wait for the next hpet interrupt + * forever. We found out that reading the CMP register back + * forces the transfer so we can rely on the comparison with + * the counter register below. If the read back from the + * compare register does not match the value we programmed + * then we might have a real hardware problem. We can not do + * much about it here, but at least alert the user/admin with + * a prominent warning. */ - WARN_ON_ONCE((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt); + WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt, + KERN_WARNING "hpet: compare register read back failed.\n"); - return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; + return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; } static void hpet_legacy_set_mode(enum clock_event_mode mode, @@ -415,7 +427,7 @@ static struct hpet_dev *hpet_devs; void hpet_msi_unmask(unsigned int irq) { struct hpet_dev *hdev = get_irq_data(irq); - unsigned long cfg; + unsigned int cfg; /* unmask it */ cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); @@ -425,7 +437,7 @@ void hpet_msi_unmask(unsigned int irq) void hpet_msi_mask(unsigned int irq) { - unsigned long cfg; + unsigned int cfg; struct hpet_dev *hdev = get_irq_data(irq); /* mask it */ @@ -467,7 +479,7 @@ static int hpet_msi_next_event(unsigned long delta, static int hpet_setup_msi_irq(unsigned int irq) { - if (arch_setup_hpet_msi(irq)) { + if (arch_setup_hpet_msi(irq, hpet_blockid)) { destroy_irq(irq); return -EINVAL; } @@ -584,6 +596,8 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) unsigned int num_timers_used = 0; int i; + if (boot_cpu_has(X86_FEATURE_ARAT)) + return; id = hpet_readl(HPET_ID); num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); @@ -598,7 +612,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) { struct hpet_dev *hdev = &hpet_devs[num_timers_used]; - unsigned long cfg = hpet_readl(HPET_Tn_CFG(i)); + unsigned int cfg = hpet_readl(HPET_Tn_CFG(i)); /* Only consider HPET timer with MSI support */ if (!(cfg & HPET_TN_FSB_CAP)) @@ -813,7 +827,7 @@ static int hpet_clocksource_register(void) */ int __init hpet_enable(void) { - unsigned long id; + unsigned int id; int i; if (!is_hpet_capable()) @@ -872,10 +886,8 @@ int __init hpet_enable(void) if (id & HPET_ID_LEGSUP) { hpet_legacy_clockevent_register(); - hpet_msi_capability_lookup(2); return 1; } - hpet_msi_capability_lookup(0); return 0; out_nohpet: @@ -908,9 +920,17 @@ static __init int hpet_late_init(void) if (!hpet_virt_address) return -ENODEV; + if (hpet_readl(HPET_ID) & HPET_ID_LEGSUP) + hpet_msi_capability_lookup(2); + else + hpet_msi_capability_lookup(0); + hpet_reserve_platform_timers(hpet_readl(HPET_ID)); hpet_print_config(); + if (boot_cpu_has(X86_FEATURE_ARAT)) + return 0; + for_each_online_cpu(cpu) { hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); } @@ -925,7 +945,7 @@ fs_initcall(hpet_late_init); void hpet_disable(void) { if (is_hpet_capable()) { - unsigned long cfg = hpet_readl(HPET_CFG); + unsigned int cfg = hpet_readl(HPET_CFG); if (hpet_legacy_int_enabled) { cfg &= ~HPET_CFG_LEGACY; @@ -965,8 +985,8 @@ static int hpet_prev_update_sec; static struct rtc_time hpet_alarm_time; static unsigned long hpet_pie_count; static u32 hpet_t1_cmp; -static unsigned long hpet_default_delta; -static unsigned long hpet_pie_delta; +static u32 hpet_default_delta; +static u32 hpet_pie_delta; static unsigned long hpet_pie_limit; static rtc_irq_handler irq_handler; @@ -1017,7 +1037,8 @@ EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler); */ int hpet_rtc_timer_init(void) { - unsigned long cfg, cnt, delta, flags; + unsigned int cfg, cnt, delta; + unsigned long flags; if (!is_hpet_enabled()) return 0; @@ -1027,7 +1048,7 @@ int hpet_rtc_timer_init(void) clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT; - hpet_default_delta = (unsigned long) clc; + hpet_default_delta = clc; } if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) @@ -1113,7 +1134,7 @@ int hpet_set_periodic_freq(unsigned long freq) clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; do_div(clc, freq); clc >>= hpet_clockevent.shift; - hpet_pie_delta = (unsigned long) clc; + hpet_pie_delta = clc; } return 1; } @@ -1127,7 +1148,7 @@ EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq); static void hpet_rtc_timer_reinit(void) { - unsigned long cfg, delta; + unsigned int cfg, delta; int lost_ints = -1; if (unlikely(!hpet_rtc_flags)) { diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index fee6cc2b207..664bcb7384a 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -18,7 +18,7 @@ atomic_t irq_err_count; /* Function pointer for generic interrupt vector handling */ -void (*generic_interrupt_extension)(void) = NULL; +void (*x86_platform_ipi_callback)(void) = NULL; /* * 'what should we do if we get a hw irq event on an illegal vector'. @@ -72,10 +72,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); seq_printf(p, " Performance pending work\n"); #endif - if (generic_interrupt_extension) { + if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); + seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis); seq_printf(p, " Platform interrupts\n"); } #ifdef CONFIG_SMP @@ -187,8 +187,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->apic_perf_irqs; sum += irq_stats(cpu)->apic_pending_irqs; #endif - if (generic_interrupt_extension) - sum += irq_stats(cpu)->generic_irqs; + if (x86_platform_ipi_callback) + sum += irq_stats(cpu)->x86_platform_ipis; #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; sum += irq_stats(cpu)->irq_call_count; @@ -251,9 +251,9 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) } /* - * Handler for GENERIC_INTERRUPT_VECTOR. + * Handler for X86_PLATFORM_IPI_VECTOR. */ -void smp_generic_interrupt(struct pt_regs *regs) +void smp_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -263,10 +263,10 @@ void smp_generic_interrupt(struct pt_regs *regs) irq_enter(); - inc_irq_stat(generic_irqs); + inc_irq_stat(x86_platform_ipis); - if (generic_interrupt_extension) - generic_interrupt_extension(); + if (x86_platform_ipi_callback) + x86_platform_ipi_callback(); irq_exit(); diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 40f30773fb2..d5932226614 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -200,8 +200,8 @@ static void __init apic_intr_init(void) /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - /* generic IPI for platform specific use */ - alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); + /* IPI for X86 platform specific use */ + alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi); /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 20a5b368946..dd74fe7273b 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -86,9 +86,15 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs[GDB_DS] = regs->ds; gdb_regs[GDB_ES] = regs->es; gdb_regs[GDB_CS] = regs->cs; - gdb_regs[GDB_SS] = __KERNEL_DS; gdb_regs[GDB_FS] = 0xFFFF; gdb_regs[GDB_GS] = 0xFFFF; + if (user_mode_vm(regs)) { + gdb_regs[GDB_SS] = regs->ss; + gdb_regs[GDB_SP] = regs->sp; + } else { + gdb_regs[GDB_SS] = __KERNEL_DS; + gdb_regs[GDB_SP] = kernel_stack_pointer(regs); + } #else gdb_regs[GDB_R8] = regs->r8; gdb_regs[GDB_R9] = regs->r9; @@ -101,8 +107,8 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs32[GDB_PS] = regs->flags; gdb_regs32[GDB_CS] = regs->cs; gdb_regs32[GDB_SS] = regs->ss; -#endif gdb_regs[GDB_SP] = kernel_stack_pointer(regs); +#endif } /** @@ -220,8 +226,7 @@ static void kgdb_correct_hw_break(void) dr7 |= ((breakinfo[breakno].len << 2) | breakinfo[breakno].type) << ((breakno << 2) + 16); - if (breakno >= 0 && breakno <= 3) - set_debugreg(breakinfo[breakno].addr, breakno); + set_debugreg(breakinfo[breakno].addr, breakno); } else { if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { @@ -395,7 +400,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, /* set the trace bit if we're stepping */ if (remcomInBuffer[0] == 's') { linux_regs->flags |= X86_EFLAGS_TF; - kgdb_single_step = 1; atomic_set(&kgdb_cpu_doing_single_step, raw_smp_processor_id()); } diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 1f3186ce213..5b8c7505b3b 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -481,7 +481,7 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, /* * Interrupts are disabled on entry as trap3 is an interrupt gate and they - * remain disabled thorough out this function. + * remain disabled throughout this function. */ static int __kprobes kprobe_handler(struct pt_regs *regs) { @@ -818,7 +818,7 @@ no_change: /* * Interrupts are disabled on entry as trap1 is an interrupt gate and they - * remain disabled thoroughout this function. + * remain disabled throughout this function. */ static int __kprobes post_kprobe_handler(struct pt_regs *regs) { diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index c843f8406da..a3fa43ba5d3 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -158,8 +158,7 @@ int machine_kexec_prepare(struct kimage *image) { int error; - if (nx_enabled) - set_pages_x(image->control_code_page, 1); + set_pages_x(image->control_code_page, 1); error = machine_kexec_alloc_page_tables(image); if (error) return error; @@ -173,8 +172,7 @@ int machine_kexec_prepare(struct kimage *image) */ void machine_kexec_cleanup(struct kimage *image) { - if (nx_enabled) - set_pages_nx(image->control_code_page, 1); + set_pages_nx(image->control_code_page, 1); machine_kexec_free_page_tables(image); } diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index f4c538b681c..63123d90210 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -33,6 +33,9 @@ MODULE_LICENSE("GPL v2"); #define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 #define UCODE_UCODE_TYPE 0x00000001 +const struct firmware *firmware; +static int supported_cpu; + struct equiv_cpu_entry { u32 installed_cpu; u32 fixed_errata_mask; @@ -71,17 +74,14 @@ static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { - struct cpuinfo_x86 *c = &cpu_data(cpu); u32 dummy; - memset(csig, 0, sizeof(*csig)); - if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { - printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not " - "supported\n", cpu, c->x86); + if (!supported_cpu) return -1; - } + + memset(csig, 0, sizeof(*csig)); rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); - printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); + pr_info("microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); return 0; } @@ -103,22 +103,15 @@ static int get_matching_microcode(int cpu, void *mc, int rev) i++; } - if (!equiv_cpu_id) { - printk(KERN_WARNING "microcode: CPU%d: cpu revision " - "not listed in equivalent cpu table\n", cpu); + if (!equiv_cpu_id) return 0; - } - if (mc_header->processor_rev_id != equiv_cpu_id) { - printk(KERN_ERR "microcode: CPU%d: patch mismatch " - "(processor_rev_id: %x, equiv_cpu_id: %x)\n", - cpu, mc_header->processor_rev_id, equiv_cpu_id); + if (mc_header->processor_rev_id != equiv_cpu_id) return 0; - } /* ucode might be chipset specific -- currently we don't support this */ if (mc_header->nb_dev_id || mc_header->sb_dev_id) { - printk(KERN_ERR "microcode: CPU%d: loading of chipset " + pr_err(KERN_ERR "microcode: CPU%d: loading of chipset " "specific code not yet supported\n", cpu); return 0; } @@ -148,14 +141,12 @@ static int apply_microcode_amd(int cpu) /* check current patch id and patch's id for match */ if (rev != mc_amd->hdr.patch_id) { - printk(KERN_ERR "microcode: CPU%d: update failed " + pr_err("microcode: CPU%d: update failed " "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); return -1; } - printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", - cpu, rev); - + pr_info("microcode: CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); uci->cpu_sig.rev = rev; return 0; @@ -178,18 +169,15 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) return NULL; if (section_hdr[0] != UCODE_UCODE_TYPE) { - printk(KERN_ERR "microcode: error: invalid type field in " + pr_err("microcode: error: invalid type field in " "container file section header\n"); return NULL; } total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); - printk(KERN_DEBUG "microcode: size %u, total_size %u\n", - size, total_size); - if (total_size > size || total_size > UCODE_MAX_SIZE) { - printk(KERN_ERR "microcode: error: size mismatch\n"); + pr_err("microcode: error: size mismatch\n"); return NULL; } @@ -218,15 +206,14 @@ static int install_equiv_cpu_table(const u8 *buf) size = buf_pos[2]; if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { - printk(KERN_ERR "microcode: error: invalid type field in " + pr_err("microcode: error: invalid type field in " "container file section header\n"); return 0; } equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); if (!equiv_cpu_table) { - printk(KERN_ERR "microcode: failed to allocate " - "equivalent CPU table\n"); + pr_err("microcode: failed to allocate equivalent CPU table\n"); return 0; } @@ -259,8 +246,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) offset = install_equiv_cpu_table(ucode_ptr); if (!offset) { - printk(KERN_ERR "microcode: failed to create " - "equivalent cpu table\n"); + pr_err("microcode: failed to create equivalent cpu table\n"); return UCODE_ERROR; } @@ -308,33 +294,27 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) static enum ucode_state request_microcode_fw(int cpu, struct device *device) { - const char *fw_name = "amd-ucode/microcode_amd.bin"; - const struct firmware *firmware; enum ucode_state ret; - if (request_firmware(&firmware, fw_name, device)) { - printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); + if (firmware == NULL) return UCODE_NFOUND; - } if (*(u32 *)firmware->data != UCODE_MAGIC) { - printk(KERN_ERR "microcode: invalid UCODE_MAGIC (0x%08x)\n", + pr_err("microcode: invalid UCODE_MAGIC (0x%08x)\n", *(u32 *)firmware->data); return UCODE_ERROR; } ret = generic_load_microcode(cpu, firmware->data, firmware->size); - release_firmware(firmware); - return ret; } static enum ucode_state request_microcode_user(int cpu, const void __user *buf, size_t size) { - printk(KERN_INFO "microcode: AMD microcode update via " - "/dev/cpu/microcode not supported\n"); + pr_info("microcode: AMD microcode update via " + "/dev/cpu/microcode not supported\n"); return UCODE_ERROR; } @@ -346,7 +326,32 @@ static void microcode_fini_cpu_amd(int cpu) uci->mc = NULL; } +void init_microcode_amd(struct device *device) +{ + const char *fw_name = "amd-ucode/microcode_amd.bin"; + struct cpuinfo_x86 *c = &boot_cpu_data; + + WARN_ON(c->x86_vendor != X86_VENDOR_AMD); + + if (c->x86 < 0x10) { + pr_warning("microcode: AMD CPU family 0x%x not supported\n", + c->x86); + return; + } + supported_cpu = 1; + + if (request_firmware(&firmware, fw_name, device)) + pr_err("microcode: failed to load file %s\n", fw_name); +} + +void fini_microcode_amd(void) +{ + release_firmware(firmware); +} + static struct microcode_ops microcode_amd_ops = { + .init = init_microcode_amd, + .fini = fini_microcode_amd, .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info_amd, diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 2bcad3926ed..e68aae39786 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -391,7 +391,7 @@ static enum ucode_state microcode_update_cpu(int cpu) struct ucode_cpu_info *uci = ucode_cpu_info + cpu; enum ucode_state ustate; - if (uci->valid) + if (uci->valid && uci->mc) ustate = microcode_resume_cpu(cpu); else ustate = microcode_init_cpu(cpu); @@ -518,6 +518,9 @@ static int __init microcode_init(void) return PTR_ERR(microcode_pdev); } + if (microcode_ops->init) + microcode_ops->init(µcode_pdev->dev); + get_online_cpus(); mutex_lock(µcode_mutex); @@ -561,6 +564,9 @@ static void __exit microcode_exit(void) platform_device_unregister(microcode_pdev); + if (microcode_ops->fini) + microcode_ops->fini(); + microcode_ops = NULL; pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 5be95ef4ffe..35a57c963df 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -667,36 +667,18 @@ void __init default_get_smp_config(unsigned int early) */ } -static void __init smp_reserve_bootmem(struct mpf_intel *mpf) +static void __init smp_reserve_memory(struct mpf_intel *mpf) { unsigned long size = get_mpc_size(mpf->physptr); -#ifdef CONFIG_X86_32 - /* - * We cannot access to MPC table to compute table size yet, - * as only few megabytes from the bottom is mapped now. - * PC-9800's MPC table places on the very last of physical - * memory; so that simply reserving PAGE_SIZE from mpf->physptr - * yields BUG() in reserve_bootmem. - * also need to make sure physptr is below than max_low_pfn - * we don't need reserve the area above max_low_pfn - */ - unsigned long end = max_low_pfn * PAGE_SIZE; - if (mpf->physptr < end) { - if (mpf->physptr + size > end) - size = end - mpf->physptr; - reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT); - } -#else - reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT); -#endif + reserve_early(mpf->physptr, mpf->physptr+size, "MP-table mpc"); } -static int __init smp_scan_config(unsigned long base, unsigned long length, - unsigned reserve) +static int __init smp_scan_config(unsigned long base, unsigned long length) { unsigned int *bp = phys_to_virt(base); struct mpf_intel *mpf; + unsigned long mem; apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", bp, length); @@ -717,12 +699,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, printk(KERN_INFO "found SMP MP-table at [%p] %llx\n", mpf, (u64)virt_to_phys(mpf)); - if (!reserve) - return 1; - reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf), - BOOTMEM_DEFAULT); + mem = virt_to_phys(mpf); + reserve_early(mem, mem + sizeof(*mpf), "MP-table mpf"); if (mpf->physptr) - smp_reserve_bootmem(mpf); + smp_reserve_memory(mpf); return 1; } @@ -732,7 +712,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, return 0; } -void __init default_find_smp_config(unsigned int reserve) +void __init default_find_smp_config(void) { unsigned int address; @@ -744,9 +724,9 @@ void __init default_find_smp_config(unsigned int reserve) * 2) Scan the top 1K of base RAM * 3) Scan the 64K of bios */ - if (smp_scan_config(0x0, 0x400, reserve) || - smp_scan_config(639 * 0x400, 0x400, reserve) || - smp_scan_config(0xF0000, 0x10000, reserve)) + if (smp_scan_config(0x0, 0x400) || + smp_scan_config(639 * 0x400, 0x400) || + smp_scan_config(0xF0000, 0x10000)) return; /* * If it is an SMP machine we should know now, unless the @@ -767,7 +747,7 @@ void __init default_find_smp_config(unsigned int reserve) address = get_bios_ebda(); if (address) - smp_scan_config(address, 0x400, reserve); + smp_scan_config(address, 0x400); } #ifdef CONFIG_X86_IO_APIC diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index a98fe88fab6..c95c8f4e790 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -349,26 +349,42 @@ out: return err; } -void -start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +static void +start_thread_common(struct pt_regs *regs, unsigned long new_ip, + unsigned long new_sp, + unsigned int _cs, unsigned int _ss, unsigned int _ds) { loadsegment(fs, 0); - loadsegment(es, 0); - loadsegment(ds, 0); + loadsegment(es, _ds); + loadsegment(ds, _ds); load_gs_index(0); regs->ip = new_ip; regs->sp = new_sp; percpu_write(old_rsp, new_sp); - regs->cs = __USER_CS; - regs->ss = __USER_DS; - regs->flags = 0x200; + regs->cs = _cs; + regs->ss = _ss; + regs->flags = X86_EFLAGS_IF; set_fs(USER_DS); /* * Free the old FP and other extended state */ free_thread_xstate(current); } -EXPORT_SYMBOL_GPL(start_thread); + +void +start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +{ + start_thread_common(regs, new_ip, new_sp, + __USER_CS, __USER_DS, 0); +} + +#ifdef CONFIG_IA32_EMULATION +void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) +{ + start_thread_common(regs, new_ip, new_sp, + __USER32_CS, __USER32_DS, __USER32_DS); +} +#endif /* * switch_to(x,y) should switch tasks from x to y. diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 6c3b2c6fd77..18093d7498f 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -499,6 +499,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev) { struct pci_dev *nb_ht; unsigned int devfn; + u32 node; u32 val; devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); @@ -507,7 +508,13 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev) return; pci_read_config_dword(nb_ht, 0x60, &val); - set_dev_node(&dev->dev, val & 7); + node = val & 7; + /* + * Some hardware may return an invalid node ID, + * so check it first: + */ + if (node_online(node)) + set_dev_node(&dev->dev, node); pci_dev_put(nb_ht); } diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c index 61a837743fe..201eab63b05 100644 --- a/arch/x86/kernel/reboot_fixups_32.c +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -80,6 +80,7 @@ void mach_reboot_fixups(void) continue; cur->reboot_fixup(dev); + pci_dev_put(dev); } } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 82e88cdda9b..946a311a25c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -106,6 +106,7 @@ #include <asm/percpu.h> #include <asm/topology.h> #include <asm/apicdef.h> +#include <asm/k8.h> #ifdef CONFIG_X86_64 #include <asm/numa_64.h> #endif @@ -487,42 +488,11 @@ static void __init reserve_early_setup_data(void) #ifdef CONFIG_KEXEC -/** - * Reserve @size bytes of crashkernel memory at any suitable offset. - * - * @size: Size of the crashkernel memory to reserve. - * Returns the base address on success, and -1ULL on failure. - */ -static -unsigned long long __init find_and_reserve_crashkernel(unsigned long long size) -{ - const unsigned long long alignment = 16<<20; /* 16M */ - unsigned long long start = 0LL; - - while (1) { - int ret; - - start = find_e820_area(start, ULONG_MAX, size, alignment); - if (start == -1ULL) - return start; - - /* try to reserve it */ - ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE); - if (ret >= 0) - return start; - - start += alignment; - } -} - static inline unsigned long long get_total_mem(void) { unsigned long long total; - total = max_low_pfn - min_low_pfn; -#ifdef CONFIG_HIGHMEM - total += highend_pfn - highstart_pfn; -#endif + total = max_pfn - min_low_pfn; return total << PAGE_SHIFT; } @@ -542,21 +512,25 @@ static void __init reserve_crashkernel(void) /* 0 means: find the address automatically */ if (crash_base <= 0) { - crash_base = find_and_reserve_crashkernel(crash_size); + const unsigned long long alignment = 16<<20; /* 16M */ + + crash_base = find_e820_area(alignment, ULONG_MAX, crash_size, + alignment); if (crash_base == -1ULL) { - pr_info("crashkernel reservation failed. " - "No suitable area found.\n"); + pr_info("crashkernel reservation failed - No suitable area found.\n"); return; } } else { - ret = reserve_bootmem_generic(crash_base, crash_size, - BOOTMEM_EXCLUSIVE); - if (ret < 0) { - pr_info("crashkernel reservation failed - " - "memory is in use\n"); + unsigned long long start; + + start = find_e820_area(crash_base, ULONG_MAX, crash_size, + 1<<20); + if (start != crash_base) { + pr_info("crashkernel reservation failed - memory is in use.\n"); return; } } + reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL"); printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " "for crashkernel (System RAM: %ldMB)\n", @@ -699,6 +673,9 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { void __init setup_arch(char **cmdline_p) { + int acpi = 0; + int k8 = 0; + #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); @@ -791,21 +768,18 @@ void __init setup_arch(char **cmdline_p) strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; -#ifdef CONFIG_X86_64 /* - * Must call this twice: Once just to detect whether hardware doesn't - * support NX (so that the early EHCI debug console setup can safely - * call set_fixmap(), and then again after parsing early parameters to - * honor the respective command line option. + * x86_configure_nx() is called before parse_early_param() to detect + * whether hardware doesn't support NX (so that the early EHCI debug + * console setup can safely call set_fixmap()). It may then be called + * again from within noexec_setup() during parsing early parameters + * to honor the respective command line option. */ - check_efer(); -#endif + x86_configure_nx(); parse_early_param(); -#ifdef CONFIG_X86_64 - check_efer(); -#endif + x86_report_nx(); /* Must be before kernel pagetables are setup */ vmi_activate(); @@ -901,6 +875,13 @@ void __init setup_arch(char **cmdline_p) reserve_brk(); +#ifdef CONFIG_ACPI_SLEEP + /* + * Reserve low memory region for sleep support. + * even before init_memory_mapping + */ + acpi_reserve_wakeup_memory(); +#endif init_gbpages(); /* max_pfn_mapped is updated here */ @@ -927,6 +908,8 @@ void __init setup_arch(char **cmdline_p) reserve_initrd(); + reserve_crashkernel(); + vsmp_init(); io_delay_init(); @@ -938,27 +921,24 @@ void __init setup_arch(char **cmdline_p) early_acpi_boot_init(); + /* + * Find and reserve possible boot-time SMP configuration: + */ + find_smp_config(); + #ifdef CONFIG_ACPI_NUMA /* * Parse SRAT to discover nodes. */ - acpi_numa_init(); + acpi = acpi_numa_init(); #endif - initmem_init(0, max_pfn); - -#ifdef CONFIG_ACPI_SLEEP - /* - * Reserve low memory region for sleep support. - */ - acpi_reserve_bootmem(); +#ifdef CONFIG_K8_NUMA + if (!acpi) + k8 = !k8_numa_init(0, max_pfn); #endif - /* - * Find and reserve possible boot-time SMP configuration: - */ - find_smp_config(); - reserve_crashkernel(); + initmem_init(0, max_pfn, acpi, k8); #ifdef CONFIG_X86_64 /* diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 324f2a44c22..29e6744f51e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -687,7 +687,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), }; - INIT_WORK(&c_idle.work, do_fork_idle); + INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle); alternatives_smp_switch(1); @@ -713,6 +713,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) if (IS_ERR(c_idle.idle)) { printk("failed fork for CPU %d\n", cpu); + destroy_work_on_stack(&c_idle.work); return PTR_ERR(c_idle.idle); } @@ -831,6 +832,7 @@ do_rest: smpboot_restore_warm_reset_vector(); } + destroy_work_on_stack(&c_idle.work); return boot_error; } diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index 1884a8d12bf..dee1ff7cba5 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -24,31 +24,6 @@ #include <asm/syscalls.h> -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file *file = NULL; - struct mm_struct *mm = current->mm; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(&mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - /* * Perform the select(nd, in, out, ex, tv) and mmap() system * calls. Linux/i386 didn't use to be able to handle more than @@ -77,7 +52,7 @@ asmlinkage int old_mmap(struct mmap_arg_struct __user *arg) if (a.offset & ~PAGE_MASK) goto out; - err = sys_mmap2(a.addr, a.len, a.prot, a.flags, + err = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); out: return err; diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 45e00eb09c3..8aa2057efd1 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -23,26 +23,11 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, fd, unsigned long, off) { long error; - struct file *file; - error = -EINVAL; if (off & ~PAGE_MASK) goto out; - error = -EBADF; - file = NULL; - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); + error = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); out: return error; } diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 70c2125d55b..15228b5d3eb 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -191,7 +191,7 @@ ENTRY(sys_call_table) .long sys_ni_syscall /* reserved for streams2 */ .long ptregs_vfork /* 190 */ .long sys_getrlimit - .long sys_mmap2 + .long sys_mmap_pgoff .long sys_truncate64 .long sys_ftruncate64 .long sys_stat64 /* 195 */ diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 1740c85e24b..364d015efeb 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -817,10 +817,8 @@ static int __init uv_init_blade(int blade) */ apicid = blade_to_first_apicid(blade); pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG); - if ((pa & 0xff) != UV_BAU_MESSAGE) { - uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, + uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, ((apicid << 32) | UV_BAU_MESSAGE)); - } return 0; } diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c index 583f11d5c48..3c84aa001c1 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/kernel/uv_time.c @@ -74,7 +74,7 @@ struct uv_rtc_timer_head { */ static struct uv_rtc_timer_head **blade_info __read_mostly; -static int uv_rtc_enable; +static int uv_rtc_evt_enable; /* * Hardware interface routines @@ -90,7 +90,7 @@ static void uv_rtc_send_IPI(int cpu) pnode = uv_apicid_to_pnode(apicid); val = (1UL << UVH_IPI_INT_SEND_SHFT) | (apicid << UVH_IPI_INT_APIC_ID_SHFT) | - (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT); + (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT); uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } @@ -115,7 +115,7 @@ static int uv_setup_intr(int cpu, u64 expires) uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, UVH_EVENT_OCCURRED0_RTC1_MASK); - val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | + val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); /* Set configuration */ @@ -123,7 +123,10 @@ static int uv_setup_intr(int cpu, u64 expires) /* Initialize comparator value */ uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); - return (expires < uv_read_rtc(NULL) && !uv_intr_pending(pnode)); + if (uv_read_rtc(NULL) <= expires) + return 0; + + return !uv_intr_pending(pnode); } /* @@ -223,6 +226,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires) next_cpu = head->next_cpu; *t = expires; + /* Will this one be next to go off? */ if (next_cpu < 0 || bcpu == next_cpu || expires < head->cpu[next_cpu].expires) { @@ -231,7 +235,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires) *t = ULLONG_MAX; uv_rtc_find_next_timer(head, pnode); spin_unlock_irqrestore(&head->lock, flags); - return 1; + return -ETIME; } } @@ -244,7 +248,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires) * * Returns 1 if this timer was pending. */ -static int uv_rtc_unset_timer(int cpu) +static int uv_rtc_unset_timer(int cpu, int force) { int pnode = uv_cpu_to_pnode(cpu); int bid = uv_cpu_to_blade_id(cpu); @@ -256,14 +260,15 @@ static int uv_rtc_unset_timer(int cpu) spin_lock_irqsave(&head->lock, flags); - if (head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) + if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force) rc = 1; - *t = ULLONG_MAX; - - /* Was the hardware setup for this timer? */ - if (head->next_cpu == bcpu) - uv_rtc_find_next_timer(head, pnode); + if (rc) { + *t = ULLONG_MAX; + /* Was the hardware setup for this timer? */ + if (head->next_cpu == bcpu) + uv_rtc_find_next_timer(head, pnode); + } spin_unlock_irqrestore(&head->lock, flags); @@ -310,32 +315,32 @@ static void uv_rtc_timer_setup(enum clock_event_mode mode, break; case CLOCK_EVT_MODE_UNUSED: case CLOCK_EVT_MODE_SHUTDOWN: - uv_rtc_unset_timer(ced_cpu); + uv_rtc_unset_timer(ced_cpu, 1); break; } } static void uv_rtc_interrupt(void) { - struct clock_event_device *ced = &__get_cpu_var(cpu_ced); int cpu = smp_processor_id(); + struct clock_event_device *ced = &per_cpu(cpu_ced, cpu); if (!ced || !ced->event_handler) return; - if (uv_rtc_unset_timer(cpu) != 1) + if (uv_rtc_unset_timer(cpu, 0) != 1) return; ced->event_handler(ced); } -static int __init uv_enable_rtc(char *str) +static int __init uv_enable_evt_rtc(char *str) { - uv_rtc_enable = 1; + uv_rtc_evt_enable = 1; return 1; } -__setup("uvrtc", uv_enable_rtc); +__setup("uvrtcevt", uv_enable_evt_rtc); static __init void uv_rtc_register_clockevents(struct work_struct *dummy) { @@ -350,27 +355,32 @@ static __init int uv_rtc_setup_clock(void) { int rc; - if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension) + if (!is_uv_system()) return -ENODEV; - generic_interrupt_extension = uv_rtc_interrupt; - clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, clocksource_uv.shift); + /* If single blade, prefer tsc */ + if (uv_num_possible_blades() == 1) + clocksource_uv.rating = 250; + rc = clocksource_register(&clocksource_uv); - if (rc) { - generic_interrupt_extension = NULL; + if (rc) + printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc); + else + printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n", + sn_rtc_cycles_per_second/(unsigned long)1E6); + + if (rc || !uv_rtc_evt_enable || x86_platform_ipi_callback) return rc; - } /* Setup and register clockevents */ rc = uv_rtc_allocate_timers(); - if (rc) { - clocksource_unregister(&clocksource_uv); - generic_interrupt_extension = NULL; - return rc; - } + if (rc) + goto error; + + x86_platform_ipi_callback = uv_rtc_interrupt; clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, NSEC_PER_SEC, clock_event_device_uv.shift); @@ -383,11 +393,19 @@ static __init int uv_rtc_setup_clock(void) rc = schedule_on_each_cpu(uv_rtc_register_clockevents); if (rc) { - clocksource_unregister(&clocksource_uv); - generic_interrupt_extension = NULL; + x86_platform_ipi_callback = NULL; uv_rtc_deallocate_timers(); + goto error; } + printk(KERN_INFO "UV RTC clockevents registered\n"); + + return 0; + +error: + clocksource_unregister(&clocksource_uv); + printk(KERN_INFO "UV RTC clockevents failed rc %d\n", rc); + return rc; } arch_initcall(uv_rtc_setup_clock); diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index abda6f53e71..34a279a7471 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -197,7 +197,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) apic_version[m->apicid] = ver; } -static void __init visws_find_smp_config(unsigned int reserve) +static void __init visws_find_smp_config(void) { struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 611b9e2360d..74c92bb194d 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void) evt->min_delta_ns = clockevent_delta2ns(1, evt); evt->cpumask = cpumask_of(cpu); - printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", + printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n", evt->name, evt->mult, evt->shift); clockevents_register_device(evt); } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 3c68fe2d46c..f3f2104408d 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -41,6 +41,32 @@ ENTRY(phys_startup_64) jiffies_64 = jiffies; #endif +#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +/* + * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA + * we retain large page mappings for boundaries spanning kernel text, rodata + * and data sections. + * + * However, kernel identity mappings will have different RWX permissions + * to the pages mapping to text and to the pages padding (which are freed) the + * text section. Hence kernel identity mappings will be broken to smaller + * pages. For 64-bit, kernel text and kernel identity mappings are different, + * so we can enable protection checks that come with CONFIG_DEBUG_RODATA, + * as well as retain 2MB large page mappings for kernel text. + */ +#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); + +#define X64_ALIGN_DEBUG_RODATA_END \ + . = ALIGN(HPAGE_SIZE); \ + __end_rodata_hpage_align = .; + +#else + +#define X64_ALIGN_DEBUG_RODATA_BEGIN +#define X64_ALIGN_DEBUG_RODATA_END + +#endif + PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(7); /* RWE */ @@ -90,7 +116,9 @@ SECTIONS EXCEPTION_TABLE(16) :text = 0x9090 + X64_ALIGN_DEBUG_RODATA_BEGIN RO_DATA(PAGE_SIZE) + X64_ALIGN_DEBUG_RODATA_END /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { @@ -107,13 +135,13 @@ SECTIONS PAGE_ALIGNED_DATA(PAGE_SIZE) - CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) + CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES) DATA_DATA CONSTRUCTORS /* rarely changed data like cpu maps */ - READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES) + READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES) /* End of data section */ _edata = .; @@ -137,12 +165,12 @@ SECTIONS *(.vsyscall_0) } :user - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + . = ALIGN(L1_CACHE_BYTES); .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + . = ALIGN(L1_CACHE_BYTES); .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { *(.vsyscall_gtod_data) } @@ -166,7 +194,7 @@ SECTIONS } vgetcpu_mode = VVIRT(.vgetcpu_mode); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + . = ALIGN(L1_CACHE_BYTES); .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index e02d92d12bc..9055e5872ff 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -73,7 +73,8 @@ void update_vsyscall_tz(void) write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) +void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, + u32 mult) { unsigned long flags; @@ -82,7 +83,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) vsyscall_gtod_data.clock.vread = clock->vread; vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; vsyscall_gtod_data.clock.mask = clock->mask; - vsyscall_gtod_data.clock.mult = clock->mult; + vsyscall_gtod_data.clock.mult = mult; vsyscall_gtod_data.clock.shift = clock->shift; vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index d11c5ff7c65..ccd179dec36 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -13,6 +13,7 @@ #include <asm/e820.h> #include <asm/time.h> #include <asm/irq.h> +#include <asm/pat.h> #include <asm/tsc.h> #include <asm/iommu.h> @@ -80,4 +81,5 @@ struct x86_platform_ops x86_platform = { .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, .iommu_shutdown = iommu_shutdown_noop, + .is_untracked_pat_range = is_ISA_range, }; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 73ffd5536f6..d406c523901 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -146,10 +146,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, use_gbpages = direct_gbpages; #endif - set_nx(); - if (nx_enabled) - printk(KERN_INFO "NX (Execute Disable) protection: active\n"); - /* Enable PSE if available */ if (cpu_has_pse) set_in_cr4(X86_CR4_PSE); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 30938c1d8d5..c973f8e2a6c 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -412,7 +412,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pkmap_page_table = pte; } -static void __init add_one_highpage_init(struct page *page, int pfn) +static void __init add_one_highpage_init(struct page *page) { ClearPageReserved(page); init_page_count(page); @@ -445,7 +445,7 @@ static int __init add_highpages_work_fn(unsigned long start_pfn, if (!pfn_valid(node_pfn)) continue; page = pfn_to_page(node_pfn); - add_one_highpage_init(page, node_pfn); + add_one_highpage_init(page); } return 0; @@ -703,8 +703,8 @@ void __init find_low_pfn_range(void) } #ifndef CONFIG_NEED_MULTIPLE_NODES -void __init initmem_init(unsigned long start_pfn, - unsigned long end_pfn) +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, + int acpi, int k8) { #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; @@ -997,7 +997,7 @@ static noinline int do_test_wp_bit(void) const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); -static int kernel_set_to_readonly; +int kernel_set_to_readonly __read_mostly; void set_kernel_text_rw(void) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5a4398a6006..5198b9bb34e 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -568,7 +568,8 @@ kernel_physical_mapping_init(unsigned long start, } #ifndef CONFIG_NUMA -void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, + int acpi, int k8) { unsigned long bootmap_size, bootmap; @@ -694,12 +695,12 @@ void __init mem_init(void) const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); -static int kernel_set_to_readonly; +int kernel_set_to_readonly; void set_kernel_text_rw(void) { - unsigned long start = PFN_ALIGN(_stext); - unsigned long end = PFN_ALIGN(__start_rodata); + unsigned long start = PFN_ALIGN(_text); + unsigned long end = PFN_ALIGN(__stop___ex_table); if (!kernel_set_to_readonly) return; @@ -707,13 +708,18 @@ void set_kernel_text_rw(void) pr_debug("Set kernel text: %lx - %lx for read write\n", start, end); + /* + * Make the kernel identity mapping for text RW. Kernel text + * mapping will always be RO. Refer to the comment in + * static_protections() in pageattr.c + */ set_memory_rw(start, (end - start) >> PAGE_SHIFT); } void set_kernel_text_ro(void) { - unsigned long start = PFN_ALIGN(_stext); - unsigned long end = PFN_ALIGN(__start_rodata); + unsigned long start = PFN_ALIGN(_text); + unsigned long end = PFN_ALIGN(__stop___ex_table); if (!kernel_set_to_readonly) return; @@ -721,14 +727,21 @@ void set_kernel_text_ro(void) pr_debug("Set kernel text: %lx - %lx for read only\n", start, end); + /* + * Set the kernel identity mapping for text RO. + */ set_memory_ro(start, (end - start) >> PAGE_SHIFT); } void mark_rodata_ro(void) { - unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); + unsigned long start = PFN_ALIGN(_text); unsigned long rodata_start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; + unsigned long end = (unsigned long) &__end_rodata_hpage_align; + unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table); + unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata); + unsigned long data_start = (unsigned long) &_sdata; printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -751,6 +764,14 @@ void mark_rodata_ro(void) printk(KERN_INFO "Testing CPA: again\n"); set_memory_ro(start, (end-start) >> PAGE_SHIFT); #endif + + free_init_pages("unused kernel memory", + (unsigned long) page_address(virt_to_page(text_end)), + (unsigned long) + page_address(virt_to_page(rodata_start))); + free_init_pages("unused kernel memory", + (unsigned long) page_address(virt_to_page(rodata_end)), + (unsigned long) page_address(virt_to_page(data_start))); } #endif diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 2feb9bdedaa..c246d259822 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -281,30 +281,6 @@ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) } EXPORT_SYMBOL(ioremap_cache); -static void __iomem *ioremap_default(resource_size_t phys_addr, - unsigned long size) -{ - unsigned long flags; - void __iomem *ret; - int err; - - /* - * - WB for WB-able memory and no other conflicting mappings - * - UC_MINUS for non-WB-able memory with no other conflicting mappings - * - Inherit from confliting mappings otherwise - */ - err = reserve_memtype(phys_addr, phys_addr + size, - _PAGE_CACHE_WB, &flags); - if (err < 0) - return NULL; - - ret = __ioremap_caller(phys_addr, size, flags, - __builtin_return_address(0)); - - free_memtype(phys_addr, phys_addr + size); - return ret; -} - void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, unsigned long prot_val) { @@ -380,7 +356,7 @@ void *xlate_dev_mem_ptr(unsigned long phys) if (page_is_ram(start >> PAGE_SHIFT)) return __va(phys); - addr = (void __force *)ioremap_default(start, PAGE_SIZE); + addr = (void __force *)ioremap_cache(start, PAGE_SIZE); if (addr) addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 268f8255280..970ed579d4e 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -24,6 +24,9 @@ #include <asm/apic.h> #include <asm/k8.h> +static struct bootnode __initdata nodes[8]; +static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; + static __init int find_northbridge(void) { int num; @@ -54,18 +57,6 @@ static __init void early_get_boot_cpu_id(void) * need to get boot_cpu_id so can use that to create apicid_to_node * in k8_scan_nodes() */ - /* - * Find possible boot-time SMP configuration: - */ -#ifdef CONFIG_X86_MPPARSE - early_find_smp_config(); -#endif -#ifdef CONFIG_ACPI - /* - * Read APIC information from ACPI tables. - */ - early_acpi_boot_init(); -#endif #ifdef CONFIG_X86_MPPARSE /* * get boot-time SMP configuration: @@ -76,12 +67,26 @@ static __init void early_get_boot_cpu_id(void) early_init_lapic_mapping(); } -int __init k8_scan_nodes(unsigned long start, unsigned long end) +int __init k8_get_nodes(struct bootnode *physnodes) { - unsigned numnodes, cores, bits, apicid_base; + int i; + int ret = 0; + + for_each_node_mask(i, nodes_parsed) { + physnodes[ret].start = nodes[i].start; + physnodes[ret].end = nodes[i].end; + ret++; + } + return ret; +} + +int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long start = PFN_PHYS(start_pfn); + unsigned long end = PFN_PHYS(end_pfn); + unsigned numnodes; unsigned long prevbase; - struct bootnode nodes[8]; - int i, j, nb, found = 0; + int i, nb, found = 0; u32 nodeid, reg; if (!early_pci_allowed()) @@ -91,16 +96,15 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) if (nb < 0) return nb; - printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); + pr_info("Scanning NUMA topology in Northbridge %d\n", nb); reg = read_pci_config(0, nb, 0, 0x60); numnodes = ((reg >> 4) & 0xF) + 1; if (numnodes <= 1) return -1; - printk(KERN_INFO "Number of nodes %d\n", numnodes); + pr_info("Number of physical nodes %d\n", numnodes); - memset(&nodes, 0, sizeof(nodes)); prevbase = 0; for (i = 0; i < 8; i++) { unsigned long base, limit; @@ -111,28 +115,28 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) nodeid = limit & 7; if ((base & 3) == 0) { if (i < numnodes) - printk("Skipping disabled node %d\n", i); + pr_info("Skipping disabled node %d\n", i); continue; } if (nodeid >= numnodes) { - printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, - base, limit); + pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid, + base, limit); continue; } if (!limit) { - printk(KERN_INFO "Skipping node entry %d (base %lx)\n", - i, base); + pr_info("Skipping node entry %d (base %lx)\n", + i, base); continue; } if ((base >> 8) & 3 || (limit >> 8) & 3) { - printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", - nodeid, (base>>8)&3, (limit>>8) & 3); + pr_err("Node %d using interleaving mode %lx/%lx\n", + nodeid, (base >> 8) & 3, (limit >> 8) & 3); return -1; } - if (node_isset(nodeid, node_possible_map)) { - printk(KERN_INFO "Node %d already present. Skipping\n", - nodeid); + if (node_isset(nodeid, nodes_parsed)) { + pr_info("Node %d already present, skipping\n", + nodeid); continue; } @@ -141,8 +145,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) limit |= (1<<24)-1; limit++; - if (limit > max_pfn << PAGE_SHIFT) - limit = max_pfn << PAGE_SHIFT; + if (limit > end) + limit = end; if (limit <= base) continue; @@ -154,24 +158,24 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) if (limit > end) limit = end; if (limit == base) { - printk(KERN_ERR "Empty node %d\n", nodeid); + pr_err("Empty node %d\n", nodeid); continue; } if (limit < base) { - printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", + pr_err("Node %d bogus settings %lx-%lx.\n", nodeid, base, limit); continue; } /* Could sort here, but pun for now. Should not happen anyroads. */ if (prevbase > base) { - printk(KERN_ERR "Node map not sorted %lx,%lx\n", + pr_err("Node map not sorted %lx,%lx\n", prevbase, base); return -1; } - printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", - nodeid, base, limit); + pr_info("Node %d MemBase %016lx Limit %016lx\n", + nodeid, base, limit); found++; @@ -180,18 +184,29 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) prevbase = base; - node_set(nodeid, node_possible_map); + node_set(nodeid, nodes_parsed); } if (!found) return -1; + return 0; +} + +int __init k8_scan_nodes(void) +{ + unsigned int bits; + unsigned int cores; + unsigned int apicid_base; + int i; + BUG_ON(nodes_empty(nodes_parsed)); + node_possible_map = nodes_parsed; memnode_shift = compute_hash_shift(nodes, 8, NULL); if (memnode_shift < 0) { - printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); + pr_err("No NUMA node hash function found. Contact maintainer\n"); return -1; } - printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); + pr_info("Using node hash shift of %d\n", memnode_shift); /* use the coreid bits from early_identify_cpu */ bits = boot_cpu_data.x86_coreid_bits; @@ -200,14 +215,12 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) /* need to get boot_cpu_id early for system with apicid lifting */ early_get_boot_cpu_id(); if (boot_cpu_physical_apicid > 0) { - printk(KERN_INFO "BSP APIC ID: %02x\n", - boot_cpu_physical_apicid); + pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); apicid_base = boot_cpu_physical_apicid; } - for (i = 0; i < 8; i++) { - if (nodes[i].start == nodes[i].end) - continue; + for_each_node_mask(i, node_possible_map) { + int j; e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 11a4ad4d625..07bcc309cfd 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -203,7 +203,7 @@ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) */ /* * Interrupts are disabled on entry as trap3 is an interrupt gate - * and they remain disabled thorough out this function. + * and they remain disabled throughout this function. */ int kmmio_handler(struct pt_regs *regs, unsigned long addr) { @@ -302,7 +302,7 @@ no_kmmio: /* * Interrupts are disabled on entry as trap1 is an interrupt gate - * and they remain disabled thorough out this function. + * and they remain disabled throughout this function. * This must always get called as the pair to kmmio_handler(). */ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index d2530062fe0..b20760ca724 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -347,8 +347,8 @@ static void init_remap_allocator(int nid) (ulong) node_remap_end_vaddr[nid]); } -void __init initmem_init(unsigned long start_pfn, - unsigned long end_pfn) +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, + int acpi, int k8) { int nid; long kva_target_pfn; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 459913beac7..83bbc70d11b 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -239,8 +239,14 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) bootmap = early_node_mem(nodeid, bootmap_start, end, bootmap_pages<<PAGE_SHIFT, PAGE_SIZE); if (bootmap == NULL) { - if (nodedata_phys < start || nodedata_phys >= end) - free_bootmem(nodedata_phys, pgdat_size); + if (nodedata_phys < start || nodedata_phys >= end) { + /* + * only need to free it if it is from other node + * bootmem + */ + if (nid != nodeid) + free_bootmem(nodedata_phys, pgdat_size); + } node_data[nodeid] = NULL; return; } @@ -306,8 +312,71 @@ void __init numa_init_array(void) #ifdef CONFIG_NUMA_EMU /* Numa emulation */ +static struct bootnode nodes[MAX_NUMNODES] __initdata; +static struct bootnode physnodes[MAX_NUMNODES] __initdata; static char *cmdline __initdata; +static int __init setup_physnodes(unsigned long start, unsigned long end, + int acpi, int k8) +{ + int nr_nodes = 0; + int ret = 0; + int i; + +#ifdef CONFIG_ACPI_NUMA + if (acpi) + nr_nodes = acpi_get_nodes(physnodes); +#endif +#ifdef CONFIG_K8_NUMA + if (k8) + nr_nodes = k8_get_nodes(physnodes); +#endif + /* + * Basic sanity checking on the physical node map: there may be errors + * if the SRAT or K8 incorrectly reported the topology or the mem= + * kernel parameter is used. + */ + for (i = 0; i < nr_nodes; i++) { + if (physnodes[i].start == physnodes[i].end) + continue; + if (physnodes[i].start > end) { + physnodes[i].end = physnodes[i].start; + continue; + } + if (physnodes[i].end < start) { + physnodes[i].start = physnodes[i].end; + continue; + } + if (physnodes[i].start < start) + physnodes[i].start = start; + if (physnodes[i].end > end) + physnodes[i].end = end; + } + + /* + * Remove all nodes that have no memory or were truncated because of the + * limited address range. + */ + for (i = 0; i < nr_nodes; i++) { + if (physnodes[i].start == physnodes[i].end) + continue; + physnodes[ret].start = physnodes[i].start; + physnodes[ret].end = physnodes[i].end; + ret++; + } + + /* + * If no physical topology was detected, a single node is faked to cover + * the entire address space. + */ + if (!ret) { + physnodes[ret].start = start; + physnodes[ret].end = end; + ret = 1; + } + return ret; +} + /* * Setups up nid to range from addr to addr + size. If the end * boundary is greater than max_addr, then max_addr is used instead. @@ -315,11 +384,9 @@ static char *cmdline __initdata; * allocation past addr and -1 otherwise. addr is adjusted to be at * the end of the node. */ -static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, - u64 size, u64 max_addr) +static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr) { int ret = 0; - nodes[nid].start = *addr; *addr += size; if (*addr >= max_addr) { @@ -335,12 +402,111 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, } /* + * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr + * to max_addr. The return value is the number of nodes allocated. + */ +static int __init split_nodes_interleave(u64 addr, u64 max_addr, + int nr_phys_nodes, int nr_nodes) +{ + nodemask_t physnode_mask = NODE_MASK_NONE; + u64 size; + int big; + int ret = 0; + int i; + + if (nr_nodes <= 0) + return -1; + if (nr_nodes > MAX_NUMNODES) { + pr_info("numa=fake=%d too large, reducing to %d\n", + nr_nodes, MAX_NUMNODES); + nr_nodes = MAX_NUMNODES; + } + + size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes; + /* + * Calculate the number of big nodes that can be allocated as a result + * of consolidating the remainder. + */ + big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) / + FAKE_NODE_MIN_SIZE; + + size &= FAKE_NODE_MIN_HASH_MASK; + if (!size) { + pr_err("Not enough memory for each node. " + "NUMA emulation disabled.\n"); + return -1; + } + + for (i = 0; i < nr_phys_nodes; i++) + if (physnodes[i].start != physnodes[i].end) + node_set(i, physnode_mask); + + /* + * Continue to fill physical nodes with fake nodes until there is no + * memory left on any of them. + */ + while (nodes_weight(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 end = physnodes[i].start + size; + u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); + + if (ret < big) + end += FAKE_NODE_MIN_SIZE; + + /* + * Continue to add memory to this fake node if its + * non-reserved memory is less than the per-node size. + */ + while (end - physnodes[i].start - + e820_hole_size(physnodes[i].start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > physnodes[i].end) { + end = physnodes[i].end; + break; + } + } + + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. + */ + if (physnodes[i].end - end - + e820_hole_size(end, physnodes[i].end) < size) + end = physnodes[i].end; + + /* + * Avoid allocating more nodes than requested, which can + * happen as a result of rounding down each node's size + * to FAKE_NODE_MIN_SIZE. + */ + if (nodes_weight(physnode_mask) + ret >= nr_nodes) + end = physnodes[i].end; + + if (setup_node_range(ret++, &physnodes[i].start, + end - physnodes[i].start, + physnodes[i].end) < 0) + node_clear(i, physnode_mask); + } + } + return ret; +} + +/* * Splits num_nodes nodes up equally starting at node_start. The return value * is the number of nodes split up and addr is adjusted to be at the end of the * last node allocated. */ -static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, - u64 max_addr, int node_start, +static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start, int num_nodes) { unsigned int big; @@ -388,7 +554,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, break; } } - if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0) + if (setup_node_range(i, addr, end - *addr, max_addr) < 0) break; } return i - node_start + 1; @@ -399,12 +565,12 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, * always assigned to a final node and can be asymmetric. Returns the number of * nodes split. */ -static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, - u64 max_addr, int node_start, u64 size) +static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start, + u64 size) { int i = node_start; size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; - while (!setup_node_range(i++, nodes, addr, size, max_addr)) + while (!setup_node_range(i++, addr, size, max_addr)) ; return i - node_start; } @@ -413,15 +579,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, * Sets up the system RAM area from start_pfn to last_pfn according to the * numa=fake command-line option. */ -static struct bootnode nodes[MAX_NUMNODES] __initdata; - -static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn) +static int __init numa_emulation(unsigned long start_pfn, + unsigned long last_pfn, int acpi, int k8) { u64 size, addr = start_pfn << PAGE_SHIFT; u64 max_addr = last_pfn << PAGE_SHIFT; int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; + int num_phys_nodes; - memset(&nodes, 0, sizeof(nodes)); + num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); /* * If the numa=fake command-line is just a single number N, split the * system RAM into N fake nodes. @@ -429,7 +595,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { long n = simple_strtol(cmdline, NULL, 0); - num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n); + num_nodes = split_nodes_interleave(addr, max_addr, + num_phys_nodes, n); if (num_nodes < 0) return num_nodes; goto out; @@ -456,8 +623,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; if (size) for (i = 0; i < coeff; i++, num_nodes++) - if (setup_node_range(num_nodes, nodes, - &addr, size, max_addr) < 0) + if (setup_node_range(num_nodes, &addr, + size, max_addr) < 0) goto done; if (!*cmdline) break; @@ -473,7 +640,7 @@ done: if (addr < max_addr) { if (coeff_flag && coeff < 0) { /* Split remaining nodes into num-sized chunks */ - num_nodes += split_nodes_by_size(nodes, &addr, max_addr, + num_nodes += split_nodes_by_size(&addr, max_addr, num_nodes, num); goto out; } @@ -482,7 +649,7 @@ done: /* Split remaining nodes into coeff chunks */ if (coeff <= 0) break; - num_nodes += split_nodes_equally(nodes, &addr, max_addr, + num_nodes += split_nodes_equally(&addr, max_addr, num_nodes, coeff); break; case ',': @@ -490,8 +657,8 @@ done: break; default: /* Give one final node */ - setup_node_range(num_nodes, nodes, &addr, - max_addr - addr, max_addr); + setup_node_range(num_nodes, &addr, max_addr - addr, + max_addr); num_nodes++; } } @@ -505,14 +672,10 @@ out: } /* - * We need to vacate all active ranges that may have been registered by - * SRAT and set acpi_numa to -1 so that srat_disabled() always returns - * true. NUMA emulation has succeeded so we will not scan ACPI nodes. + * We need to vacate all active ranges that may have been registered for + * the e820 memory map. */ remove_all_active_ranges(); -#ifdef CONFIG_ACPI_NUMA - acpi_numa = -1; -#endif for_each_node_mask(i, node_possible_map) { e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); @@ -524,7 +687,8 @@ out: } #endif /* CONFIG_NUMA_EMU */ -void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn) +void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, + int acpi, int k8) { int i; @@ -532,23 +696,22 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn) nodes_clear(node_online_map); #ifdef CONFIG_NUMA_EMU - if (cmdline && !numa_emulation(start_pfn, last_pfn)) + if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8)) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); #endif #ifdef CONFIG_ACPI_NUMA - if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, - last_pfn << PAGE_SHIFT)) + if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, + last_pfn << PAGE_SHIFT)) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); #endif #ifdef CONFIG_K8_NUMA - if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, - last_pfn<<PAGE_SHIFT)) + if (!numa_off && k8 && !k8_scan_nodes()) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); @@ -601,6 +764,25 @@ static __init int numa_setup(char *opt) early_param("numa", numa_setup); #ifdef CONFIG_NUMA + +static __init int find_near_online_node(int node) +{ + int n, val; + int min_val = INT_MAX; + int best_node = -1; + + for_each_online_node(n) { + val = node_distance(node, n); + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + return best_node; +} + /* * Setup early cpu_to_node. * @@ -632,7 +814,7 @@ void __init init_cpu_to_node(void) if (node == NUMA_NO_NODE) continue; if (!node_online(node)) - continue; + node = find_near_online_node(node); numa_set_node(cpu, node); } } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index dd38bfbefd1..1d4eb93d333 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -279,6 +279,22 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; +#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) + /* + * Once the kernel maps the text as RO (kernel_set_to_readonly is set), + * kernel text mappings for the large page aligned text, rodata sections + * will be always read-only. For the kernel identity mappings covering + * the holes caused by this alignment can be anything that user asks. + * + * This will preserve the large page mappings for kernel text/data + * at no extra cost. + */ + if (kernel_set_to_readonly && + within(address, (unsigned long)_text, + (unsigned long)__end_rodata_hpage_align)) + pgprot_val(forbidden) |= _PAGE_RW; +#endif + prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); return prot; @@ -1069,12 +1085,18 @@ EXPORT_SYMBOL(set_memory_array_wb); int set_memory_x(unsigned long addr, int numpages) { + if (!(__supported_pte_mask & _PAGE_NX)) + return 0; + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); } EXPORT_SYMBOL(set_memory_x); int set_memory_nx(unsigned long addr, int numpages) { + if (!(__supported_pte_mask & _PAGE_NX)) + return 0; + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); } EXPORT_SYMBOL(set_memory_nx); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index e78cd0ec2bc..ae9648eb1c7 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -20,6 +20,7 @@ #include <asm/cacheflush.h> #include <asm/processor.h> #include <asm/tlbflush.h> +#include <asm/x86_init.h> #include <asm/pgtable.h> #include <asm/fcntl.h> #include <asm/e820.h> @@ -355,9 +356,6 @@ static int free_ram_pages_type(u64 start, u64 end) * - _PAGE_CACHE_UC_MINUS * - _PAGE_CACHE_UC * - * req_type will have a special case value '-1', when requester want to inherit - * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. - * * If new_type is NULL, function will return an error if it cannot reserve the * region with req_type. If new_type is non-NULL, function will return * available type in new_type in case of no error. In case of any error @@ -377,9 +375,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (!pat_enabled) { /* This is identical to page table setting without PAT */ if (new_type) { - if (req_type == -1) - *new_type = _PAGE_CACHE_WB; - else if (req_type == _PAGE_CACHE_WC) + if (req_type == _PAGE_CACHE_WC) *new_type = _PAGE_CACHE_UC_MINUS; else *new_type = req_type & _PAGE_CACHE_MASK; @@ -388,7 +384,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } /* Low ISA region is always mapped WB in page table. No need to track */ - if (is_ISA_range(start, end - 1)) { + if (x86_platform.is_untracked_pat_range(start, end)) { if (new_type) *new_type = _PAGE_CACHE_WB; return 0; @@ -499,7 +495,7 @@ int free_memtype(u64 start, u64 end) return 0; /* Low ISA region is always mapped WB. No need to track */ - if (is_ISA_range(start, end - 1)) + if (x86_platform.is_untracked_pat_range(start, end)) return 0; is_range_ram = pat_pagerange_is_ram(start, end); @@ -582,7 +578,7 @@ static unsigned long lookup_memtype(u64 paddr) int rettype = _PAGE_CACHE_WB; struct memtype *entry; - if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1)) + if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE)) return rettype; if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { @@ -708,9 +704,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, if (!range_is_allowed(pfn, size)) return 0; - if (file->f_flags & O_SYNC) { + if (file->f_flags & O_DSYNC) flags = _PAGE_CACHE_UC_MINUS; - } #ifdef CONFIG_X86_32 /* @@ -1018,8 +1013,10 @@ static const struct file_operations memtype_fops = { static int __init pat_memtype_list_init(void) { - debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir, - NULL, &memtype_fops); + if (pat_enabled) { + debugfs_create_file("pat_memtype_list", S_IRUSR, + arch_debugfs_dir, NULL, &memtype_fops); + } return 0; } diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index 513d8ed5d2e..a3250aa3408 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -3,10 +3,8 @@ #include <linux/init.h> #include <asm/pgtable.h> +#include <asm/proto.h> -int nx_enabled; - -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) static int disable_nx __cpuinitdata; /* @@ -22,48 +20,41 @@ static int __init noexec_setup(char *str) if (!str) return -EINVAL; if (!strncmp(str, "on", 2)) { - __supported_pte_mask |= _PAGE_NX; disable_nx = 0; } else if (!strncmp(str, "off", 3)) { disable_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; } + x86_configure_nx(); return 0; } early_param("noexec", noexec_setup); -#endif -#ifdef CONFIG_X86_PAE -void __init set_nx(void) +void __cpuinit x86_configure_nx(void) { - unsigned int v[4], l, h; - - if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { - cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); + if (cpu_has_nx && !disable_nx) + __supported_pte_mask |= _PAGE_NX; + else + __supported_pte_mask &= ~_PAGE_NX; +} - if ((v[3] & (1 << 20)) && !disable_nx) { - rdmsr(MSR_EFER, l, h); - l |= EFER_NX; - wrmsr(MSR_EFER, l, h); - nx_enabled = 1; - __supported_pte_mask |= _PAGE_NX; +void __init x86_report_nx(void) +{ + if (!cpu_has_nx) { + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "missing in CPU or disabled in BIOS!\n"); + } else { +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + if (disable_nx) { + printk(KERN_INFO "NX (Execute Disable) protection: " + "disabled by kernel command line option\n"); + } else { + printk(KERN_INFO "NX (Execute Disable) protection: " + "active\n"); } - } -} #else -void set_nx(void) -{ -} + /* 32bit non-PAE kernel, NX cannot be used */ + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "cannot be enabled: non-PAE kernel!\n"); #endif - -#ifdef CONFIG_X86_64 -void __cpuinit check_efer(void) -{ - unsigned long efer; - - rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || disable_nx) - __supported_pte_mask &= ~_PAGE_NX; + } } -#endif - diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 9d7ce96e5a5..d8907548966 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -290,8 +290,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, start, end); - e820_register_active_regions(node, start >> PAGE_SHIFT, - end >> PAGE_SHIFT); if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { update_nodes_add(node, start, end); @@ -338,6 +336,19 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) void __init acpi_numa_arch_fixup(void) {} +int __init acpi_get_nodes(struct bootnode *physnodes) +{ + int i; + int ret = 0; + + for_each_node_mask(i, nodes_parsed) { + physnodes[ret].start = nodes[i].start; + physnodes[ret].end = nodes[i].end; + ret++; + } + return ret; +} + /* Use the information discovered above to actually set up the nodes. */ int __init acpi_scan_nodes(unsigned long start, unsigned long end) { @@ -350,11 +361,6 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) for (i = 0; i < MAX_NUMNODES; i++) cutoff_node(i, start, end); - if (!nodes_cover_memory(nodes)) { - bad_srat(); - return -1; - } - memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, memblk_nodeid); if (memnode_shift < 0) { @@ -364,6 +370,14 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) return -1; } + for_each_node_mask(i, nodes_parsed) + e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, + nodes[i].end >> PAGE_SHIFT); + if (!nodes_cover_memory(nodes)) { + bad_srat(); + return -1; + } + /* Account for nodes with cpus and no memory */ nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); @@ -454,7 +468,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) for (i = 0; i < num_nodes; i++) if (fake_nodes[i].start != fake_nodes[i].end) node_set(i, nodes_parsed); - WARN_ON(!nodes_cover_memory(fake_nodes)); } static int null_slit_node_compare(int a, int b) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 36fe08eeb5c..65b58e4b0b8 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -8,6 +8,7 @@ #include <asm/tlbflush.h> #include <asm/mmu_context.h> +#include <asm/cache.h> #include <asm/apic.h> #include <asm/uv/uv.h> @@ -43,7 +44,7 @@ union smp_flush_state { spinlock_t tlbstate_lock; DECLARE_BITMAP(flush_cpumask, NR_CPUS); }; - char pad[CONFIG_X86_INTERNODE_CACHE_BYTES]; + char pad[INTERNODE_CACHE_BYTES]; } ____cacheline_internodealigned_in_smp; /* State is put into the per CPU data section, but padded diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index d49202e740e..564b008a51c 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -15,3 +15,8 @@ obj-$(CONFIG_X86_NUMAQ) += numaq_32.o obj-y += common.o early.o obj-y += amd_bus.o +obj-$(CONFIG_X86_64) += bus_numa.o intel_bus.o + +ifeq ($(CONFIG_PCI_DEBUG),y) +EXTRA_CFLAGS += -DDEBUG +endif diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 1014eb4bfc3..959e548a703 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -7,6 +7,7 @@ #include <asm/pci_x86.h> struct pci_root_info { + struct acpi_device *bridge; char *name; unsigned int res_num; struct resource *res; @@ -58,6 +59,30 @@ bus_has_transparent_bridge(struct pci_bus *bus) return false; } +static void +align_resource(struct acpi_device *bridge, struct resource *res) +{ + int align = (res->flags & IORESOURCE_MEM) ? 16 : 4; + + /* + * Host bridge windows are not BARs, but the decoders on the PCI side + * that claim this address space have starting alignment and length + * constraints, so fix any obvious BIOS goofs. + */ + if (!IS_ALIGNED(res->start, align)) { + dev_printk(KERN_DEBUG, &bridge->dev, + "host bridge window %pR invalid; " + "aligning start to %d-byte boundary\n", res, align); + res->start &= ~(align - 1); + } + if (!IS_ALIGNED(res->end + 1, align)) { + dev_printk(KERN_DEBUG, &bridge->dev, + "host bridge window %pR invalid; " + "aligning end to %d-byte boundary\n", res, align); + res->end = ALIGN(res->end, align) - 1; + } +} + static acpi_status setup_resource(struct acpi_resource *acpi_res, void *data) { @@ -91,11 +116,12 @@ setup_resource(struct acpi_resource *acpi_res, void *data) start = addr.minimum + addr.translation_offset; end = start + addr.address_length - 1; if (info->res_num >= max_root_bus_resources) { - printk(KERN_WARNING "PCI: Failed to allocate 0x%lx-0x%lx " - "from %s for %s due to _CRS returning more than " - "%d resource descriptors\n", (unsigned long) start, - (unsigned long) end, root->name, info->name, - max_root_bus_resources); + if (pci_probe & PCI_USE__CRS) + printk(KERN_WARNING "PCI: Failed to allocate " + "0x%lx-0x%lx from %s for %s due to _CRS " + "returning more than %d resource descriptors\n", + (unsigned long) start, (unsigned long) end, + root->name, info->name, max_root_bus_resources); return AE_OK; } @@ -105,14 +131,28 @@ setup_resource(struct acpi_resource *acpi_res, void *data) res->start = start; res->end = end; res->child = NULL; + align_resource(info->bridge, res); + + if (!(pci_probe & PCI_USE__CRS)) { + dev_printk(KERN_DEBUG, &info->bridge->dev, + "host bridge window %pR (ignored)\n", res); + return AE_OK; + } if (insert_resource(root, res)) { - printk(KERN_ERR "PCI: Failed to allocate 0x%lx-0x%lx " - "from %s for %s\n", (unsigned long) res->start, - (unsigned long) res->end, root->name, info->name); + dev_err(&info->bridge->dev, + "can't allocate host bridge window %pR\n", res); } else { info->bus->resource[info->res_num] = res; info->res_num++; + if (addr.translation_offset) + dev_info(&info->bridge->dev, "host bridge window %pR " + "(PCI address [%#llx-%#llx])\n", + res, res->start - addr.translation_offset, + res->end - addr.translation_offset); + else + dev_info(&info->bridge->dev, + "host bridge window %pR\n", res); } return AE_OK; } @@ -124,6 +164,12 @@ get_current_resources(struct acpi_device *device, int busnum, struct pci_root_info info; size_t size; + if (!(pci_probe & PCI_USE__CRS)) + dev_info(&device->dev, + "ignoring host bridge windows from ACPI; " + "boot with \"pci=use_crs\" to use them\n"); + + info.bridge = device; info.bus = bus; info.res_num = 0; acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource, @@ -163,8 +209,9 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do #endif if (domain && !pci_domains_supported) { - printk(KERN_WARNING "PCI: Multiple domains not supported " - "(dom %d, bus %d)\n", domain, busnum); + printk(KERN_WARNING "pci_bus %04x:%02x: " + "ignored (multiple domains not supported)\n", + domain, busnum); return NULL; } @@ -188,7 +235,8 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do */ sd = kzalloc(sizeof(*sd), GFP_KERNEL); if (!sd) { - printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum); + printk(KERN_WARNING "pci_bus %04x:%02x: " + "ignored (out of memory)\n", domain, busnum); return NULL; } @@ -209,9 +257,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do } else { bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd); if (bus) { - if (pci_probe & PCI_USE__CRS) - get_current_resources(device, busnum, domain, - bus); + get_current_resources(device, busnum, domain, bus); bus->subordinate = pci_scan_child_bus(bus); } } diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 572ee9782f2..95ecbd49595 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -6,10 +6,10 @@ #ifdef CONFIG_X86_64 #include <asm/pci-direct.h> -#include <asm/mpspec.h> -#include <linux/cpumask.h> #endif +#include "bus_numa.h" + /* * This discovers the pcibus <-> node mapping on AMD K8. * also get peer root bus resource for io,mmio @@ -17,67 +17,6 @@ #ifdef CONFIG_X86_64 -/* - * sub bus (transparent) will use entres from 3 to store extra from root, - * so need to make sure have enought slot there, increase PCI_BUS_NUM_RESOURCES? - */ -#define RES_NUM 16 -struct pci_root_info { - char name[12]; - unsigned int res_num; - struct resource res[RES_NUM]; - int bus_min; - int bus_max; - int node; - int link; -}; - -/* 4 at this time, it may become to 32 */ -#define PCI_ROOT_NR 4 -static int pci_root_num; -static struct pci_root_info pci_root_info[PCI_ROOT_NR]; - -void x86_pci_root_bus_res_quirks(struct pci_bus *b) -{ - int i; - int j; - struct pci_root_info *info; - - /* don't go for it if _CRS is used already */ - if (b->resource[0] != &ioport_resource || - b->resource[1] != &iomem_resource) - return; - - /* if only one root bus, don't need to anything */ - if (pci_root_num < 2) - return; - - for (i = 0; i < pci_root_num; i++) { - if (pci_root_info[i].bus_min == b->number) - break; - } - - if (i == pci_root_num) - return; - - printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n", - b->number); - - info = &pci_root_info[i]; - for (j = 0; j < info->res_num; j++) { - struct resource *res; - struct resource *root; - - res = &info->res[j]; - b->resource[j] = res; - if (res->flags & IORESOURCE_IO) - root = &ioport_resource; - else - root = &iomem_resource; - insert_resource(root, res); - } -} - #define RANGE_NUM 16 struct res_range { @@ -130,52 +69,6 @@ static void __init update_range(struct res_range *range, size_t start, } } -static void __init update_res(struct pci_root_info *info, size_t start, - size_t end, unsigned long flags, int merge) -{ - int i; - struct resource *res; - - if (!merge) - goto addit; - - /* try to merge it with old one */ - for (i = 0; i < info->res_num; i++) { - size_t final_start, final_end; - size_t common_start, common_end; - - res = &info->res[i]; - if (res->flags != flags) - continue; - - common_start = max((size_t)res->start, start); - common_end = min((size_t)res->end, end); - if (common_start > common_end + 1) - continue; - - final_start = min((size_t)res->start, start); - final_end = max((size_t)res->end, end); - - res->start = final_start; - res->end = final_end; - return; - } - -addit: - - /* need to add that */ - if (info->res_num >= RES_NUM) - return; - - res = &info->res[info->res_num]; - res->name = info->name; - res->flags = flags; - res->start = start; - res->end = end; - res->child = NULL; - info->res_num++; -} - struct pci_hostbridge_probe { u32 bus; u32 slot; @@ -230,7 +123,6 @@ static int __init early_fill_mp_bus_info(void) int j; unsigned bus; unsigned slot; - int found; int node; int link; int def_node; @@ -247,7 +139,7 @@ static int __init early_fill_mp_bus_info(void) if (!early_pci_allowed()) return -1; - found = 0; + found_all_numa_early = 0; for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { u32 id; u16 device; @@ -261,12 +153,12 @@ static int __init early_fill_mp_bus_info(void) device = (id>>16) & 0xffff; if (pci_probes[i].vendor == vendor && pci_probes[i].device == device) { - found = 1; + found_all_numa_early = 1; break; } } - if (!found) + if (!found_all_numa_early) return 0; pci_root_num = 0; @@ -488,7 +380,7 @@ static int __init early_fill_mp_bus_info(void) info = &pci_root_info[i]; res_num = info->res_num; busnum = info->bus_min; - printk(KERN_DEBUG "bus: [%02x,%02x] on node %x link %x\n", + printk(KERN_DEBUG "bus: [%02x, %02x] on node %x link %x\n", info->bus_min, info->bus_max, info->node, info->link); for (j = 0; j < res_num; j++) { res = &info->res[j]; diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c new file mode 100644 index 00000000000..145df00e038 --- /dev/null +++ b/arch/x86/pci/bus_numa.c @@ -0,0 +1,101 @@ +#include <linux/init.h> +#include <linux/pci.h> + +#include "bus_numa.h" + +int pci_root_num; +struct pci_root_info pci_root_info[PCI_ROOT_NR]; +int found_all_numa_early; + +void x86_pci_root_bus_res_quirks(struct pci_bus *b) +{ + int i; + int j; + struct pci_root_info *info; + + /* don't go for it if _CRS is used already */ + if (b->resource[0] != &ioport_resource || + b->resource[1] != &iomem_resource) + return; + + if (!pci_root_num) + return; + + /* for amd, if only one root bus, don't need to do anything */ + if (pci_root_num < 2 && found_all_numa_early) + return; + + for (i = 0; i < pci_root_num; i++) { + if (pci_root_info[i].bus_min == b->number) + break; + } + + if (i == pci_root_num) + return; + + printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n", + b->number); + + info = &pci_root_info[i]; + for (j = 0; j < info->res_num; j++) { + struct resource *res; + struct resource *root; + + res = &info->res[j]; + b->resource[j] = res; + if (res->flags & IORESOURCE_IO) + root = &ioport_resource; + else + root = &iomem_resource; + insert_resource(root, res); + } +} + +void __init update_res(struct pci_root_info *info, size_t start, + size_t end, unsigned long flags, int merge) +{ + int i; + struct resource *res; + + if (start > end) + return; + + if (!merge) + goto addit; + + /* try to merge it with old one */ + for (i = 0; i < info->res_num; i++) { + size_t final_start, final_end; + size_t common_start, common_end; + + res = &info->res[i]; + if (res->flags != flags) + continue; + + common_start = max((size_t)res->start, start); + common_end = min((size_t)res->end, end); + if (common_start > common_end + 1) + continue; + + final_start = min((size_t)res->start, start); + final_end = max((size_t)res->end, end); + + res->start = final_start; + res->end = final_end; + return; + } + +addit: + + /* need to add that */ + if (info->res_num >= RES_NUM) + return; + + res = &info->res[info->res_num]; + res->name = info->name; + res->flags = flags; + res->start = start; + res->end = end; + res->child = NULL; + info->res_num++; +} diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h new file mode 100644 index 00000000000..adbc23fe82a --- /dev/null +++ b/arch/x86/pci/bus_numa.h @@ -0,0 +1,27 @@ +#ifdef CONFIG_X86_64 + +/* + * sub bus (transparent) will use entres from 3 to store extra from + * root, so need to make sure we have enough slot there, Should we + * increase PCI_BUS_NUM_RESOURCES? + */ +#define RES_NUM 16 +struct pci_root_info { + char name[12]; + unsigned int res_num; + struct resource res[RES_NUM]; + int bus_min; + int bus_max; + int node; + int link; +}; + +/* 4 at this time, it may become to 32 */ +#define PCI_ROOT_NR 4 +extern int pci_root_num; +extern struct pci_root_info pci_root_info[PCI_ROOT_NR]; +extern int found_all_numa_early; + +extern void update_res(struct pci_root_info *info, size_t start, + size_t end, unsigned long flags, int merge); +#endif diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 1331fcf2614..d2552c68e94 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -410,8 +410,6 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum) return bus; } -extern u8 pci_cache_line_size; - int __init pcibios_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; @@ -422,15 +420,19 @@ int __init pcibios_init(void) } /* - * Assume PCI cacheline size of 32 bytes for all x86s except K7/K8 - * and P4. It's also good for 386/486s (which actually have 16) + * Set PCI cacheline size to that of the CPU if the CPU has reported it. + * (For older CPUs that don't support cpuid, we se it to 32 bytes + * It's also good for 386/486s (which actually have 16) * as quite a few PCI devices do not support smaller values. */ - pci_cache_line_size = 32 >> 2; - if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD) - pci_cache_line_size = 64 >> 2; /* K7 & K8 */ - else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL) - pci_cache_line_size = 128 >> 2; /* P4 */ + if (c->x86_clflush_size > 0) { + pci_dfl_cache_line_size = c->x86_clflush_size >> 2; + printk(KERN_DEBUG "PCI: pci_cache_line_size set to %d bytes\n", + pci_dfl_cache_line_size << 2); + } else { + pci_dfl_cache_line_size = 32 >> 2; + printk(KERN_DEBUG "PCI: Unknown cacheline size. Setting to 32 bytes\n"); + } pcibios_resource_survey(); diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c index aaf26ae58cd..d1067d539be 100644 --- a/arch/x86/pci/early.c +++ b/arch/x86/pci/early.c @@ -12,8 +12,6 @@ u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset) u32 v; outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); v = inl(0xcfc); - if (v != 0xffffffff) - pr_debug("%x reading 4 from %x: %x\n", slot, offset, v); return v; } @@ -22,7 +20,6 @@ u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset) u8 v; outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); v = inb(0xcfc + (offset&3)); - pr_debug("%x reading 1 from %x: %x\n", slot, offset, v); return v; } @@ -31,28 +28,24 @@ u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset) u16 v; outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); v = inw(0xcfc + (offset&2)); - pr_debug("%x reading 2 from %x: %x\n", slot, offset, v); return v; } void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, u32 val) { - pr_debug("%x writing to %x: %x\n", slot, offset, val); outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); outl(val, 0xcfc); } void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val) { - pr_debug("%x writing to %x: %x\n", slot, offset, val); outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); outb(val, 0xcfc + (offset&3)); } void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val) { - pr_debug("%x writing to %x: %x\n", slot, offset, val); outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); outw(val, 0xcfc + (offset&2)); } diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index b22d13b0c71..5dc9e8c63fc 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -129,7 +129,9 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) continue; if (!r->start || pci_claim_resource(dev, idx) < 0) { - dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); + dev_info(&dev->dev, + "can't reserve window %pR\n", + r); /* * Something is wrong with the region. * Invalidate the resource to prevent @@ -144,16 +146,29 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) } } +struct pci_check_idx_range { + int start; + int end; +}; + static void __init pcibios_allocate_resources(int pass) { struct pci_dev *dev = NULL; - int idx, disabled; + int idx, disabled, i; u16 command; struct resource *r; + struct pci_check_idx_range idx_range[] = { + { PCI_STD_RESOURCES, PCI_STD_RESOURCE_END }, +#ifdef CONFIG_PCI_IOV + { PCI_IOV_RESOURCES, PCI_IOV_RESOURCE_END }, +#endif + }; + for_each_pci_dev(dev) { pci_read_config_word(dev, PCI_COMMAND, &command); - for (idx = 0; idx < PCI_ROM_RESOURCE; idx++) { + for (i = 0; i < ARRAY_SIZE(idx_range); i++) + for (idx = idx_range[i].start; idx <= idx_range[i].end; idx++) { r = &dev->resource[idx]; if (r->parent) /* Already allocated */ continue; @@ -164,12 +179,12 @@ static void __init pcibios_allocate_resources(int pass) else disabled = !(command & PCI_COMMAND_MEMORY); if (pass == disabled) { - dev_dbg(&dev->dev, "resource %#08llx-%#08llx (f=%lx, d=%d, p=%d)\n", - (unsigned long long) r->start, - (unsigned long long) r->end, - r->flags, disabled, pass); + dev_dbg(&dev->dev, + "BAR %d: reserving %pr (d=%d, p=%d)\n", + idx, r, disabled, pass); if (pci_claim_resource(dev, idx) < 0) { - dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); + dev_info(&dev->dev, + "can't reserve %pR\n", r); /* We'll assign a new address later */ r->end -= r->start; r->start = 0; @@ -182,7 +197,7 @@ static void __init pcibios_allocate_resources(int pass) /* Turn the ROM off, leave the resource region, * but keep it unregistered. */ u32 reg; - dev_dbg(&dev->dev, "disabling ROM\n"); + dev_dbg(&dev->dev, "disabling ROM %pR\n", r); r->flags &= ~IORESOURCE_ROM_ENABLE; pci_read_config_dword(dev, dev->rom_base_reg, ®); @@ -282,6 +297,15 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, return -EINVAL; prot = pgprot_val(vma->vm_page_prot); + + /* + * Return error if pat is not enabled and write_combine is requested. + * Caller can followup with UC MINUS request and add a WC mtrr if there + * is a free mtrr slot. + */ + if (!pat_enabled && write_combine) + return -EINVAL; + if (pat_enabled && write_combine) prot |= _PAGE_CACHE_WC; else if (pat_enabled || boot_cpu_data.x86 > 3) diff --git a/arch/x86/pci/intel_bus.c b/arch/x86/pci/intel_bus.c new file mode 100644 index 00000000000..b7a55dc55d1 --- /dev/null +++ b/arch/x86/pci/intel_bus.c @@ -0,0 +1,90 @@ +/* + * to read io range from IOH pci conf, need to do it after mmconfig is there + */ + +#include <linux/delay.h> +#include <linux/dmi.h> +#include <linux/pci.h> +#include <linux/init.h> +#include <asm/pci_x86.h> + +#include "bus_numa.h" + +static inline void print_ioh_resources(struct pci_root_info *info) +{ + int res_num; + int busnum; + int i; + + printk(KERN_DEBUG "IOH bus: [%02x, %02x]\n", + info->bus_min, info->bus_max); + res_num = info->res_num; + busnum = info->bus_min; + for (i = 0; i < res_num; i++) { + struct resource *res; + + res = &info->res[i]; + printk(KERN_DEBUG "IOH bus: %02x index %x %s: [%llx, %llx]\n", + busnum, i, + (res->flags & IORESOURCE_IO) ? "io port" : + "mmio", + res->start, res->end); + } +} + +#define IOH_LIO 0x108 +#define IOH_LMMIOL 0x10c +#define IOH_LMMIOH 0x110 +#define IOH_LMMIOH_BASEU 0x114 +#define IOH_LMMIOH_LIMITU 0x118 +#define IOH_LCFGBUS 0x11c + +static void __devinit pci_root_bus_res(struct pci_dev *dev) +{ + u16 word; + u32 dword; + struct pci_root_info *info; + u16 io_base, io_end; + u32 mmiol_base, mmiol_end; + u64 mmioh_base, mmioh_end; + int bus_base, bus_end; + + if (pci_root_num >= PCI_ROOT_NR) { + printk(KERN_DEBUG "intel_bus.c: PCI_ROOT_NR is too small\n"); + return; + } + + info = &pci_root_info[pci_root_num]; + pci_root_num++; + + pci_read_config_word(dev, IOH_LCFGBUS, &word); + bus_base = (word & 0xff); + bus_end = (word & 0xff00) >> 8; + sprintf(info->name, "PCI Bus #%02x", bus_base); + info->bus_min = bus_base; + info->bus_max = bus_end; + + pci_read_config_word(dev, IOH_LIO, &word); + io_base = (word & 0xf0) << (12 - 4); + io_end = (word & 0xf000) | 0xfff; + update_res(info, io_base, io_end, IORESOURCE_IO, 0); + + pci_read_config_dword(dev, IOH_LMMIOL, &dword); + mmiol_base = (dword & 0xff00) << (24 - 8); + mmiol_end = (dword & 0xff000000) | 0xffffff; + update_res(info, mmiol_base, mmiol_end, IORESOURCE_MEM, 0); + + pci_read_config_dword(dev, IOH_LMMIOH, &dword); + mmioh_base = ((u64)(dword & 0xfc00)) << (26 - 10); + mmioh_end = ((u64)(dword & 0xfc000000) | 0x3ffffff); + pci_read_config_dword(dev, IOH_LMMIOH_BASEU, &dword); + mmioh_base |= ((u64)(dword & 0x7ffff)) << 32; + pci_read_config_dword(dev, IOH_LMMIOH_LIMITU, &dword); + mmioh_end |= ((u64)(dword & 0x7ffff)) << 32; + update_res(info, mmioh_base, mmioh_end, IORESOURCE_MEM, 0); + + print_ioh_resources(info); +} + +/* intel IOH */ +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x342e, pci_root_bus_res); diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 602c172d3bd..b19d1e54201 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -15,48 +15,98 @@ #include <linux/acpi.h> #include <linux/sfi_acpi.h> #include <linux/bitmap.h> -#include <linux/sort.h> +#include <linux/dmi.h> #include <asm/e820.h> #include <asm/pci_x86.h> #include <asm/acpi.h> #define PREFIX "PCI: " -/* aperture is up to 256MB but BIOS may reserve less */ -#define MMCONFIG_APER_MIN (2 * 1024*1024) -#define MMCONFIG_APER_MAX (256 * 1024*1024) - /* Indicate if the mmcfg resources have been placed into the resource table. */ static int __initdata pci_mmcfg_resources_inserted; -static __init int extend_mmcfg(int num) +LIST_HEAD(pci_mmcfg_list); + +static __init void pci_mmconfig_remove(struct pci_mmcfg_region *cfg) { - struct acpi_mcfg_allocation *new; - int new_num = pci_mmcfg_config_num + num; + if (cfg->res.parent) + release_resource(&cfg->res); + list_del(&cfg->list); + kfree(cfg); +} - new = kzalloc(sizeof(pci_mmcfg_config[0]) * new_num, GFP_KERNEL); - if (!new) - return -1; +static __init void free_all_mmcfg(void) +{ + struct pci_mmcfg_region *cfg, *tmp; - if (pci_mmcfg_config) { - memcpy(new, pci_mmcfg_config, - sizeof(pci_mmcfg_config[0]) * new_num); - kfree(pci_mmcfg_config); + pci_mmcfg_arch_free(); + list_for_each_entry_safe(cfg, tmp, &pci_mmcfg_list, list) + pci_mmconfig_remove(cfg); +} + +static __init void list_add_sorted(struct pci_mmcfg_region *new) +{ + struct pci_mmcfg_region *cfg; + + /* keep list sorted by segment and starting bus number */ + list_for_each_entry(cfg, &pci_mmcfg_list, list) { + if (cfg->segment > new->segment || + (cfg->segment == new->segment && + cfg->start_bus >= new->start_bus)) { + list_add_tail(&new->list, &cfg->list); + return; + } } - pci_mmcfg_config = new; + list_add_tail(&new->list, &pci_mmcfg_list); +} - return 0; +static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, + int end, u64 addr) +{ + struct pci_mmcfg_region *new; + int num_buses; + struct resource *res; + + if (addr == 0) + return NULL; + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + new->address = addr; + new->segment = segment; + new->start_bus = start; + new->end_bus = end; + + list_add_sorted(new); + + num_buses = end - start + 1; + res = &new->res; + res->start = addr + PCI_MMCFG_BUS_OFFSET(start); + res->end = addr + PCI_MMCFG_BUS_OFFSET(num_buses) - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN, + "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end); + res->name = new->name; + + printk(KERN_INFO PREFIX "MMCONFIG for domain %04x [bus %02x-%02x] at " + "%pR (base %#lx)\n", segment, start, end, &new->res, + (unsigned long) addr); + + return new; } -static __init void fill_one_mmcfg(u64 addr, int segment, int start, int end) +struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus) { - int i = pci_mmcfg_config_num; + struct pci_mmcfg_region *cfg; - pci_mmcfg_config_num++; - pci_mmcfg_config[i].address = addr; - pci_mmcfg_config[i].pci_segment = segment; - pci_mmcfg_config[i].start_bus_number = start; - pci_mmcfg_config[i].end_bus_number = end; + list_for_each_entry(cfg, &pci_mmcfg_list, list) + if (cfg->segment == segment && + cfg->start_bus <= bus && bus <= cfg->end_bus) + return cfg; + + return NULL; } static const char __init *pci_mmcfg_e7520(void) @@ -68,11 +118,9 @@ static const char __init *pci_mmcfg_e7520(void) if (win == 0x0000 || win == 0xf000) return NULL; - if (extend_mmcfg(1) == -1) + if (pci_mmconfig_add(0, 0, 255, win << 16) == NULL) return NULL; - fill_one_mmcfg(win << 16, 0, 0, 255); - return "Intel Corporation E7520 Memory Controller Hub"; } @@ -114,11 +162,9 @@ static const char __init *pci_mmcfg_intel_945(void) if ((pciexbar & mask) >= 0xf0000000U) return NULL; - if (extend_mmcfg(1) == -1) + if (pci_mmconfig_add(0, 0, (len >> 20) - 1, pciexbar & mask) == NULL) return NULL; - fill_one_mmcfg(pciexbar & mask, 0, 0, (len >> 20) - 1); - return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub"; } @@ -127,7 +173,7 @@ static const char __init *pci_mmcfg_amd_fam10h(void) u32 low, high, address; u64 base, msr; int i; - unsigned segnbits = 0, busnbits; + unsigned segnbits = 0, busnbits, end_bus; if (!(pci_probe & PCI_CHECK_ENABLE_AMD_MMCONF)) return NULL; @@ -161,11 +207,13 @@ static const char __init *pci_mmcfg_amd_fam10h(void) busnbits = 8; } - if (extend_mmcfg(1 << segnbits) == -1) - return NULL; - + end_bus = (1 << busnbits) - 1; for (i = 0; i < (1 << segnbits); i++) - fill_one_mmcfg(base + (1<<28) * i, i, 0, (1 << busnbits) - 1); + if (pci_mmconfig_add(i, 0, end_bus, + base + (1<<28) * i) == NULL) { + free_all_mmcfg(); + return NULL; + } return "AMD Family 10h NB"; } @@ -190,7 +238,7 @@ static const char __init *pci_mmcfg_nvidia_mcp55(void) /* * do check if amd fam10h already took over */ - if (!acpi_disabled || pci_mmcfg_config_num || mcp55_checked) + if (!acpi_disabled || !list_empty(&pci_mmcfg_list) || mcp55_checked) return NULL; mcp55_checked = true; @@ -213,16 +261,14 @@ static const char __init *pci_mmcfg_nvidia_mcp55(void) if (!(extcfg & extcfg_enable_mask)) continue; - if (extend_mmcfg(1) == -1) - continue; - size_index = (extcfg & extcfg_size_mask) >> extcfg_size_shift; base = extcfg & extcfg_base_mask[size_index]; /* base could > 4G */ base <<= extcfg_base_lshift; start = (extcfg & extcfg_start_mask) >> extcfg_start_shift; end = start + extcfg_sizebus[size_index] - 1; - fill_one_mmcfg(base, 0, start, end); + if (pci_mmconfig_add(0, start, end, base) == NULL) + continue; mcp55_mmconf_found++; } @@ -253,45 +299,27 @@ static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = { 0x0369, pci_mmcfg_nvidia_mcp55 }, }; -static int __init cmp_mmcfg(const void *x1, const void *x2) -{ - const typeof(pci_mmcfg_config[0]) *m1 = x1; - const typeof(pci_mmcfg_config[0]) *m2 = x2; - int start1, start2; - - start1 = m1->start_bus_number; - start2 = m2->start_bus_number; - - return start1 - start2; -} - static void __init pci_mmcfg_check_end_bus_number(void) { - int i; - typeof(pci_mmcfg_config[0]) *cfg, *cfgx; - - /* sort them at first */ - sort(pci_mmcfg_config, pci_mmcfg_config_num, - sizeof(pci_mmcfg_config[0]), cmp_mmcfg, NULL); + struct pci_mmcfg_region *cfg, *cfgx; /* last one*/ - if (pci_mmcfg_config_num > 0) { - i = pci_mmcfg_config_num - 1; - cfg = &pci_mmcfg_config[i]; - if (cfg->end_bus_number < cfg->start_bus_number) - cfg->end_bus_number = 255; - } + cfg = list_entry(pci_mmcfg_list.prev, typeof(*cfg), list); + if (cfg) + if (cfg->end_bus < cfg->start_bus) + cfg->end_bus = 255; - /* don't overlap please */ - for (i = 0; i < pci_mmcfg_config_num - 1; i++) { - cfg = &pci_mmcfg_config[i]; - cfgx = &pci_mmcfg_config[i+1]; + if (list_is_singular(&pci_mmcfg_list)) + return; - if (cfg->end_bus_number < cfg->start_bus_number) - cfg->end_bus_number = 255; + /* don't overlap please */ + list_for_each_entry(cfg, &pci_mmcfg_list, list) { + if (cfg->end_bus < cfg->start_bus) + cfg->end_bus = 255; - if (cfg->end_bus_number >= cfgx->start_bus_number) - cfg->end_bus_number = cfgx->start_bus_number - 1; + cfgx = list_entry(cfg->list.next, typeof(*cfg), list); + if (cfg != cfgx && cfg->end_bus >= cfgx->start_bus) + cfg->end_bus = cfgx->start_bus - 1; } } @@ -306,8 +334,7 @@ static int __init pci_mmcfg_check_hostbridge(void) if (!raw_pci_ops) return 0; - pci_mmcfg_config_num = 0; - pci_mmcfg_config = NULL; + free_all_mmcfg(); for (i = 0; i < ARRAY_SIZE(pci_mmcfg_probes); i++) { bus = pci_mmcfg_probes[i].bus; @@ -322,45 +349,22 @@ static int __init pci_mmcfg_check_hostbridge(void) name = pci_mmcfg_probes[i].probe(); if (name) - printk(KERN_INFO "PCI: Found %s with MMCONFIG support.\n", + printk(KERN_INFO PREFIX "%s with MMCONFIG support\n", name); } /* some end_bus_number is crazy, fix it */ pci_mmcfg_check_end_bus_number(); - return pci_mmcfg_config_num != 0; + return !list_empty(&pci_mmcfg_list); } static void __init pci_mmcfg_insert_resources(void) { -#define PCI_MMCFG_RESOURCE_NAME_LEN 24 - int i; - struct resource *res; - char *names; - unsigned num_buses; - - res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res), - pci_mmcfg_config_num, GFP_KERNEL); - if (!res) { - printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n"); - return; - } + struct pci_mmcfg_region *cfg; - names = (void *)&res[pci_mmcfg_config_num]; - for (i = 0; i < pci_mmcfg_config_num; i++, res++) { - struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[i]; - num_buses = cfg->end_bus_number - cfg->start_bus_number + 1; - res->name = names; - snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, - "PCI MMCONFIG %u [%02x-%02x]", cfg->pci_segment, - cfg->start_bus_number, cfg->end_bus_number); - res->start = cfg->address + (cfg->start_bus_number << 20); - res->end = res->start + (num_buses << 20) - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - insert_resource(&iomem_resource, res); - names += PCI_MMCFG_RESOURCE_NAME_LEN; - } + list_for_each_entry(cfg, &pci_mmcfg_list, list) + insert_resource(&iomem_resource, &cfg->res); /* Mark that the resources have been inserted. */ pci_mmcfg_resources_inserted = 1; @@ -437,11 +441,12 @@ static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used) typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type); static int __init is_mmconf_reserved(check_reserved_t is_reserved, - u64 addr, u64 size, int i, - typeof(pci_mmcfg_config[0]) *cfg, int with_e820) + struct pci_mmcfg_region *cfg, int with_e820) { + u64 addr = cfg->res.start; + u64 size = resource_size(&cfg->res); u64 old_size = size; - int valid = 0; + int valid = 0, num_buses; while (!is_reserved(addr, addr + size, E820_RESERVED)) { size >>= 1; @@ -450,19 +455,25 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved, } if (size >= (16UL<<20) || size == old_size) { - printk(KERN_NOTICE - "PCI: MCFG area at %Lx reserved in %s\n", - addr, with_e820?"E820":"ACPI motherboard resources"); + printk(KERN_INFO PREFIX "MMCONFIG at %pR reserved in %s\n", + &cfg->res, + with_e820 ? "E820" : "ACPI motherboard resources"); valid = 1; if (old_size != size) { - /* update end_bus_number */ - cfg->end_bus_number = cfg->start_bus_number + ((size>>20) - 1); - printk(KERN_NOTICE "PCI: updated MCFG configuration %d: base %lx " - "segment %hu buses %u - %u\n", - i, (unsigned long)cfg->address, cfg->pci_segment, - (unsigned int)cfg->start_bus_number, - (unsigned int)cfg->end_bus_number); + /* update end_bus */ + cfg->end_bus = cfg->start_bus + ((size>>20) - 1); + num_buses = cfg->end_bus - cfg->start_bus + 1; + cfg->res.end = cfg->res.start + + PCI_MMCFG_BUS_OFFSET(num_buses) - 1; + snprintf(cfg->name, PCI_MMCFG_RESOURCE_NAME_LEN, + "PCI MMCONFIG %04x [bus %02x-%02x]", + cfg->segment, cfg->start_bus, cfg->end_bus); + printk(KERN_INFO PREFIX + "MMCONFIG for %04x [bus%02x-%02x] " + "at %pR (base %#lx) (size reduced!)\n", + cfg->segment, cfg->start_bus, cfg->end_bus, + &cfg->res, (unsigned long) cfg->address); } } @@ -471,45 +482,26 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved, static void __init pci_mmcfg_reject_broken(int early) { - typeof(pci_mmcfg_config[0]) *cfg; - int i; + struct pci_mmcfg_region *cfg; - if ((pci_mmcfg_config_num == 0) || - (pci_mmcfg_config == NULL) || - (pci_mmcfg_config[0].address == 0)) - return; - - for (i = 0; i < pci_mmcfg_config_num; i++) { + list_for_each_entry(cfg, &pci_mmcfg_list, list) { int valid = 0; - u64 addr, size; - - cfg = &pci_mmcfg_config[i]; - addr = cfg->start_bus_number; - addr <<= 20; - addr += cfg->address; - size = cfg->end_bus_number + 1 - cfg->start_bus_number; - size <<= 20; - printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx " - "segment %hu buses %u - %u\n", - i, (unsigned long)cfg->address, cfg->pci_segment, - (unsigned int)cfg->start_bus_number, - (unsigned int)cfg->end_bus_number); if (!early && !acpi_disabled) - valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0); + valid = is_mmconf_reserved(is_acpi_reserved, cfg, 0); if (valid) continue; if (!early) - printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not" - " reserved in ACPI motherboard resources\n", - cfg->address); + printk(KERN_ERR FW_BUG PREFIX + "MMCONFIG at %pR not reserved in " + "ACPI motherboard resources\n", &cfg->res); /* Don't try to do this check unless configuration type 1 is available. how about type 2 ?*/ if (raw_pci_ops) - valid = is_mmconf_reserved(e820_all_mapped, addr, size, i, cfg, 1); + valid = is_mmconf_reserved(e820_all_mapped, cfg, 1); if (!valid) goto reject; @@ -518,34 +510,41 @@ static void __init pci_mmcfg_reject_broken(int early) return; reject: - printk(KERN_INFO "PCI: Not using MMCONFIG.\n"); - pci_mmcfg_arch_free(); - kfree(pci_mmcfg_config); - pci_mmcfg_config = NULL; - pci_mmcfg_config_num = 0; + printk(KERN_INFO PREFIX "not using MMCONFIG\n"); + free_all_mmcfg(); } static int __initdata known_bridge; -static int acpi_mcfg_64bit_base_addr __initdata = FALSE; +static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg, + struct acpi_mcfg_allocation *cfg) +{ + int year; -/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ -struct acpi_mcfg_allocation *pci_mmcfg_config; -int pci_mmcfg_config_num; + if (cfg->address < 0xFFFFFFFF) + return 0; -static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg) -{ if (!strcmp(mcfg->header.oem_id, "SGI")) - acpi_mcfg_64bit_base_addr = TRUE; + return 0; - return 0; + if (mcfg->header.revision >= 1) { + if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && + year >= 2010) + return 0; + } + + printk(KERN_ERR PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx " + "is above 4GB, ignored\n", cfg->pci_segment, + cfg->start_bus_number, cfg->end_bus_number, cfg->address); + return -EINVAL; } static int __init pci_parse_mcfg(struct acpi_table_header *header) { struct acpi_table_mcfg *mcfg; + struct acpi_mcfg_allocation *cfg_table, *cfg; unsigned long i; - int config_size; + int entries; if (!header) return -EINVAL; @@ -553,38 +552,33 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header) mcfg = (struct acpi_table_mcfg *)header; /* how many config structures do we have */ - pci_mmcfg_config_num = 0; + free_all_mmcfg(); + entries = 0; i = header->length - sizeof(struct acpi_table_mcfg); while (i >= sizeof(struct acpi_mcfg_allocation)) { - ++pci_mmcfg_config_num; + entries++; i -= sizeof(struct acpi_mcfg_allocation); }; - if (pci_mmcfg_config_num == 0) { + if (entries == 0) { printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); return -ENODEV; } - config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config); - pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL); - if (!pci_mmcfg_config) { - printk(KERN_WARNING PREFIX - "No memory for MCFG config tables\n"); - return -ENOMEM; - } - - memcpy(pci_mmcfg_config, &mcfg[1], config_size); - - acpi_mcfg_oem_check(mcfg); - - for (i = 0; i < pci_mmcfg_config_num; ++i) { - if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) && - !acpi_mcfg_64bit_base_addr) { - printk(KERN_ERR PREFIX - "MMCONFIG not in low 4GB of memory\n"); - kfree(pci_mmcfg_config); - pci_mmcfg_config_num = 0; + cfg_table = (struct acpi_mcfg_allocation *) &mcfg[1]; + for (i = 0; i < entries; i++) { + cfg = &cfg_table[i]; + if (acpi_mcfg_check_entry(mcfg, cfg)) { + free_all_mmcfg(); return -ENODEV; } + + if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number, + cfg->end_bus_number, cfg->address) == NULL) { + printk(KERN_WARNING PREFIX + "no memory for MCFG entries\n"); + free_all_mmcfg(); + return -ENOMEM; + } } return 0; @@ -614,9 +608,7 @@ static void __init __pci_mmcfg_init(int early) pci_mmcfg_reject_broken(early); - if ((pci_mmcfg_config_num == 0) || - (pci_mmcfg_config == NULL) || - (pci_mmcfg_config[0].address == 0)) + if (list_empty(&pci_mmcfg_list)) return; if (pci_mmcfg_arch_init()) @@ -648,9 +640,7 @@ static int __init pci_mmcfg_late_insert_resources(void) */ if ((pci_mmcfg_resources_inserted == 1) || (pci_probe & PCI_PROBE_MMCONF) == 0 || - (pci_mmcfg_config_num == 0) || - (pci_mmcfg_config == NULL) || - (pci_mmcfg_config[0].address == 0)) + list_empty(&pci_mmcfg_list)) return 1; /* diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index f10a7e94a84..90d5fd476ed 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -27,18 +27,10 @@ static int mmcfg_last_accessed_cpu; */ static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn) { - struct acpi_mcfg_allocation *cfg; - int cfg_num; - - for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) { - cfg = &pci_mmcfg_config[cfg_num]; - if (cfg->pci_segment == seg && - (cfg->start_bus_number <= bus) && - (cfg->end_bus_number >= bus)) - return cfg->address; - } + struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus); - /* Fall back to type 0 */ + if (cfg) + return cfg->address; return 0; } @@ -47,7 +39,7 @@ static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn) */ static void pci_exp_set_dev_base(unsigned int base, int bus, int devfn) { - u32 dev_base = base | (bus << 20) | (devfn << 12); + u32 dev_base = base | PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12); int cpu = smp_processor_id(); if (dev_base != mmcfg_last_accessed_device || cpu != mmcfg_last_accessed_cpu) { diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index 94349f8b2f9..e783841bd1d 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -12,38 +12,15 @@ #include <asm/e820.h> #include <asm/pci_x86.h> -/* Static virtual mapping of the MMCONFIG aperture */ -struct mmcfg_virt { - struct acpi_mcfg_allocation *cfg; - char __iomem *virt; -}; -static struct mmcfg_virt *pci_mmcfg_virt; - -static char __iomem *get_virt(unsigned int seg, unsigned bus) -{ - struct acpi_mcfg_allocation *cfg; - int cfg_num; - - for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) { - cfg = pci_mmcfg_virt[cfg_num].cfg; - if (cfg->pci_segment == seg && - (cfg->start_bus_number <= bus) && - (cfg->end_bus_number >= bus)) - return pci_mmcfg_virt[cfg_num].virt; - } - - /* Fall back to type 0 */ - return NULL; -} +#define PREFIX "PCI: " static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) { - char __iomem *addr; + struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus); - addr = get_virt(seg, bus); - if (!addr) - return NULL; - return addr + ((bus << 20) | (devfn << 12)); + if (cfg && cfg->virt) + return cfg->virt + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12)); + return NULL; } static int pci_mmcfg_read(unsigned int seg, unsigned int bus, @@ -109,42 +86,30 @@ static struct pci_raw_ops pci_mmcfg = { .write = pci_mmcfg_write, }; -static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg) +static void __iomem * __init mcfg_ioremap(struct pci_mmcfg_region *cfg) { void __iomem *addr; u64 start, size; + int num_buses; - start = cfg->start_bus_number; - start <<= 20; - start += cfg->address; - size = cfg->end_bus_number + 1 - cfg->start_bus_number; - size <<= 20; + start = cfg->address + PCI_MMCFG_BUS_OFFSET(cfg->start_bus); + num_buses = cfg->end_bus - cfg->start_bus + 1; + size = PCI_MMCFG_BUS_OFFSET(num_buses); addr = ioremap_nocache(start, size); - if (addr) { - printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n", - start, start + size - 1); - addr -= cfg->start_bus_number << 20; - } + if (addr) + addr -= PCI_MMCFG_BUS_OFFSET(cfg->start_bus); return addr; } int __init pci_mmcfg_arch_init(void) { - int i; - pci_mmcfg_virt = kzalloc(sizeof(*pci_mmcfg_virt) * - pci_mmcfg_config_num, GFP_KERNEL); - if (pci_mmcfg_virt == NULL) { - printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n"); - return 0; - } + struct pci_mmcfg_region *cfg; - for (i = 0; i < pci_mmcfg_config_num; ++i) { - pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i]; - pci_mmcfg_virt[i].virt = mcfg_ioremap(&pci_mmcfg_config[i]); - if (!pci_mmcfg_virt[i].virt) { - printk(KERN_ERR "PCI: Cannot map mmconfig aperture for " - "segment %d\n", - pci_mmcfg_config[i].pci_segment); + list_for_each_entry(cfg, &pci_mmcfg_list, list) { + cfg->virt = mcfg_ioremap(cfg); + if (!cfg->virt) { + printk(KERN_ERR PREFIX "can't map MMCONFIG at %pR\n", + &cfg->res); pci_mmcfg_arch_free(); return 0; } @@ -155,19 +120,12 @@ int __init pci_mmcfg_arch_init(void) void __init pci_mmcfg_arch_free(void) { - int i; - - if (pci_mmcfg_virt == NULL) - return; + struct pci_mmcfg_region *cfg; - for (i = 0; i < pci_mmcfg_config_num; ++i) { - if (pci_mmcfg_virt[i].virt) { - iounmap(pci_mmcfg_virt[i].virt + (pci_mmcfg_virt[i].cfg->start_bus_number << 20)); - pci_mmcfg_virt[i].virt = NULL; - pci_mmcfg_virt[i].cfg = NULL; + list_for_each_entry(cfg, &pci_mmcfg_list, list) { + if (cfg->virt) { + iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus)); + cfg->virt = NULL; } } - - kfree(pci_mmcfg_virt); - pci_mmcfg_virt = NULL; } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index dfbf70e6586..2b26dd5930c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -27,7 +27,9 @@ #include <linux/page-flags.h> #include <linux/highmem.h> #include <linux/console.h> +#include <linux/pci.h> +#include <xen/xen.h> #include <xen/interface/xen.h> #include <xen/interface/version.h> #include <xen/interface/physdev.h> @@ -138,24 +140,23 @@ static void xen_vcpu_setup(int cpu) */ void xen_vcpu_restore(void) { - if (have_vcpu_info_placement) { - int cpu; + int cpu; - for_each_online_cpu(cpu) { - bool other_cpu = (cpu != smp_processor_id()); + for_each_online_cpu(cpu) { + bool other_cpu = (cpu != smp_processor_id()); - if (other_cpu && - HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL)) - BUG(); + if (other_cpu && + HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL)) + BUG(); - xen_vcpu_setup(cpu); + xen_setup_runstate_info(cpu); - if (other_cpu && - HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)) - BUG(); - } + if (have_vcpu_info_placement) + xen_vcpu_setup(cpu); - BUG_ON(!have_vcpu_info_placement); + if (other_cpu && + HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)) + BUG(); } } @@ -1093,10 +1094,8 @@ asmlinkage void __init xen_start_kernel(void) __supported_pte_mask |= _PAGE_IOMAP; -#ifdef CONFIG_X86_64 /* Work out if we support NX */ - check_efer(); -#endif + x86_configure_nx(); xen_setup_features(); @@ -1178,10 +1177,16 @@ asmlinkage void __init xen_start_kernel(void) add_preferred_console("xenboot", 0, NULL); add_preferred_console("tty", 0, NULL); add_preferred_console("hvc", 0, NULL); + } else { + /* Make sure ACS will be enabled */ + pci_request_acs(); } + xen_raw_console_write("about to get started...\n"); + xen_setup_runstate_info(0); + /* Start the world */ #ifdef CONFIG_X86_32 i386_start_kernel(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3bf7b1d250c..bf4cd6bfe95 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -185,7 +185,7 @@ static inline unsigned p2m_index(unsigned long pfn) } /* Build the parallel p2m_top_mfn structures */ -static void __init xen_build_mfn_list_list(void) +void xen_build_mfn_list_list(void) { unsigned pfn, idx; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index fe03eeed7b4..64757c0ba5f 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -73,7 +73,7 @@ static __cpuinit void cpu_bringup(void) xen_setup_cpu_clockevents(); - cpu_set(cpu, cpu_online_map); + set_cpu_online(cpu, true); percpu_write(cpu_state, CPU_ONLINE); wmb(); @@ -295,6 +295,7 @@ static int __cpuinit xen_cpu_up(unsigned int cpu) (unsigned long)task_stack_page(idle) - KERNEL_STACK_OFFSET + THREAD_SIZE; #endif + xen_setup_runstate_info(cpu); xen_setup_timer(cpu); xen_init_lock_cpu(cpu); diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 95be7b43472..987267f79bf 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -1,4 +1,5 @@ #include <linux/types.h> +#include <linux/clockchips.h> #include <xen/interface/xen.h> #include <xen/grant_table.h> @@ -27,6 +28,8 @@ void xen_pre_suspend(void) void xen_post_suspend(int suspend_cancelled) { + xen_build_mfn_list_list(); + xen_setup_shared_info(); if (suspend_cancelled) { @@ -44,7 +47,19 @@ void xen_post_suspend(int suspend_cancelled) } +static void xen_vcpu_notify_restore(void *data) +{ + unsigned long reason = (unsigned long)data; + + /* Boot processor notified via generic timekeeping_resume() */ + if ( smp_processor_id() == 0) + return; + + clockevents_notify(reason, NULL); +} + void xen_arch_resume(void) { - /* nothing */ + smp_call_function(xen_vcpu_notify_restore, + (void *)CLOCK_EVT_NOTIFY_RESUME, 1); } diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 0a5aa44299a..9d1f853120d 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -100,7 +100,7 @@ bool xen_vcpu_stolen(int vcpu) return per_cpu(runstate, vcpu).state == RUNSTATE_runnable; } -static void setup_runstate_info(int cpu) +void xen_setup_runstate_info(int cpu) { struct vcpu_register_runstate_memory_area area; @@ -434,7 +434,7 @@ void xen_setup_timer(int cpu) name = "<timer kasprintf failed>"; irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER, name, NULL); evt = &per_cpu(xen_clock_events, cpu); @@ -442,8 +442,6 @@ void xen_setup_timer(int cpu) evt->cpumask = cpumask_of(cpu); evt->irq = irq; - - setup_runstate_info(cpu); } void xen_teardown_timer(int cpu) @@ -494,6 +492,7 @@ __init void xen_time_init(void) setup_force_cpu_cap(X86_FEATURE_TSC); + xen_setup_runstate_info(cpu); xen_setup_timer(cpu); xen_setup_cpu_clockevents(); } diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 02f496a8dba..53adefda427 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -96,7 +96,7 @@ ENTRY(xen_sysret32) pushq $__USER32_CS pushq %rcx - pushq $VGCF_in_syscall + pushq $0 1: jmp hypercall_iret ENDPATCH(xen_sysret32) RELOC(xen_sysret32, 1b+1) @@ -151,7 +151,7 @@ ENTRY(xen_syscall32_target) ENTRY(xen_sysenter_target) lea 16(%rsp), %rsp /* strip %rcx, %r11 */ mov $-ENOSYS, %rax - pushq $VGCF_in_syscall + pushq $0 jmp hypercall_iret ENDPROC(xen_syscall32_target) ENDPROC(xen_sysenter_target) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 355fa6b99c9..f9153a300bc 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -25,6 +25,7 @@ extern struct shared_info *HYPERVISOR_shared_info; void xen_setup_mfn_list_list(void); void xen_setup_shared_info(void); +void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_ident_map_ISA(void); @@ -41,6 +42,7 @@ void __init xen_build_dynamic_phys_to_machine(void); void xen_init_irq_ops(void); void xen_setup_timer(int cpu); +void xen_setup_runstate_info(int cpu); void xen_teardown_timer(int cpu); cycle_t xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); |