diff options
Diffstat (limited to 'arch/x86')
75 files changed, 2778 insertions, 1085 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 469f3450bf8..87717f3687d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -138,6 +138,9 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y +config HAVE_DYNAMIC_PER_CPU_AREA + def_bool y + config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP @@ -780,6 +783,11 @@ config X86_MCE_AMD Additional support for AMD specific MCE features such as the DRAM Error Threshold. +config X86_MCE_THRESHOLD + depends on X86_MCE_AMD || X86_MCE_INTEL + bool + default y + config X86_MCE_NONFATAL tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" depends on X86_32 && X86_MCE @@ -1125,7 +1133,7 @@ config NODES_SHIFT Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accomodate various tables. -config HAVE_ARCH_BOOTMEM_NODE +config HAVE_ARCH_BOOTMEM def_bool y depends on X86_32 && NUMA @@ -1423,7 +1431,7 @@ config CRASH_DUMP config KEXEC_JUMP bool "kexec jump (EXPERIMENTAL)" depends on EXPERIMENTAL - depends on KEXEC && HIBERNATION && X86_32 + depends on KEXEC && HIBERNATION ---help--- Jump between original kernel and kexeced kernel and invoke code in physical address mode via KEXEC diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 4ef949c1972..394d177d721 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -379,6 +379,7 @@ static inline u32 safe_apic_wait_icr_idle(void) static inline void ack_APIC_irq(void) { +#ifdef CONFIG_X86_LOCAL_APIC /* * ack_APIC_irq() actually gets compiled as a single instruction * ... yummie. @@ -386,6 +387,7 @@ static inline void ack_APIC_irq(void) /* Docs say use 0 for future compatibility */ apic_write(APIC_EOI, 0); +#endif } static inline unsigned default_get_apic_id(unsigned long x) diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 63134e31e8b..bc9514fb3b1 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -53,6 +53,7 @@ #define APIC_ESR_SENDILL 0x00020 #define APIC_ESR_RECVILL 0x00040 #define APIC_ESR_ILLREGA 0x00080 +#define APIC_LVTCMCI 0x2f0 #define APIC_ICR 0x300 #define APIC_DEST_SELF 0x40000 #define APIC_DEST_ALLINC 0x80000 diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 2f8466540fb..5b301b7ff5f 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -5,24 +5,43 @@ #include <linux/mm.h> /* Caches aren't brain-dead on the intel. */ -#define flush_cache_all() do { } while (0) -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) -#define flush_dcache_page(page) do { } while (0) -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) -#define flush_icache_range(start, end) do { } while (0) -#define flush_icache_page(vma, pg) do { } while (0) -#define flush_icache_user_range(vma, pg, adr, len) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) +static inline void flush_cache_all(void) { } +static inline void flush_cache_mm(struct mm_struct *mm) { } +static inline void flush_cache_dup_mm(struct mm_struct *mm) { } +static inline void flush_cache_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) { } +static inline void flush_cache_page(struct vm_area_struct *vma, + unsigned long vmaddr, unsigned long pfn) { } +static inline void flush_dcache_page(struct page *page) { } +static inline void flush_dcache_mmap_lock(struct address_space *mapping) { } +static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { } +static inline void flush_icache_range(unsigned long start, + unsigned long end) { } +static inline void flush_icache_page(struct vm_area_struct *vma, + struct page *page) { } +static inline void flush_icache_user_range(struct vm_area_struct *vma, + struct page *page, + unsigned long addr, + unsigned long len) { } +static inline void flush_cache_vmap(unsigned long start, unsigned long end) { } +static inline void flush_cache_vunmap(unsigned long start, + unsigned long end) { } -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ - memcpy((dst), (src), (len)) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy((dst), (src), (len)) +static inline void copy_to_user_page(struct vm_area_struct *vma, + struct page *page, unsigned long vaddr, + void *dst, const void *src, + unsigned long len) +{ + memcpy(dst, src, len); +} + +static inline void copy_from_user_page(struct vm_area_struct *vma, + struct page *page, unsigned long vaddr, + void *dst, const void *src, + unsigned long len) +{ + memcpy(dst, src, len); +} #define PG_non_WB PG_arch_1 PAGEFLAG(NonWB, non_WB) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index ca5ffb2856b..edc90f23e70 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -37,8 +37,6 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); #else /* !CONFIG_X86_32 */ -#define MAX_EFI_IO_PAGES 100 - extern u64 efi_call0(void *fp); extern u64 efi_call1(void *fp, u64 arg1); extern u64 efi_call2(void *fp, u64 arg1, u64 arg2); diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 854d538ae85..c2e6bedaf25 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -33,6 +33,8 @@ BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7, smp_invalidate_interrupt) #endif +BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR) + /* * every pentium local APIC has two 'local interrupts', with a * soft-definable vector attached to both interrupts, one of diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index dca8f03da5b..63a79c77d22 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -24,9 +24,6 @@ #include <asm/kmap_types.h> #else #include <asm/vsyscall.h> -#ifdef CONFIG_EFI -#include <asm/efi.h> -#endif #endif /* @@ -92,13 +89,6 @@ enum fixed_addresses { FIX_IO_APIC_BASE_0, FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, #endif -#ifdef CONFIG_X86_64 -#ifdef CONFIG_EFI - FIX_EFI_IO_MAP_LAST_PAGE, - FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE - + MAX_EFI_IO_PAGES - 1, -#endif -#endif #ifdef CONFIG_X86_VISWS_APIC FIX_CO_CPU, /* Cobalt timer */ FIX_CO_APIC, /* Cobalt APIC Redirection Table */ diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 176f058e715..039db6aa8e0 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -12,6 +12,7 @@ typedef struct { unsigned int apic_timer_irqs; /* arch dependent */ unsigned int irq_spurious_count; #endif + unsigned int generic_irqs; /* arch dependent */ #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 370e1c83bb4..b762ea49bd7 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -27,6 +27,7 @@ /* Interrupt handlers registered during init_IRQ */ extern void apic_timer_interrupt(void); +extern void generic_interrupt(void); extern void error_interrupt(void); extern void spurious_interrupt(void); extern void thermal_interrupt(void); diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 48f0004db8c..71c9e518398 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -172,7 +172,13 @@ static inline void __save_init_fpu(struct task_struct *tsk) #else /* CONFIG_X86_32 */ -extern void finit(void); +#ifdef CONFIG_MATH_EMULATION +extern void finit_task(struct task_struct *tsk); +#else +static inline void finit_task(struct task_struct *tsk) +{ +} +#endif static inline void tolerant_fwait(void) { diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h new file mode 100644 index 00000000000..36fb1a6a510 --- /dev/null +++ b/arch/x86/include/asm/init.h @@ -0,0 +1,18 @@ +#ifndef _ASM_X86_INIT_32_H +#define _ASM_X86_INIT_32_H + +#ifdef CONFIG_X86_32 +extern void __init early_ioremap_page_table_range_init(void); +#endif + +extern unsigned long __init +kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask); + + +extern unsigned long __initdata e820_table_start; +extern unsigned long __meminitdata e820_table_end; +extern unsigned long __meminitdata e820_table_top; + +#endif /* _ASM_X86_INIT_32_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 683d0b4c00f..e5383e3d2f8 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -172,8 +172,6 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) extern void iounmap(volatile void __iomem *addr); -extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); - #ifdef CONFIG_X86_32 # include "io_32.h" @@ -198,7 +196,6 @@ extern void early_ioremap_reset(void); extern void __iomem *early_ioremap(unsigned long offset, unsigned long size); extern void __iomem *early_memremap(unsigned long offset, unsigned long size); extern void early_iounmap(void __iomem *addr, unsigned long size); -extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); #define IO_SPACE_LIMIT 0xffff diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 107eb219669..f38481bcd45 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -36,6 +36,7 @@ static inline int irq_canonicalize(int irq) extern void fixup_irqs(void); #endif +extern void (*generic_interrupt_extension)(void); extern void init_IRQ(void); extern void native_init_IRQ(void); extern bool handle_irq(unsigned irq, struct pt_regs *regs); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 8a285f356f8..3cbd79bbb47 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -112,6 +112,11 @@ #define LOCAL_PERF_VECTOR 0xee /* + * Generic system vector for platform specific use + */ +#define GENERIC_INTERRUPT_VECTOR 0xed + +/* * First APIC vector available to drivers: (vectors 0x30-0xee) we * start at 0x31(0x41) to spread out vectors evenly between priority * levels. (0x80 is the syscall vector) diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 0ceb6d19ed3..317ff1703d0 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -9,13 +9,13 @@ # define PAGES_NR 4 #else # define PA_CONTROL_PAGE 0 -# define PA_TABLE_PAGE 1 -# define PAGES_NR 2 +# define VA_CONTROL_PAGE 1 +# define PA_TABLE_PAGE 2 +# define PA_SWAP_PAGE 3 +# define PAGES_NR 4 #endif -#ifdef CONFIG_X86_32 # define KEXEC_CONTROL_CODE_MAX_SIZE 2048 -#endif #ifndef __ASSEMBLY__ @@ -136,10 +136,11 @@ relocate_kernel(unsigned long indirection_page, unsigned int has_pae, unsigned int preserve_context); #else -NORET_TYPE void +unsigned long relocate_kernel(unsigned long indirection_page, unsigned long page_list, - unsigned long start_address) ATTRIB_NORET; + unsigned long start_address, + unsigned int preserve_context); #endif #define ARCH_HAS_KIMAGE_ARCH diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 9320e2a8a26..a0d70b46c27 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -4,11 +4,6 @@ #undef notrace #define notrace __attribute__((no_instrument_function)) -#ifdef CONFIG_X86_64 -#define __ALIGN .p2align 4,,15 -#define __ALIGN_STR ".p2align 4,,15" -#endif - #ifdef CONFIG_X86_32 #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) /* @@ -50,16 +45,25 @@ __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \ "g" (arg4), "g" (arg5), "g" (arg6)) -#endif +#endif /* CONFIG_X86_32 */ + +#ifdef __ASSEMBLY__ #define GLOBAL(name) \ .globl name; \ name: +#ifdef CONFIG_X86_64 +#define __ALIGN .p2align 4,,15 +#define __ALIGN_STR ".p2align 4,,15" +#endif + #ifdef CONFIG_X86_ALIGNMENT_16 #define __ALIGN .align 16,0x90 #define __ALIGN_STR ".align 16,0x90" #endif +#endif /* __ASSEMBLY__ */ + #endif /* _ASM_X86_LINKAGE_H */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 32c6e17b960..563933e06a3 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -11,6 +11,8 @@ */ #define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ +#define MCG_EXT_P (1ULL<<9) /* Extended registers available */ +#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ #define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ #define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ @@ -90,14 +92,29 @@ extern int mce_disabled; #include <asm/atomic.h> +void mce_setup(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, device_mce); extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); +/* + * To support more than 128 would need to escape the predefined + * Linux defined extended banks first. + */ +#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) + #ifdef CONFIG_X86_MCE_INTEL void mce_intel_feature_init(struct cpuinfo_x86 *c); +void cmci_clear(void); +void cmci_reenable(void); +void cmci_rediscover(int dying); +void cmci_recheck(void); #else static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } +static inline void cmci_clear(void) {} +static inline void cmci_reenable(void) {} +static inline void cmci_rediscover(int dying) {} +static inline void cmci_recheck(void) {} #endif #ifdef CONFIG_X86_MCE_AMD @@ -106,11 +123,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c); static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } #endif -void mce_log_therm_throt_event(unsigned int cpu, __u64 status); +extern int mce_available(struct cpuinfo_x86 *c); + +void mce_log_therm_throt_event(__u64 status); extern atomic_t mce_entry; extern void do_machine_check(struct pt_regs *, long); + +typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); +DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); + +enum mcp_flags { + MCP_TIMESTAMP = (1 << 0), /* log time stamp */ + MCP_UC = (1 << 1), /* log uncorrected errors */ +}; +extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); + extern int mce_notify_user(void); #endif /* !CONFIG_X86_32 */ @@ -120,8 +149,8 @@ extern void mcheck_init(struct cpuinfo_x86 *c); #else #define mcheck_init(c) do { } while (0) #endif -extern void stop_mce(void); -extern void restart_mce(void); + +extern void (*mce_threshold_vector)(void); #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 105fb90a063..ede6998bd92 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -91,46 +91,9 @@ static inline int pfn_valid(int pfn) #endif /* CONFIG_DISCONTIGMEM */ #ifdef CONFIG_NEED_MULTIPLE_NODES - -/* - * Following are macros that are specific to this numa platform. - */ -#define reserve_bootmem(addr, size, flags) \ - reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags)) -#define alloc_bootmem(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_nopanic(x) \ - __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \ - __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0) -#define alloc_bootmem_pages(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_pages_nopanic(x) \ - __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \ - __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low_pages(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) -#define alloc_bootmem_node(pgdat, x) \ -({ \ - struct pglist_data __maybe_unused \ - *__alloc_bootmem_node__pgdat = (pgdat); \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \ - __pa(MAX_DMA_ADDRESS)); \ -}) -#define alloc_bootmem_pages_node(pgdat, x) \ -({ \ - struct pglist_data __maybe_unused \ - *__alloc_bootmem_node__pgdat = (pgdat); \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \ - __pa(MAX_DMA_ADDRESS)); \ -}) -#define alloc_bootmem_low_pages_node(pgdat, x) \ -({ \ - struct pglist_data __maybe_unused \ - *__alloc_bootmem_node__pgdat = (pgdat); \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \ -}) +/* always use node 0 for bootmem on this numa platform */ +#define bootmem_arch_preferred_node(__bdata, size, align, goal, limit) \ + (NODE_DATA(0)->bdata) #endif /* CONFIG_NEED_MULTIPLE_NODES */ #endif /* _ASM_X86_MMZONE_32_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 358acc59ae0..2dbd2314139 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -77,6 +77,11 @@ #define MSR_IA32_MC0_ADDR 0x00000402 #define MSR_IA32_MC0_MISC 0x00000403 +/* These are consecutive and not in the normal 4er MCE bank block */ +#define MSR_IA32_MC0_CTL2 0x00000280 +#define CMCI_EN (1ULL << 30) +#define CMCI_THRESHOLD_MASK 0xffffULL + #define MSR_P6_PERFCTR0 0x000000c1 #define MSR_P6_PERFCTR1 0x000000c2 #define MSR_P6_EVNTSEL0 0x00000186 diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 2d625da6603..826ad37006a 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -40,14 +40,8 @@ #ifndef __ASSEMBLY__ -struct pgprot; - extern int page_is_ram(unsigned long pagenr); extern int devmem_is_allowed(unsigned long pagenr); -extern void map_devmem(unsigned long pfn, unsigned long size, - struct pgprot vma_prot); -extern void unmap_devmem(unsigned long pfn, unsigned long size, - struct pgprot vma_prot); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index b0e70056838..2cd07b9422f 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -2,6 +2,7 @@ #define _ASM_X86_PAT_H #include <linux/types.h> +#include <asm/pgtable_types.h> #ifdef CONFIG_X86_PAT extern int pat_enabled; @@ -17,5 +18,9 @@ extern int free_memtype(u64 start, u64 end); extern int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flag); +extern void map_devmem(unsigned long pfn, unsigned long size, + struct pgprot vma_prot); +extern void unmap_devmem(unsigned long pfn, unsigned long size, + struct pgprot vma_prot); #endif /* _ASM_X86_PAT_H */ diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index aee103b26d0..8f1d2fbec1d 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -43,6 +43,14 @@ #else /* ...!ASSEMBLY */ #include <linux/stringify.h> +#include <asm/sections.h> + +#define __addr_to_pcpu_ptr(addr) \ + (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ + + (unsigned long)__per_cpu_start) +#define __pcpu_ptr_to_addr(ptr) \ + (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ + - (unsigned long)__per_cpu_start) #ifdef CONFIG_SMP #define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1c097a3a666..d0812e155f1 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -288,6 +288,8 @@ static inline int is_new_memtype_allowed(unsigned long flags, return 1; } +pmd_t *populate_extra_pmd(unsigned long vaddr); +pte_t *populate_extra_pte(unsigned long vaddr); #endif /* __ASSEMBLY__ */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index bd8df3b2fe0..2733fad45f9 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -25,6 +25,11 @@ * area for the same reason. ;) */ #define VMALLOC_OFFSET (8 * 1024 * 1024) + +#ifndef __ASSEMBLER__ +extern bool __vmalloc_start_set; /* set once high_memory is set */ +#endif + #define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET) #ifdef CONFIG_X86_PAE #define LAST_PKMAP 512 diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 4d258ad76a0..b8238dc8786 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -273,6 +273,7 @@ typedef struct page *pgtable_t; extern pteval_t __supported_pte_mask; extern int nx_enabled; +extern void set_nx(void); #define pgprot_writecombine pgprot_writecombine extern pgprot_t pgprot_writecombine(pgprot_t prot); diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 777327ef05c..9f4dfba33b2 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -199,6 +199,10 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); #define SCIR_CPU_ACTIVITY 0x02 /* not idle */ #define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */ +/* Loop through all installed blades */ +#define for_each_possible_blade(bid) \ + for ((bid) = 0; (bid) < uv_num_possible_blades(); (bid)++) + /* * Macros for converting between kernel virtual addresses, socket local physical * addresses, and UV global physical addresses. diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 4bd990ee43d..1a918dde46b 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -164,6 +164,7 @@ static inline pte_t __pte_ma(pteval_t x) xmaddr_t arbitrary_virt_to_machine(void *address); +unsigned long arbitrary_virt_to_mfn(void *vaddr); void make_lowmem_page_readonly(void *vaddr); void make_lowmem_page_readwrite(void *vaddr); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 95f216bbfaf..339ce35648e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -111,7 +111,7 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64 ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) - obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o + obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o obj-$(CONFIG_AUDIT) += audit_64.o diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 6907b8e85d5..4c80f155743 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -414,9 +414,17 @@ void __init alternative_instructions(void) that might execute the to be patched code. Other CPUs are not running. */ stop_nmi(); -#ifdef CONFIG_X86_MCE - stop_mce(); -#endif + + /* + * Don't stop machine check exceptions while patching. + * MCEs only happen when something got corrupted and in this + * case we must do something about the corruption. + * Ignoring it is worse than a unlikely patching race. + * Also machine checks tend to be broadcast and if one CPU + * goes into machine check the others follow quickly, so we don't + * expect a machine check to cause undue problems during to code + * patching. + */ apply_alternatives(__alt_instructions, __alt_instructions_end); @@ -456,9 +464,6 @@ void __init alternative_instructions(void) (unsigned long)__smp_locks_end); restart_nmi(); -#ifdef CONFIG_X86_MCE - restart_mce(); -#endif } /** diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f9cecdfd05c..30909a258d0 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -46,6 +46,7 @@ #include <asm/idle.h> #include <asm/mtrr.h> #include <asm/smp.h> +#include <asm/mce.h> unsigned int num_processors; @@ -842,6 +843,14 @@ void clear_local_APIC(void) apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); } #endif +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 6) { + v = apic_read(APIC_LVTCMCI); + if (!(v & APIC_LVT_MASKED)) + apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED); + } +#endif + /* * Clean APIC state for other OSs: */ @@ -1241,6 +1250,12 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_LVT1, value); preempt_enable(); + +#ifdef CONFIG_X86_MCE_INTEL + /* Recheck CMCI information after local APIC is up on CPU #0 */ + if (smp_processor_id() == 0) + cmci_recheck(); +#endif } void __cpuinit end_local_APIC_setup(void) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 25423a5b80e..f47df59016c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -5,6 +5,7 @@ #include <asm/io.h> #include <asm/processor.h> #include <asm/apic.h> +#include <asm/cpu.h> #ifdef CONFIG_X86_64 # include <asm/numa_64.h> @@ -141,6 +142,55 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) } } +static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + /* calling is from identify_secondary_cpu() ? */ + if (c->cpu_index == boot_cpu_id) + return; + + /* + * Certain Athlons might work (for various values of 'work') in SMP + * but they are not certified as MP capable. + */ + /* Athlon 660/661 is valid. */ + if ((c->x86_model == 6) && ((c->x86_mask == 0) || + (c->x86_mask == 1))) + goto valid_k7; + + /* Duron 670 is valid */ + if ((c->x86_model == 7) && (c->x86_mask == 0)) + goto valid_k7; + + /* + * Athlon 662, Duron 671, and Athlon >model 7 have capability + * bit. It's worth noting that the A5 stepping (662) of some + * Athlon XP's have the MP bit set. + * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for + * more. + */ + if (((c->x86_model == 6) && (c->x86_mask >= 2)) || + ((c->x86_model == 7) && (c->x86_mask >= 1)) || + (c->x86_model > 7)) + if (cpu_has_mp) + goto valid_k7; + + /* If we get here, not a certified SMP capable AMD system. */ + + /* + * Don't taint if we are running SMP kernel on a single non-MP + * approved Athlon + */ + WARN_ONCE(1, "WARNING: This combination of AMD" + "processors is not suitable for SMP.\n"); + if (!test_taint(TAINT_UNSAFE_SMP)) + add_taint(TAINT_UNSAFE_SMP); + +valid_k7: + ; +#endif +} + static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) { u32 l, h; @@ -175,6 +225,8 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) } set_cpu_cap(c, X86_FEATURE_K7); + + amd_k7_smp_check(c); } #endif diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 4b1c319d30c..22590cf688a 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (!data) return -ENOMEM; - data->acpi_data = percpu_ptr(acpi_perf_data, cpu); + data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); per_cpu(drv_data, cpu) = data; if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index b585e04cbc9..3178c3acd97 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -277,7 +277,6 @@ static struct cpufreq_driver p4clockmod_driver = { .name = "p4-clockmod", .owner = THIS_MODULE, .attr = p4clockmod_attr, - .hide_interface = 1, }; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 25c559ba8d5..191117f1ad5 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -13,6 +13,7 @@ #include <asm/uaccess.h> #include <asm/ds.h> #include <asm/bugs.h> +#include <asm/cpu.h> #ifdef CONFIG_X86_64 #include <asm/topology.h> @@ -110,6 +111,28 @@ static void __cpuinit trap_init_f00f_bug(void) } #endif +static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + /* calling is from identify_secondary_cpu() ? */ + if (c->cpu_index == boot_cpu_id) + return; + + /* + * Mask B, Pentium, but not Pentium MMX + */ + if (c->x86 == 5 && + c->x86_mask >= 1 && c->x86_mask <= 4 && + c->x86_model <= 3) { + /* + * Remember we have B step Pentia with bugs + */ + WARN_ONCE(1, "WARNING: SMP operation may be unreliable" + "with B stepping processors.\n"); + } +#endif +} + static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) { unsigned long lo, hi; @@ -186,6 +209,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_NUMAQ numaq_tsc_disable(); #endif + + intel_smp_check(c); } #else static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index d7d2323bbb6..b2f89829bbe 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o +obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c index dfaebce3633..3552119b091 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ b/arch/x86/kernel/cpu/mcheck/mce_32.c @@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c) } } -static unsigned long old_cr4 __initdata; - -void __init stop_mce(void) -{ - old_cr4 = read_cr4(); - clear_in_cr4(X86_CR4_MCE); -} - -void __init restart_mce(void) -{ - if (old_cr4 & X86_CR4_MCE) - set_in_cr4(X86_CR4_MCE); -} - static int __init mcheck_disable(char *str) { mce_disabled = 1; diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index fe79985ce0f..ca14604611e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -3,6 +3,8 @@ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. * Rest from unknown author(s). * 2004 Andi Kleen. Rewrote most of it. + * Copyright 2008 Intel Corporation + * Author: Andi Kleen */ #include <linux/init.h> @@ -24,6 +26,9 @@ #include <linux/ctype.h> #include <linux/kmod.h> #include <linux/kdebug.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> +#include <linux/ratelimit.h> #include <asm/processor.h> #include <asm/msr.h> #include <asm/mce.h> @@ -32,7 +37,6 @@ #include <asm/idle.h> #define MISC_MCELOG_MINOR 227 -#define NR_SYSFS_BANKS 6 atomic_t mce_entry; @@ -47,7 +51,7 @@ static int mce_dont_init; */ static int tolerant = 1; static int banks; -static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; +static u64 *bank; static unsigned long notify_user; static int rip_msr; static int mce_bootlog = -1; @@ -58,6 +62,19 @@ static char *trigger_argv[2] = { trigger, NULL }; static DECLARE_WAIT_QUEUE_HEAD(mce_wait); +/* MCA banks polled by the period polling timer for corrected events */ +DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { + [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL +}; + +/* Do initial initialization of a struct mce */ +void mce_setup(struct mce *m) +{ + memset(m, 0, sizeof(struct mce)); + m->cpu = smp_processor_id(); + rdtscll(m->tsc); +} + /* * Lockless MCE logging infrastructure. * This avoids deadlocks on printk locks without having to break locks. Also @@ -119,11 +136,11 @@ static void print_mce(struct mce *m) print_symbol("{%s}", m->ip); printk("\n"); } - printk(KERN_EMERG "TSC %Lx ", m->tsc); + printk(KERN_EMERG "TSC %llx ", m->tsc); if (m->addr) - printk("ADDR %Lx ", m->addr); + printk("ADDR %llx ", m->addr); if (m->misc) - printk("MISC %Lx ", m->misc); + printk("MISC %llx ", m->misc); printk("\n"); printk(KERN_EMERG "This is not a software problem!\n"); printk(KERN_EMERG "Run through mcelog --ascii to decode " @@ -149,8 +166,10 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start) panic(msg); } -static int mce_available(struct cpuinfo_x86 *c) +int mce_available(struct cpuinfo_x86 *c) { + if (mce_dont_init) + return 0; return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } @@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) } /* - * The actual machine check handler + * Poll for corrected events or events that happened before reset. + * Those are just logged through /dev/mcelog. + * + * This is executed in standard interrupt context. + */ +void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) +{ + struct mce m; + int i; + + mce_setup(&m); + + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + for (i = 0; i < banks; i++) { + if (!bank[i] || !test_bit(i, *b)) + continue; + + m.misc = 0; + m.addr = 0; + m.bank = i; + m.tsc = 0; + + barrier(); + rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); + if (!(m.status & MCI_STATUS_VAL)) + continue; + + /* + * Uncorrected events are handled by the exception handler + * when it is enabled. But when the exception is disabled log + * everything. + * + * TBD do the same check for MCI_STATUS_EN here? + */ + if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) + continue; + + if (m.status & MCI_STATUS_MISCV) + rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); + if (m.status & MCI_STATUS_ADDRV) + rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); + + if (!(flags & MCP_TIMESTAMP)) + m.tsc = 0; + /* + * Don't get the IP here because it's unlikely to + * have anything to do with the actual error location. + */ + + mce_log(&m); + add_taint(TAINT_MACHINE_CHECK); + + /* + * Clear state for this bank. + */ + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } + + /* + * Don't clear MCG_STATUS here because it's only defined for + * exceptions. + */ +} + +/* + * The actual machine check handler. This only handles real + * exceptions when something got corrupted coming in through int 18. + * + * This is executed in NMI context not subject to normal locking rules. This + * implies that most kernel services cannot be safely used. Don't even + * think about putting a printk in there! */ void do_machine_check(struct pt_regs * regs, long error_code) { @@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code) * error. */ int kill_it = 0; + DECLARE_BITMAP(toclear, MAX_NR_BANKS); atomic_inc(&mce_entry); - if ((regs - && notify_die(DIE_NMI, "machine check", regs, error_code, + if (notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL) == NOTIFY_STOP) - || !banks) + goto out2; + if (!banks) goto out2; - memset(&m, 0, sizeof(struct mce)); - m.cpu = smp_processor_id(); + mce_setup(&m); + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); /* if the restart IP is not valid, we're done for */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) @@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code) barrier(); for (i = 0; i < banks; i++) { - if (i < NR_SYSFS_BANKS && !bank[i]) + __clear_bit(i, toclear); + if (!bank[i]) continue; m.misc = 0; m.addr = 0; m.bank = i; - m.tsc = 0; rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); if ((m.status & MCI_STATUS_VAL) == 0) continue; + /* + * Non uncorrected errors are handled by machine_check_poll + * Leave them alone. + */ + if ((m.status & MCI_STATUS_UC) == 0) + continue; + + /* + * Set taint even when machine check was not enabled. + */ + add_taint(TAINT_MACHINE_CHECK); + + __set_bit(i, toclear); + if (m.status & MCI_STATUS_EN) { /* if PCC was set, there's no way out */ no_way_out |= !!(m.status & MCI_STATUS_PCC); @@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code) no_way_out = 1; kill_it = 1; } + } else { + /* + * Machine check event was not enabled. Clear, but + * ignore. + */ + continue; } if (m.status & MCI_STATUS_MISCV) @@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); mce_get_rip(&m, regs); - if (error_code >= 0) - rdtscll(m.tsc); - if (error_code != -2) - mce_log(&m); + mce_log(&m); /* Did this bank cause the exception? */ /* Assume that the bank with uncorrectable errors did it, @@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code) panicm = m; panicm_found = 1; } - - add_taint(TAINT_MACHINE_CHECK); } - /* Never do anything final in the polling timer */ - if (!regs) - goto out; - /* If we didn't find an uncorrectable error, pick the last one (shouldn't happen, just being safe). */ if (!panicm_found) @@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code) /* notify userspace ASAP */ set_thread_flag(TIF_MCE_NOTIFY); - out: /* the last thing we do is clear state */ - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + for (i = 0; i < banks; i++) { + if (test_bit(i, toclear)) + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } wrmsrl(MSR_IA32_MCG_STATUS, 0); out2: atomic_dec(&mce_entry); @@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code) * and historically has been the register value of the * MSR_IA32_THERMAL_STATUS (Intel) msr. */ -void mce_log_therm_throt_event(unsigned int cpu, __u64 status) +void mce_log_therm_throt_event(__u64 status) { struct mce m; - memset(&m, 0, sizeof(m)); - m.cpu = cpu; + mce_setup(&m); m.bank = MCE_THERMAL_BANK; m.status = status; - rdtscll(m.tsc); mce_log(&m); } #endif /* CONFIG_X86_MCE_INTEL */ @@ -353,18 +453,18 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status) static int check_interval = 5 * 60; /* 5 minutes */ static int next_interval; /* in jiffies */ -static void mcheck_timer(struct work_struct *work); -static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); +static void mcheck_timer(unsigned long); +static DEFINE_PER_CPU(struct timer_list, mce_timer); -static void mcheck_check_cpu(void *info) +static void mcheck_timer(unsigned long data) { - if (mce_available(¤t_cpu_data)) - do_machine_check(NULL, 0); -} + struct timer_list *t = &per_cpu(mce_timer, data); -static void mcheck_timer(struct work_struct *work) -{ - on_each_cpu(mcheck_check_cpu, NULL, 1); + WARN_ON(smp_processor_id() != data); + + if (mce_available(¤t_cpu_data)) + machine_check_poll(MCP_TIMESTAMP, + &__get_cpu_var(mce_poll_banks)); /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -377,31 +477,41 @@ static void mcheck_timer(struct work_struct *work) (int)round_jiffies_relative(check_interval*HZ)); } - schedule_delayed_work(&mcheck_work, next_interval); + t->expires = jiffies + next_interval; + add_timer(t); +} + +static void mce_do_trigger(struct work_struct *work) +{ + call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); } +static DECLARE_WORK(mce_trigger_work, mce_do_trigger); + /* - * This is only called from process context. This is where we do - * anything we need to alert userspace about new MCEs. This is called - * directly from the poller and also from entry.S and idle, thanks to - * TIF_MCE_NOTIFY. + * Notify the user(s) about new machine check events. + * Can be called from interrupt context, but not from machine check/NMI + * context. */ int mce_notify_user(void) { + /* Not more than two messages every minute */ + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); + clear_thread_flag(TIF_MCE_NOTIFY); if (test_and_clear_bit(0, ¬ify_user)) { - static unsigned long last_print; - unsigned long now = jiffies; - wake_up_interruptible(&mce_wait); - if (trigger[0]) - call_usermodehelper(trigger, trigger_argv, NULL, - UMH_NO_WAIT); - if (time_after_eq(now, last_print + (check_interval*HZ))) { - last_print = now; + /* + * There is no risk of missing notifications because + * work_pending is always cleared before the function is + * executed. + */ + if (trigger[0] && !work_pending(&mce_trigger_work)) + schedule_work(&mce_trigger_work); + + if (__ratelimit(&ratelimit)) printk(KERN_INFO "Machine check events logged\n"); - } return 1; } @@ -425,63 +535,78 @@ static struct notifier_block mce_idle_notifier = { static __init int periodic_mcheck_init(void) { - next_interval = check_interval * HZ; - if (next_interval) - schedule_delayed_work(&mcheck_work, - round_jiffies_relative(next_interval)); - idle_notifier_register(&mce_idle_notifier); - return 0; + idle_notifier_register(&mce_idle_notifier); + return 0; } __initcall(periodic_mcheck_init); - /* * Initialize Machine Checks for a CPU. */ -static void mce_init(void *dummy) +static int mce_cap_init(void) { u64 cap; - int i; + unsigned b; rdmsrl(MSR_IA32_MCG_CAP, cap); - banks = cap & 0xff; - if (banks > MCE_EXTENDED_BANK) { - banks = MCE_EXTENDED_BANK; - printk(KERN_INFO "MCE: warning: using only %d banks\n", - MCE_EXTENDED_BANK); + b = cap & 0xff; + if (b > MAX_NR_BANKS) { + printk(KERN_WARNING + "MCE: Using only %u machine check banks out of %u\n", + MAX_NR_BANKS, b); + b = MAX_NR_BANKS; } + + /* Don't support asymmetric configurations today */ + WARN_ON(banks != 0 && b != banks); + banks = b; + if (!bank) { + bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); + if (!bank) + return -ENOMEM; + memset(bank, 0xff, banks * sizeof(u64)); + } + /* Use accurate RIP reporting if available. */ if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) rip_msr = MSR_IA32_MCG_EIP; - /* Log the machine checks left over from the previous reset. - This also clears all registers */ - do_machine_check(NULL, mce_bootlog ? -1 : -2); + return 0; +} + +static void mce_init(void *dummy) +{ + u64 cap; + int i; + mce_banks_t all_banks; + + /* + * Log the machine checks left over from the previous reset. + */ + bitmap_fill(all_banks, MAX_NR_BANKS); + machine_check_poll(MCP_UC, &all_banks); set_in_cr4(X86_CR4_MCE); + rdmsrl(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { - if (i < NR_SYSFS_BANKS) - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); - else - wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL); - + wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } } /* Add per CPU specific workarounds here */ -static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) +static void mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { - if(c->x86 == 15) + if (c->x86 == 15 && banks > 4) /* disable GART TBL walk error reporting, which trips off incorrectly with the IOMMU & 3ware & Cerberus. */ - clear_bit(10, &bank[4]); + clear_bit(10, (unsigned long *)&bank[4]); if(c->x86 <= 17 && mce_bootlog < 0) /* Lots of broken BIOS around that don't clear them by default and leave crap in there. Don't log. */ @@ -504,20 +629,38 @@ static void mce_cpu_features(struct cpuinfo_x86 *c) } } +static void mce_init_timer(void) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + + /* data race harmless because everyone sets to the same value */ + if (!next_interval) + next_interval = check_interval * HZ; + if (!next_interval) + return; + setup_timer(t, mcheck_timer, smp_processor_id()); + t->expires = round_jiffies(jiffies + next_interval); + add_timer(t); +} + /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off. */ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { - mce_cpu_quirks(c); + if (!mce_available(c)) + return; - if (mce_dont_init || - !mce_available(c)) + if (mce_cap_init() < 0) { + mce_dont_init = 1; return; + } + mce_cpu_quirks(c); mce_init(NULL); mce_cpu_features(c); + mce_init_timer(); } /* @@ -573,7 +716,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, { unsigned long *cpu_tsc; static DEFINE_MUTEX(mce_read_mutex); - unsigned next; + unsigned prev, next; char __user *buf = ubuf; int i, err; @@ -592,25 +735,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, } err = 0; - for (i = 0; i < next; i++) { - unsigned long start = jiffies; - - while (!mcelog.entry[i].finished) { - if (time_after_eq(jiffies, start + 2)) { - memset(mcelog.entry + i,0, sizeof(struct mce)); - goto timeout; + prev = 0; + do { + for (i = prev; i < next; i++) { + unsigned long start = jiffies; + + while (!mcelog.entry[i].finished) { + if (time_after_eq(jiffies, start + 2)) { + memset(mcelog.entry + i, 0, + sizeof(struct mce)); + goto timeout; + } + cpu_relax(); } - cpu_relax(); + smp_rmb(); + err |= copy_to_user(buf, mcelog.entry + i, + sizeof(struct mce)); + buf += sizeof(struct mce); +timeout: + ; } - smp_rmb(); - err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); - buf += sizeof(struct mce); - timeout: - ; - } - memset(mcelog.entry, 0, next * sizeof(struct mce)); - mcelog.next = 0; + memset(mcelog.entry + prev, 0, + (next - prev) * sizeof(struct mce)); + prev = next; + next = cmpxchg(&mcelog.next, prev, 0); + } while (next != prev); synchronize_sched(); @@ -680,20 +830,6 @@ static struct miscdevice mce_log_device = { &mce_chrdev_ops, }; -static unsigned long old_cr4 __initdata; - -void __init stop_mce(void) -{ - old_cr4 = read_cr4(); - clear_in_cr4(X86_CR4_MCE); -} - -void __init restart_mce(void) -{ - if (old_cr4 & X86_CR4_MCE) - set_in_cr4(X86_CR4_MCE); -} - /* * Old style boot options parsing. Only for compatibility. */ @@ -703,8 +839,7 @@ static int __init mcheck_disable(char *str) return 1; } -/* mce=off disables machine check. Note you can re-enable it later - using sysfs. +/* mce=off disables machine check. mce=TOLERANCELEVEL (number, see above) mce=bootlog Log MCEs from before booting. Disabled by default on AMD. mce=nobootlog Don't log MCEs from before booting. */ @@ -728,6 +863,29 @@ __setup("mce=", mcheck_enable); * Sysfs support */ +/* + * Disable machine checks on suspend and shutdown. We can't really handle + * them later. + */ +static int mce_disable(void) +{ + int i; + + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + return 0; +} + +static int mce_suspend(struct sys_device *dev, pm_message_t state) +{ + return mce_disable(); +} + +static int mce_shutdown(struct sys_device *dev) +{ + return mce_disable(); +} + /* On resume clear all MCE state. Don't want to see leftovers from the BIOS. Only one CPU is active at this time, the others get readded later using CPU hotplug. */ @@ -738,20 +896,24 @@ static int mce_resume(struct sys_device *dev) return 0; } +static void mce_cpu_restart(void *data) +{ + del_timer_sync(&__get_cpu_var(mce_timer)); + if (mce_available(¤t_cpu_data)) + mce_init(NULL); + mce_init_timer(); +} + /* Reinit MCEs after user configuration changes */ static void mce_restart(void) { - if (next_interval) - cancel_delayed_work(&mcheck_work); - /* Timer race is harmless here */ - on_each_cpu(mce_init, NULL, 1); next_interval = check_interval * HZ; - if (next_interval) - schedule_delayed_work(&mcheck_work, - round_jiffies_relative(next_interval)); + on_each_cpu(mce_cpu_restart, NULL, 1); } static struct sysdev_class mce_sysclass = { + .suspend = mce_suspend, + .shutdown = mce_shutdown, .resume = mce_resume, .name = "machinecheck", }; @@ -778,16 +940,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit } \ static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); -/* - * TBD should generate these dynamically based on number of available banks. - * Have only 6 contol banks in /sysfs until then. - */ -ACCESSOR(bank0ctl,bank[0],mce_restart()) -ACCESSOR(bank1ctl,bank[1],mce_restart()) -ACCESSOR(bank2ctl,bank[2],mce_restart()) -ACCESSOR(bank3ctl,bank[3],mce_restart()) -ACCESSOR(bank4ctl,bank[4],mce_restart()) -ACCESSOR(bank5ctl,bank[5],mce_restart()) +static struct sysdev_attribute *bank_attrs; + +static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, + char *buf) +{ + u64 b = bank[attr - bank_attrs]; + return sprintf(buf, "%llx\n", b); +} + +static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, + const char *buf, size_t siz) +{ + char *end; + u64 new = simple_strtoull(buf, &end, 0); + if (end == buf) + return -EINVAL; + bank[attr - bank_attrs] = new; + mce_restart(); + return end-buf; +} static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) @@ -814,8 +986,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); ACCESSOR(check_interval,check_interval,mce_restart()) static struct sysdev_attribute *mce_attributes[] = { - &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, - &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, &attr_tolerant.attr, &attr_check_interval, &attr_trigger, NULL }; @@ -845,11 +1015,22 @@ static __cpuinit int mce_create_device(unsigned int cpu) if (err) goto error; } + for (i = 0; i < banks; i++) { + err = sysdev_create_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); + if (err) + goto error2; + } cpu_set(cpu, mce_device_initialized); return 0; +error2: + while (--i >= 0) { + sysdev_remove_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); + } error: - while (i--) { + while (--i >= 0) { sysdev_remove_file(&per_cpu(device_mce,cpu), mce_attributes[i]); } @@ -868,15 +1049,46 @@ static __cpuinit void mce_remove_device(unsigned int cpu) for (i = 0; mce_attributes[i]; i++) sysdev_remove_file(&per_cpu(device_mce,cpu), mce_attributes[i]); + for (i = 0; i < banks; i++) + sysdev_remove_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); sysdev_unregister(&per_cpu(device_mce,cpu)); cpu_clear(cpu, mce_device_initialized); } +/* Make sure there are no machine checks on offlined CPUs. */ +static void mce_disable_cpu(void *h) +{ + int i; + unsigned long action = *(unsigned long *)h; + + if (!mce_available(¤t_cpu_data)) + return; + if (!(action & CPU_TASKS_FROZEN)) + cmci_clear(); + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); +} + +static void mce_reenable_cpu(void *h) +{ + int i; + unsigned long action = *(unsigned long *)h; + + if (!mce_available(¤t_cpu_data)) + return; + if (!(action & CPU_TASKS_FROZEN)) + cmci_reenable(); + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); +} + /* Get notified when a cpu comes on/off. Be hotplug friendly. */ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; + struct timer_list *t = &per_cpu(mce_timer, cpu); switch (action) { case CPU_ONLINE: @@ -891,6 +1103,21 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, threshold_cpu_callback(action, cpu); mce_remove_device(cpu); break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + del_timer_sync(t); + smp_call_function_single(cpu, mce_disable_cpu, &action, 1); + break; + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + t->expires = round_jiffies(jiffies + next_interval); + add_timer_on(t, cpu); + smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); + break; + case CPU_POST_DEAD: + /* intentionally ignoring frozen here */ + cmci_rediscover(cpu); + break; } return NOTIFY_OK; } @@ -899,6 +1126,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = { .notifier_call = mce_cpu_callback, }; +static __init int mce_init_banks(void) +{ + int i; + + bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, + GFP_KERNEL); + if (!bank_attrs) + return -ENOMEM; + + for (i = 0; i < banks; i++) { + struct sysdev_attribute *a = &bank_attrs[i]; + a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); + if (!a->attr.name) + goto nomem; + a->attr.mode = 0644; + a->show = show_bank; + a->store = set_bank; + } + return 0; + +nomem: + while (--i >= 0) + kfree(bank_attrs[i].attr.name); + kfree(bank_attrs); + bank_attrs = NULL; + return -ENOMEM; +} + static __init int mce_init_device(void) { int err; @@ -906,6 +1161,11 @@ static __init int mce_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; + + err = mce_init_banks(); + if (err) + return err; + err = sysdev_class_register(&mce_sysclass); if (err) return err; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 9817506dd46..c5a32f92d07 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = { static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ +static void amd_threshold_interrupt(void); + /* * CPU Initialization */ @@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) tr.reset = 0; tr.old_limit = 0; threshold_restart_bank(&tr); + + mce_threshold_vector = amd_threshold_interrupt; } } } @@ -187,19 +191,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) * the interrupt goes off when error_count reaches threshold_limit. * the handler will simply log mcelog w/ software defined bank number. */ -asmlinkage void mce_threshold_interrupt(void) +static void amd_threshold_interrupt(void) { unsigned int bank, block; struct mce m; u32 low = 0, high = 0, address = 0; - ack_APIC_irq(); - exit_idle(); - irq_enter(); - - memset(&m, 0, sizeof(m)); - rdtscll(m.tsc); - m.cpu = smp_processor_id(); + mce_setup(&m); /* assume first bank caused it */ for (bank = 0; bank < NR_BANKS; ++bank) { @@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt(void) /* Log the machine check that caused the threshold event. */ - do_machine_check(NULL, 0); + machine_check_poll(MCP_TIMESTAMP, + &__get_cpu_var(mce_poll_banks)); if (high & MASK_OVERFLOW_HI) { rdmsrl(address, m.misc); @@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt(void) + bank * NR_BLOCKS + block; mce_log(&m); - goto out; + return; } } } -out: - inc_irq_stat(irq_threshold_count); - irq_exit(); } /* diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index aa5e287c98e..aaa7d973093 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -1,6 +1,8 @@ /* * Intel specific MCE features. * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> + * Copyright (C) 2008, 2009 Intel Corporation + * Author: Andi Kleen */ #include <linux/init.h> @@ -13,6 +15,7 @@ #include <asm/hw_irq.h> #include <asm/idle.h> #include <asm/therm_throt.h> +#include <asm/apic.h> asmlinkage void smp_thermal_interrupt(void) { @@ -25,7 +28,7 @@ asmlinkage void smp_thermal_interrupt(void) rdmsrl(MSR_IA32_THERM_STATUS, msr_val); if (therm_throt_process(msr_val & 1)) - mce_log_therm_throt_event(smp_processor_id(), msr_val); + mce_log_therm_throt_event(msr_val); inc_irq_stat(irq_thermal_count); irq_exit(); @@ -85,7 +88,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) return; } +/* + * Support for Intel Correct Machine Check Interrupts. This allows + * the CPU to raise an interrupt when a corrected machine check happened. + * Normally we pick those up using a regular polling timer. + * Also supports reliable discovery of shared banks. + */ + +static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); + +/* + * cmci_discover_lock protects against parallel discovery attempts + * which could race against each other. + */ +static DEFINE_SPINLOCK(cmci_discover_lock); + +#define CMCI_THRESHOLD 1 + +static int cmci_supported(int *banks) +{ + u64 cap; + + /* + * Vendor check is not strictly needed, but the initial + * initialization is vendor keyed and this + * makes sure none of the backdoors are entered otherwise. + */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + if (!cpu_has_apic || lapic_get_maxlvt() < 6) + return 0; + rdmsrl(MSR_IA32_MCG_CAP, cap); + *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); + return !!(cap & MCG_CMCI_P); +} + +/* + * The interrupt handler. This is called on every event. + * Just call the poller directly to log any events. + * This could in theory increase the threshold under high load, + * but doesn't for now. + */ +static void intel_threshold_interrupt(void) +{ + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); + mce_notify_user(); +} + +static void print_update(char *type, int *hdr, int num) +{ + if (*hdr == 0) + printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); + *hdr = 1; + printk(KERN_CONT " %s:%d", type, num); +} + +/* + * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks + * on this CPU. Use the algorithm recommended in the SDM to discover shared + * banks. + */ +static void cmci_discover(int banks, int boot) +{ + unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); + int hdr = 0; + int i; + + spin_lock(&cmci_discover_lock); + for (i = 0; i < banks; i++) { + u64 val; + + if (test_bit(i, owned)) + continue; + + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + + /* Already owned by someone else? */ + if (val & CMCI_EN) { + if (test_and_clear_bit(i, owned) || boot) + print_update("SHD", &hdr, i); + __clear_bit(i, __get_cpu_var(mce_poll_banks)); + continue; + } + + val |= CMCI_EN | CMCI_THRESHOLD; + wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + + /* Did the enable bit stick? -- the bank supports CMCI */ + if (val & CMCI_EN) { + if (!test_and_set_bit(i, owned) || boot) + print_update("CMCI", &hdr, i); + __clear_bit(i, __get_cpu_var(mce_poll_banks)); + } else { + WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); + } + } + spin_unlock(&cmci_discover_lock); + if (hdr) + printk(KERN_CONT "\n"); +} + +/* + * Just in case we missed an event during initialization check + * all the CMCI owned banks. + */ +void cmci_recheck(void) +{ + unsigned long flags; + int banks; + + if (!mce_available(¤t_cpu_data) || !cmci_supported(&banks)) + return; + local_irq_save(flags); + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); + local_irq_restore(flags); +} + +/* + * Disable CMCI on this CPU for all banks it owns when it goes down. + * This allows other CPUs to claim the banks on rediscovery. + */ +void cmci_clear(void) +{ + int i; + int banks; + u64 val; + + if (!cmci_supported(&banks)) + return; + spin_lock(&cmci_discover_lock); + for (i = 0; i < banks; i++) { + if (!test_bit(i, __get_cpu_var(mce_banks_owned))) + continue; + /* Disable CMCI */ + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); + wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + __clear_bit(i, __get_cpu_var(mce_banks_owned)); + } + spin_unlock(&cmci_discover_lock); +} + +/* + * After a CPU went down cycle through all the others and rediscover + * Must run in process context. + */ +void cmci_rediscover(int dying) +{ + int banks; + int cpu; + cpumask_var_t old; + + if (!cmci_supported(&banks)) + return; + if (!alloc_cpumask_var(&old, GFP_KERNEL)) + return; + cpumask_copy(old, ¤t->cpus_allowed); + + for_each_online_cpu (cpu) { + if (cpu == dying) + continue; + if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu))) + continue; + /* Recheck banks in case CPUs don't all have the same */ + if (cmci_supported(&banks)) + cmci_discover(banks, 0); + } + + set_cpus_allowed_ptr(current, old); + free_cpumask_var(old); +} + +/* + * Reenable CMCI on this CPU in case a CPU down failed. + */ +void cmci_reenable(void) +{ + int banks; + if (cmci_supported(&banks)) + cmci_discover(banks, 0); +} + +static __cpuinit void intel_init_cmci(void) +{ + int banks; + + if (!cmci_supported(&banks)) + return; + + mce_threshold_vector = intel_threshold_interrupt; + cmci_discover(banks, 1); + /* + * For CPU #0 this runs with still disabled APIC, but that's + * ok because only the vector is set up. We still do another + * check for the banks later for CPU #0 just to make sure + * to not miss any events. + */ + apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED); + cmci_recheck(); +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); + intel_init_cmci(); } diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c new file mode 100644 index 00000000000..23ee9e730f7 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -0,0 +1,29 @@ +/* + * Common corrected MCE threshold handler code: + */ +#include <linux/interrupt.h> +#include <linux/kernel.h> + +#include <asm/irq_vectors.h> +#include <asm/apic.h> +#include <asm/idle.h> +#include <asm/mce.h> + +static void default_threshold_interrupt(void) +{ + printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n", + THRESHOLD_APIC_VECTOR); +} + +void (*mce_threshold_vector)(void) = default_threshold_interrupt; + +asmlinkage void mce_threshold_interrupt(void) +{ + exit_idle(); + irq_enter(); + inc_irq_stat(irq_threshold_count); + mce_threshold_vector(); + irq_exit(); + /* Ack only at the end to avoid potential reentry */ + ack_APIC_irq(); +} diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 169a120587b..87b67e3a765 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -729,7 +729,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, spin_unlock_irqrestore(&ds_lock, irq); - ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); ds_resume_pebs(tracer); return tracer; @@ -1029,5 +1029,4 @@ void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) void ds_exit_thread(struct task_struct *tsk) { - WARN_ON(tsk->thread.ds_ctx); } diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index b205272ad39..1736acc4d7a 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -469,7 +469,7 @@ void __init efi_enter_virtual_mode(void) efi_memory_desc_t *md; efi_status_t status; unsigned long size; - u64 end, systab, addr, npages; + u64 end, systab, addr, npages, end_pfn; void *p, *va; efi.systab = NULL; @@ -481,7 +481,10 @@ void __init efi_enter_virtual_mode(void) size = md->num_pages << EFI_PAGE_SHIFT; end = md->phys_addr + size; - if (PFN_UP(end) <= max_low_pfn_mapped) + end_pfn = PFN_UP(end); + if (end_pfn <= max_low_pfn_mapped + || (end_pfn > (1UL << (32 - PAGE_SHIFT)) + && end_pfn <= max_pfn_mapped)) va = __va(md->phys_addr); else va = efi_ioremap(md->phys_addr, size); diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index a4ee29127fd..22c3b7828c5 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c @@ -100,24 +100,11 @@ void __init efi_call_phys_epilog(void) void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) { - static unsigned pages_mapped __initdata; - unsigned i, pages; - unsigned long offset; + unsigned long last_map_pfn; - pages = PFN_UP(phys_addr + size) - PFN_DOWN(phys_addr); - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - - if (pages_mapped + pages > MAX_EFI_IO_PAGES) + last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); + if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) return NULL; - for (i = 0; i < pages; i++) { - __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped, - phys_addr, PAGE_KERNEL); - phys_addr += PAGE_SIZE; - pages_mapped++; - } - - return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \ - (pages_mapped - pages)) + offset; + return (void __iomem *)__va(phys_addr); } diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 83d1836b946..7ba4621c0df 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -984,6 +984,8 @@ apicinterrupt UV_BAU_MESSAGE \ #endif apicinterrupt LOCAL_TIMER_VECTOR \ apic_timer_interrupt smp_apic_timer_interrupt +apicinterrupt GENERIC_INTERRUPT_VECTOR \ + generic_interrupt smp_generic_interrupt #ifdef CONFIG_SMP apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index b0f61f0dcd0..f2f8540a7f3 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -136,7 +136,7 @@ int init_fpu(struct task_struct *tsk) #ifdef CONFIG_X86_32 if (!HAVE_HWFP) { memset(tsk->thread.xstate, 0, xstate_size); - finit(); + finit_task(tsk); set_stopped_child_used_math(tsk); return 0; } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index f13ca1650aa..b864341dcc4 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -15,6 +15,9 @@ atomic_t irq_err_count; +/* Function pointer for generic interrupt vector handling */ +void (*generic_interrupt_extension)(void) = NULL; + /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. @@ -56,6 +59,12 @@ static int show_other_interrupts(struct seq_file *p) seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); seq_printf(p, " Local timer interrupts\n"); #endif + if (generic_interrupt_extension) { + seq_printf(p, "PLT: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); + seq_printf(p, " Platform interrupts\n"); + } #ifdef CONFIG_SMP seq_printf(p, "RES: "); for_each_online_cpu(j) @@ -163,6 +172,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #ifdef CONFIG_X86_LOCAL_APIC sum += irq_stats(cpu)->apic_timer_irqs; #endif + if (generic_interrupt_extension) + sum += irq_stats(cpu)->generic_irqs; #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; sum += irq_stats(cpu)->irq_call_count; @@ -226,4 +237,27 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) return 1; } +/* + * Handler for GENERIC_INTERRUPT_VECTOR. + */ +void smp_generic_interrupt(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + ack_APIC_irq(); + + exit_idle(); + + irq_enter(); + + inc_irq_stat(generic_irqs); + + if (generic_interrupt_extension) + generic_interrupt_extension(); + + irq_exit(); + + set_irq_regs(old_regs); +} + EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 9dc6b2b2427..3b09634a515 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -16,6 +16,7 @@ #include <linux/cpu.h> #include <linux/delay.h> #include <linux/uaccess.h> +#include <linux/percpu.h> #include <asm/apic.h> @@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { } union irq_ctx { struct thread_info tinfo; u32 stack[THREAD_SIZE/sizeof(u32)]; -}; +} __attribute__((aligned(PAGE_SIZE))); -static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; -static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; +static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); +static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); -static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; -static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; +static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack); +static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack); static void call_on_stack(void *func, void *stack) { @@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) u32 *isp, arg1, arg2; curctx = (union irq_ctx *) current_thread_info(); - irqctx = hardirq_ctx[smp_processor_id()]; + irqctx = __get_cpu_var(hardirq_ctx); /* * this is where we switch to the IRQ stack. However, if we are @@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu) { union irq_ctx *irqctx; - if (hardirq_ctx[cpu]) + if (per_cpu(hardirq_ctx, cpu)) return; - irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; + irqctx = &per_cpu(hardirq_stack, cpu); irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); - hardirq_ctx[cpu] = irqctx; + per_cpu(hardirq_ctx, cpu) = irqctx; - irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE]; + irqctx = &per_cpu(softirq_stack, cpu); irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; irqctx->tinfo.preempt_count = 0; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); - softirq_ctx[cpu] = irqctx; + per_cpu(softirq_ctx, cpu) = irqctx; printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", - cpu, hardirq_ctx[cpu], softirq_ctx[cpu]); + cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); } void irq_ctx_exit(int cpu) { - hardirq_ctx[cpu] = NULL; + per_cpu(hardirq_ctx, cpu) = NULL; } asmlinkage void do_softirq(void) @@ -169,7 +170,7 @@ asmlinkage void do_softirq(void) if (local_softirq_pending()) { curctx = current_thread_info(); - irqctx = softirq_ctx[smp_processor_id()]; + irqctx = __get_cpu_var(softirq_ctx); irqctx->tinfo.task = curctx->task; irqctx->tinfo.previous_esp = current_stack_pointer; diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 50b8c3a3006..bc132610544 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -175,6 +175,9 @@ void __init native_init_IRQ(void) /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + /* generic IPI for platform specific use */ + alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); + /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index da481a1e3f3..c7a49e0ffbf 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -147,6 +147,9 @@ static void __init apic_intr_init(void) /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + /* generic IPI for platform specific use */ + alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); + /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index f5fc8c781a6..e7368c1da01 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -14,12 +14,12 @@ #include <linux/ftrace.h> #include <linux/suspend.h> #include <linux/gfp.h> +#include <linux/io.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> -#include <asm/io.h> #include <asm/apic.h> #include <asm/cpufeature.h> #include <asm/desc.h> @@ -63,7 +63,7 @@ static void load_segments(void) "\tmovl %%eax,%%fs\n" "\tmovl %%eax,%%gs\n" "\tmovl %%eax,%%ss\n" - ::: "eax", "memory"); + : : : "eax", "memory"); #undef STR #undef __STR } @@ -205,7 +205,8 @@ void machine_kexec(struct kimage *image) if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC - /* We need to put APICs in legacy mode so that we can + /* + * We need to put APICs in legacy mode so that we can * get timer interrupts in second kernel. kexec/kdump * paths already have calls to disable_IO_APIC() in * one form or other. kexec jump path also need @@ -227,7 +228,8 @@ void machine_kexec(struct kimage *image) page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT); - /* The segment registers are funny things, they have both a + /* + * The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is * set to a specific selector, the invisible part is loaded * with from a table in memory. At no other time is the @@ -237,11 +239,12 @@ void machine_kexec(struct kimage *image) * segments, before I zap the gdt with an invalid value. */ load_segments(); - /* The gdt & idt are now invalid. + /* + * The gdt & idt are now invalid. * If you want to load them you must set up your own idt & gdt. */ - set_gdt(phys_to_virt(0),0); - set_idt(phys_to_virt(0),0); + set_gdt(phys_to_virt(0), 0); + set_idt(phys_to_virt(0), 0); /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 6993d51b7fd..89cea4d4467 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -12,11 +12,47 @@ #include <linux/reboot.h> #include <linux/numa.h> #include <linux/ftrace.h> +#include <linux/io.h> +#include <linux/suspend.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> -#include <asm/io.h> + +static int init_one_level2_page(struct kimage *image, pgd_t *pgd, + unsigned long addr) +{ + pud_t *pud; + pmd_t *pmd; + struct page *page; + int result = -ENOMEM; + + addr &= PMD_MASK; + pgd += pgd_index(addr); + if (!pgd_present(*pgd)) { + page = kimage_alloc_control_pages(image, 0); + if (!page) + goto out; + pud = (pud_t *)page_address(page); + memset(pud, 0, PAGE_SIZE); + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) { + page = kimage_alloc_control_pages(image, 0); + if (!page) + goto out; + pmd = (pmd_t *)page_address(page); + memset(pmd, 0, PAGE_SIZE); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + } + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); + result = 0; +out: + return result; +} static void init_level2_page(pmd_t *level2p, unsigned long addr) { @@ -83,9 +119,8 @@ static int init_level4_page(struct kimage *image, pgd_t *level4p, } level3p = (pud_t *)page_address(page); result = init_level3_page(image, level3p, addr, last_addr); - if (result) { + if (result) goto out; - } set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); addr += PGDIR_SIZE; } @@ -156,6 +191,13 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); if (result) return result; + /* + * image->start may be outside 0 ~ max_pfn, for example when + * jump back to original kernel from kexeced kernel + */ + result = init_one_level2_page(image, level4p, image->start); + if (result) + return result; return init_transition_pgtable(image, level4p); } @@ -229,20 +271,45 @@ void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; void *control_page; + int save_ftrace_enabled; - tracer_disable(); +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) + save_processor_state(); +#endif + + save_ftrace_enabled = __ftrace_enabled_save(); /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + if (image->preserve_context) { +#ifdef CONFIG_X86_IO_APIC + /* + * We need to put APICs in legacy mode so that we can + * get timer interrupts in second kernel. kexec/kdump + * paths already have calls to disable_IO_APIC() in + * one form or other. kexec jump path also need + * one. + */ + disable_IO_APIC(); +#endif + } + control_page = page_address(image->control_code_page) + PAGE_SIZE; - memcpy(control_page, relocate_kernel, PAGE_SIZE); + memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); + page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; page_list[PA_TABLE_PAGE] = (unsigned long)__pa(page_address(image->control_code_page)); - /* The segment registers are funny things, they have both a + if (image->type == KEXEC_TYPE_DEFAULT) + page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) + << PAGE_SHIFT); + + /* + * The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is * set to a specific selector, the invisible part is loaded * with from a table in memory. At no other time is the @@ -252,15 +319,25 @@ void machine_kexec(struct kimage *image) * segments, before I zap the gdt with an invalid value. */ load_segments(); - /* The gdt & idt are now invalid. + /* + * The gdt & idt are now invalid. * If you want to load them you must set up your own idt & gdt. */ - set_gdt(phys_to_virt(0),0); - set_idt(phys_to_virt(0),0); + set_gdt(phys_to_virt(0), 0); + set_idt(phys_to_virt(0), 0); /* now call it */ - relocate_kernel((unsigned long)image->head, (unsigned long)page_list, - image->start); + image->start = relocate_kernel((unsigned long)image->head, + (unsigned long)page_list, + image->start, + image->preserve_context); + +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) + restore_processor_state(); +#endif + + __ftrace_enabled_restore(save_ftrace_enabled); } void arch_crash_save_vmcoreinfo(void) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 37cb1bda1ba..e8192401da4 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -558,6 +558,19 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) static struct mpf_intel *mpf_found; +static unsigned long __init get_mpc_size(unsigned long physptr) +{ + struct mpc_table *mpc; + unsigned long size; + + mpc = early_ioremap(physptr, PAGE_SIZE); + size = mpc->length; + early_iounmap(mpc, PAGE_SIZE); + apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); + + return size; +} + /* * Scan the memory blocks for an SMP configuration block. */ @@ -611,12 +624,16 @@ static void __init __get_smp_config(unsigned int early) construct_default_ISA_mptable(mpf->feature1); } else if (mpf->physptr) { + struct mpc_table *mpc; + unsigned long size; + size = get_mpc_size(mpf->physptr); + mpc = early_ioremap(mpf->physptr, size); /* * Read the physical hardware table. Anything here will * override the defaults. */ - if (!smp_read_mpc(phys_to_virt(mpf->physptr), early)) { + if (!smp_read_mpc(mpc, early)) { #ifdef CONFIG_X86_LOCAL_APIC smp_found_config = 0; #endif @@ -624,8 +641,10 @@ static void __init __get_smp_config(unsigned int early) "BIOS bug, MP table errors detected!...\n"); printk(KERN_ERR "... disabling SMP support. " "(tell your hw vendor)\n"); + early_iounmap(mpc, size); return; } + early_iounmap(mpc, size); if (early) return; @@ -697,10 +716,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, if (!reserve) return 1; - reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, + reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf), BOOTMEM_DEFAULT); if (mpf->physptr) { - unsigned long size = PAGE_SIZE; + unsigned long size = get_mpc_size(mpf->physptr); #ifdef CONFIG_X86_32 /* * We cannot access to MPC table to compute diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 1cc18d439bb..2aef36d8aca 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -216,6 +216,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), }, }, + { /* Handle problems with rebooting on Dell XPS710 */ + .callback = set_bios_reboot, + .ident = "Dell XPS710", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"), + }, + }, { } }; diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index 2064d0aa8d2..41235531b11 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -17,7 +17,8 @@ #define PTR(x) (x << 2) -/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE +/* + * control_page + KEXEC_CONTROL_CODE_MAX_SIZE * ~ control_page + PAGE_SIZE are used as data storage and stack for * jumping back */ @@ -76,8 +77,10 @@ relocate_kernel: movl %eax, CP_PA_SWAP_PAGE(%edi) movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi) - /* get physical address of control page now */ - /* this is impossible after page table switch */ + /* + * get physical address of control page now + * this is impossible after page table switch + */ movl PTR(PA_CONTROL_PAGE)(%ebp), %edi /* switch to new set of page tables */ @@ -97,7 +100,8 @@ identity_mapped: /* store the start address on the stack */ pushl %edx - /* Set cr0 to a known state: + /* + * Set cr0 to a known state: * - Paging disabled * - Alignment check disabled * - Write protect disabled @@ -113,7 +117,8 @@ identity_mapped: /* clear cr4 if applicable */ testl %ecx, %ecx jz 1f - /* Set cr4 to a known state: + /* + * Set cr4 to a known state: * Setting everything to zero seems safe. */ xorl %eax, %eax @@ -132,15 +137,18 @@ identity_mapped: call swap_pages addl $8, %esp - /* To be certain of avoiding problems with self-modifying code + /* + * To be certain of avoiding problems with self-modifying code * I need to execute a serializing instruction here. * So I flush the TLB, it's handy, and not processor dependent. */ xorl %eax, %eax movl %eax, %cr3 - /* set all of the registers to known values */ - /* leave %esp alone */ + /* + * set all of the registers to known values + * leave %esp alone + */ testl %esi, %esi jnz 1f diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index d32cfb27a47..4de8f5b3d47 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -19,29 +19,77 @@ #define PTR(x) (x << 3) #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +/* + * control_page + KEXEC_CONTROL_CODE_MAX_SIZE + * ~ control_page + PAGE_SIZE are used as data storage and stack for + * jumping back + */ +#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset)) + +/* Minimal CPU state */ +#define RSP DATA(0x0) +#define CR0 DATA(0x8) +#define CR3 DATA(0x10) +#define CR4 DATA(0x18) + +/* other data */ +#define CP_PA_TABLE_PAGE DATA(0x20) +#define CP_PA_SWAP_PAGE DATA(0x28) +#define CP_PA_BACKUP_PAGES_MAP DATA(0x30) + .text .align PAGE_SIZE .code64 .globl relocate_kernel relocate_kernel: - /* %rdi indirection_page + /* + * %rdi indirection_page * %rsi page_list * %rdx start address + * %rcx preserve_context */ + /* Save the CPU context, used for jumping back */ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushf + + movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 + movq %rsp, RSP(%r11) + movq %cr0, %rax + movq %rax, CR0(%r11) + movq %cr3, %rax + movq %rax, CR3(%r11) + movq %cr4, %rax + movq %rax, CR4(%r11) + /* zero out flags, and disable interrupts */ pushq $0 popfq - /* get physical address of control page now */ - /* this is impossible after page table switch */ + /* + * get physical address of control page now + * this is impossible after page table switch + */ movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 /* get physical address of page table now too */ - movq PTR(PA_TABLE_PAGE)(%rsi), %rcx + movq PTR(PA_TABLE_PAGE)(%rsi), %r9 + + /* get physical address of swap page now */ + movq PTR(PA_SWAP_PAGE)(%rsi), %r10 + + /* save some information for jumping back */ + movq %r9, CP_PA_TABLE_PAGE(%r11) + movq %r10, CP_PA_SWAP_PAGE(%r11) + movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11) /* Switch to the identity mapped page tables */ - movq %rcx, %cr3 + movq %r9, %cr3 /* setup a new stack at the end of the physical control page */ lea PAGE_SIZE(%r8), %rsp @@ -55,7 +103,8 @@ identity_mapped: /* store the start address on the stack */ pushq %rdx - /* Set cr0 to a known state: + /* + * Set cr0 to a known state: * - Paging enabled * - Alignment check disabled * - Write protect disabled @@ -68,7 +117,8 @@ identity_mapped: orl $(X86_CR0_PG | X86_CR0_PE), %eax movq %rax, %cr0 - /* Set cr4 to a known state: + /* + * Set cr4 to a known state: * - physical address extension enabled */ movq $X86_CR4_PAE, %rax @@ -78,9 +128,87 @@ identity_mapped: 1: /* Flush the TLB (needed?) */ - movq %rcx, %cr3 + movq %r9, %cr3 + + movq %rcx, %r11 + call swap_pages + + /* + * To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB by reloading %cr3 here, it's handy, + * and not processor dependent. + */ + movq %cr3, %rax + movq %rax, %cr3 + + /* + * set all of the registers to known values + * leave %rsp alone + */ + + testq %r11, %r11 + jnz 1f + xorq %rax, %rax + xorq %rbx, %rbx + xorq %rcx, %rcx + xorq %rdx, %rdx + xorq %rsi, %rsi + xorq %rdi, %rdi + xorq %rbp, %rbp + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r9 + xorq %r11, %r11 + xorq %r12, %r12 + xorq %r13, %r13 + xorq %r14, %r14 + xorq %r15, %r15 + + ret + +1: + popq %rdx + leaq PAGE_SIZE(%r10), %rsp + call *%rdx + + /* get the re-entry point of the peer system */ + movq 0(%rsp), %rbp + call 1f +1: + popq %r8 + subq $(1b - relocate_kernel), %r8 + movq CP_PA_SWAP_PAGE(%r8), %r10 + movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi + movq CP_PA_TABLE_PAGE(%r8), %rax + movq %rax, %cr3 + lea PAGE_SIZE(%r8), %rsp + call swap_pages + movq $virtual_mapped, %rax + pushq %rax + ret + +virtual_mapped: + movq RSP(%r8), %rsp + movq CR4(%r8), %rax + movq %rax, %cr4 + movq CR3(%r8), %rax + movq CR0(%r8), %r8 + movq %rax, %cr3 + movq %r8, %cr0 + movq %rbp, %rax + + popf + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret /* Do the copies */ +swap_pages: movq %rdi, %rcx /* Put the page_list in %rcx */ xorq %rdi, %rdi xorq %rsi, %rsi @@ -112,36 +240,27 @@ identity_mapped: movq %rcx, %rsi /* For ever source page do a copy */ andq $0xfffffffffffff000, %rsi + movq %rdi, %rdx + movq %rsi, %rax + + movq %r10, %rdi movq $512, %rcx rep ; movsq - jmp 0b -3: - - /* To be certain of avoiding problems with self-modifying code - * I need to execute a serializing instruction here. - * So I flush the TLB by reloading %cr3 here, it's handy, - * and not processor dependent. - */ - movq %cr3, %rax - movq %rax, %cr3 - /* set all of the registers to known values */ - /* leave %rsp alone */ + movq %rax, %rdi + movq %rdx, %rsi + movq $512, %rcx + rep ; movsq - xorq %rax, %rax - xorq %rbx, %rbx - xorq %rcx, %rcx - xorq %rdx, %rdx - xorq %rsi, %rsi - xorq %rdi, %rdi - xorq %rbp, %rbp - xorq %r8, %r8 - xorq %r9, %r9 - xorq %r10, %r9 - xorq %r11, %r11 - xorq %r12, %r12 - xorq %r13, %r13 - xorq %r14, %r14 - xorq %r15, %r15 + movq %rdx, %rdi + movq %r10, %rsi + movq $512, %rcx + rep ; movsq + lea PAGE_SIZE(%rax), %rsi + jmp 0b +3: ret + + .globl kexec_control_code_size +.set kexec_control_code_size, . - relocate_kernel diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4c54bc0d8ff..f28c56e6bf9 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -202,7 +202,9 @@ struct ist_info ist_info; #endif #else -struct cpuinfo_x86 boot_cpu_data __read_mostly; +struct cpuinfo_x86 boot_cpu_data __read_mostly = { + .x86_phys_bits = MAX_PHYSMEM_BITS, +}; EXPORT_SYMBOL(boot_cpu_data); #endif @@ -770,6 +772,9 @@ void __init setup_arch(char **cmdline_p) finish_e820_parsing(); + if (efi_enabled) + efi_init(); + dmi_scan_machine(); dmi_check_system(bad_bios_dmi_table); @@ -789,8 +794,6 @@ void __init setup_arch(char **cmdline_p) insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); - if (efi_enabled) - efi_init(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index d992e6cff73..efa615f2bf4 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -7,6 +7,7 @@ #include <linux/crash_dump.h> #include <linux/smp.h> #include <linux/topology.h> +#include <linux/pfn.h> #include <asm/sections.h> #include <asm/processor.h> #include <asm/setup.h> @@ -41,6 +42,352 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); +/* + * On x86_64 symbols referenced from code should be reachable using + * 32bit relocations. Reserve space for static percpu variables in + * modules so that they are always served from the first chunk which + * is located at the percpu segment base. On x86_32, anything can + * address anywhere. No need to reserve space in the first chunk. + */ +#ifdef CONFIG_X86_64 +#define PERCPU_FIRST_CHUNK_RESERVE PERCPU_MODULE_RESERVE +#else +#define PERCPU_FIRST_CHUNK_RESERVE 0 +#endif + +/** + * pcpu_need_numa - determine percpu allocation needs to consider NUMA + * + * If NUMA is not configured or there is only one NUMA node available, + * there is no reason to consider NUMA. This function determines + * whether percpu allocation should consider NUMA or not. + * + * RETURNS: + * true if NUMA should be considered; otherwise, false. + */ +static bool __init pcpu_need_numa(void) +{ +#ifdef CONFIG_NEED_MULTIPLE_NODES + pg_data_t *last = NULL; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + int node = early_cpu_to_node(cpu); + + if (node_online(node) && NODE_DATA(node) && + last && last != NODE_DATA(node)) + return true; + + last = NODE_DATA(node); + } +#endif + return false; +} + +/** + * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu + * @cpu: cpu to allocate for + * @size: size allocation in bytes + * @align: alignment + * + * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper + * does the right thing for NUMA regardless of the current + * configuration. + * + * RETURNS: + * Pointer to the allocated area on success, NULL on failure. + */ +static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, + unsigned long align) +{ + const unsigned long goal = __pa(MAX_DMA_ADDRESS); +#ifdef CONFIG_NEED_MULTIPLE_NODES + int node = early_cpu_to_node(cpu); + void *ptr; + + if (!node_online(node) || !NODE_DATA(node)) { + ptr = __alloc_bootmem_nopanic(size, align, goal); + pr_info("cpu %d has no node %d or node-local memory\n", + cpu, node); + pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", + cpu, size, __pa(ptr)); + } else { + ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), + size, align, goal); + pr_debug("per cpu data for cpu%d %lu bytes on node%d at " + "%016lx\n", cpu, size, node, __pa(ptr)); + } + return ptr; +#else + return __alloc_bootmem_nopanic(size, align, goal); +#endif +} + +/* + * Remap allocator + * + * This allocator uses PMD page as unit. A PMD page is allocated for + * each cpu and each is remapped into vmalloc area using PMD mapping. + * As PMD page is quite large, only part of it is used for the first + * chunk. Unused part is returned to the bootmem allocator. + * + * So, the PMD pages are mapped twice - once to the physical mapping + * and to the vmalloc area for the first percpu chunk. The double + * mapping does add one more PMD TLB entry pressure but still is much + * better than only using 4k mappings while still being NUMA friendly. + */ +#ifdef CONFIG_NEED_MULTIPLE_NODES +static size_t pcpur_size __initdata; +static void **pcpur_ptrs __initdata; + +static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) +{ + size_t off = (size_t)pageno << PAGE_SHIFT; + + if (off >= pcpur_size) + return NULL; + + return virt_to_page(pcpur_ptrs[cpu] + off); +} + +static ssize_t __init setup_pcpu_remap(size_t static_size) +{ + static struct vm_struct vm; + pg_data_t *last; + size_t ptrs_size, dyn_size; + unsigned int cpu; + ssize_t ret; + + /* + * If large page isn't supported, there's no benefit in doing + * this. Also, on non-NUMA, embedding is better. + */ + if (!cpu_has_pse || pcpu_need_numa()) + return -EINVAL; + + last = NULL; + for_each_possible_cpu(cpu) { + int node = early_cpu_to_node(cpu); + + if (node_online(node) && NODE_DATA(node) && + last && last != NODE_DATA(node)) + goto proceed; + + last = NODE_DATA(node); + } + return -EINVAL; + +proceed: + /* + * Currently supports only single page. Supporting multiple + * pages won't be too difficult if it ever becomes necessary. + */ + pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE); + if (pcpur_size > PMD_SIZE) { + pr_warning("PERCPU: static data is larger than large page, " + "can't use large page\n"); + return -EINVAL; + } + dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; + + /* allocate pointer array and alloc large pages */ + ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); + pcpur_ptrs = alloc_bootmem(ptrs_size); + + for_each_possible_cpu(cpu) { + pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); + if (!pcpur_ptrs[cpu]) + goto enomem; + + /* + * Only use pcpur_size bytes and give back the rest. + * + * Ingo: The 2MB up-rounding bootmem is needed to make + * sure the partial 2MB page is still fully RAM - it's + * not well-specified to have a PAT-incompatible area + * (unmapped RAM, device memory, etc.) in that hole. + */ + free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), + PMD_SIZE - pcpur_size); + + memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); + } + + /* allocate address and map */ + vm.flags = VM_ALLOC; + vm.size = num_possible_cpus() * PMD_SIZE; + vm_area_register_early(&vm, PMD_SIZE); + + for_each_possible_cpu(cpu) { + pmd_t *pmd; + + pmd = populate_extra_pmd((unsigned long)vm.addr + + cpu * PMD_SIZE); + set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), + PAGE_KERNEL_LARGE)); + } + + /* we're ready, commit */ + pr_info("PERCPU: Remapped at %p with large pages, static data " + "%zu bytes\n", vm.addr, static_size); + + ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, + PERCPU_FIRST_CHUNK_RESERVE, + PMD_SIZE, dyn_size, vm.addr, NULL); + goto out_free_ar; + +enomem: + for_each_possible_cpu(cpu) + if (pcpur_ptrs[cpu]) + free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); + ret = -ENOMEM; +out_free_ar: + free_bootmem(__pa(pcpur_ptrs), ptrs_size); + return ret; +} +#else +static ssize_t __init setup_pcpu_remap(size_t static_size) +{ + return -EINVAL; +} +#endif + +/* + * Embedding allocator + * + * The first chunk is sized to just contain the static area plus + * module and dynamic reserves, and allocated as a contiguous area + * using bootmem allocator and used as-is without being mapped into + * vmalloc area. This enables the first chunk to piggy back on the + * linear physical PMD mapping and doesn't add any additional pressure + * to TLB. Note that if the needed size is smaller than the minimum + * unit size, the leftover is returned to the bootmem allocator. + */ +static void *pcpue_ptr __initdata; +static size_t pcpue_size __initdata; +static size_t pcpue_unit_size __initdata; + +static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) +{ + size_t off = (size_t)pageno << PAGE_SHIFT; + + if (off >= pcpue_size) + return NULL; + + return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); +} + +static ssize_t __init setup_pcpu_embed(size_t static_size) +{ + unsigned int cpu; + size_t dyn_size; + + /* + * If large page isn't supported, there's no benefit in doing + * this. Also, embedding allocation doesn't play well with + * NUMA. + */ + if (!cpu_has_pse || pcpu_need_numa()) + return -EINVAL; + + /* allocate and copy */ + pcpue_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE); + pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); + dyn_size = pcpue_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; + + pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, + PAGE_SIZE); + if (!pcpue_ptr) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + void *ptr = pcpue_ptr + cpu * pcpue_unit_size; + + free_bootmem(__pa(ptr + pcpue_size), + pcpue_unit_size - pcpue_size); + memcpy(ptr, __per_cpu_load, static_size); + } + + /* we're ready, commit */ + pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", + pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); + + return pcpu_setup_first_chunk(pcpue_get_page, static_size, + PERCPU_FIRST_CHUNK_RESERVE, + pcpue_unit_size, dyn_size, + pcpue_ptr, NULL); +} + +/* + * 4k page allocator + * + * This is the basic allocator. Static percpu area is allocated + * page-by-page and most of initialization is done by the generic + * setup function. + */ +static struct page **pcpu4k_pages __initdata; +static int pcpu4k_nr_static_pages __initdata; + +static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) +{ + if (pageno < pcpu4k_nr_static_pages) + return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; + return NULL; +} + +static void __init pcpu4k_populate_pte(unsigned long addr) +{ + populate_extra_pte(addr); +} + +static ssize_t __init setup_pcpu_4k(size_t static_size) +{ + size_t pages_size; + unsigned int cpu; + int i, j; + ssize_t ret; + + pcpu4k_nr_static_pages = PFN_UP(static_size); + + /* unaligned allocations can't be freed, round up to page size */ + pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() + * sizeof(pcpu4k_pages[0])); + pcpu4k_pages = alloc_bootmem(pages_size); + + /* allocate and copy */ + j = 0; + for_each_possible_cpu(cpu) + for (i = 0; i < pcpu4k_nr_static_pages; i++) { + void *ptr; + + ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); + if (!ptr) + goto enomem; + + memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); + pcpu4k_pages[j++] = virt_to_page(ptr); + } + + /* we're ready, commit */ + pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", + pcpu4k_nr_static_pages, static_size); + + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, + PERCPU_FIRST_CHUNK_RESERVE, -1, -1, NULL, + pcpu4k_populate_pte); + goto out_free_ar; + +enomem: + while (--j >= 0) + free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); + ret = -ENOMEM; +out_free_ar: + free_bootmem(__pa(pcpu4k_pages), pages_size); + return ret; +} + static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -61,38 +408,35 @@ static inline void setup_percpu_segment(int cpu) */ void __init setup_per_cpu_areas(void) { - ssize_t size; - char *ptr; - int cpu; - - /* Copy section for each CPU (we discard the original) */ - size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE); + size_t static_size = __per_cpu_end - __per_cpu_start; + unsigned int cpu; + unsigned long delta; + size_t pcpu_unit_size; + ssize_t ret; pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); - pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size); + /* + * Allocate percpu area. If PSE is supported, try to make use + * of large page mappings. Please read comments on top of + * each allocator for details. + */ + ret = setup_pcpu_remap(static_size); + if (ret < 0) + ret = setup_pcpu_embed(static_size); + if (ret < 0) + ret = setup_pcpu_4k(static_size); + if (ret < 0) + panic("cannot allocate static percpu area (%zu bytes, err=%zd)", + static_size, ret); - for_each_possible_cpu(cpu) { -#ifndef CONFIG_NEED_MULTIPLE_NODES - ptr = alloc_bootmem_pages(size); -#else - int node = early_cpu_to_node(cpu); - if (!node_online(node) || !NODE_DATA(node)) { - ptr = alloc_bootmem_pages(size); - pr_info("cpu %d has no node %d or node-local memory\n", - cpu, node); - pr_debug("per cpu data for cpu%d at %016lx\n", - cpu, __pa(ptr)); - } else { - ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); - pr_debug("per cpu data for cpu%d on node%d at %016lx\n", - cpu, node, __pa(ptr)); - } -#endif + pcpu_unit_size = ret; - memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start); - per_cpu_offset(cpu) = ptr - __per_cpu_start; + /* alrighty, percpu areas up and running */ + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) { + per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); @@ -125,8 +469,6 @@ void __init setup_per_cpu_areas(void) */ if (cpu == boot_cpu_id) switch_to_new_gdt(cpu); - - DBG("PERCPU: cpu %4d %p\n", cpu, ptr); } /* indicate the early static arrays will soon be gone */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 249334f5080..ef7d10170c3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -114,10 +114,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); atomic_t init_deasserted; - -/* Set if we find a B stepping CPU */ -static int __cpuinitdata smp_b_stepping; - #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) /* which logical CPUs are on which nodes */ @@ -271,8 +267,6 @@ static void __cpuinit smp_callin(void) cpumask_set_cpu(cpuid, cpu_callin_mask); } -static int __cpuinitdata unsafe_smp; - /* * Activate a secondary processor. */ @@ -340,76 +334,6 @@ notrace static void __cpuinit start_secondary(void *unused) cpu_idle(); } -static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) -{ - /* - * Mask B, Pentium, but not Pentium MMX - */ - if (c->x86_vendor == X86_VENDOR_INTEL && - c->x86 == 5 && - c->x86_mask >= 1 && c->x86_mask <= 4 && - c->x86_model <= 3) - /* - * Remember we have B step Pentia with bugs - */ - smp_b_stepping = 1; - - /* - * Certain Athlons might work (for various values of 'work') in SMP - * but they are not certified as MP capable. - */ - if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) { - - if (num_possible_cpus() == 1) - goto valid_k7; - - /* Athlon 660/661 is valid. */ - if ((c->x86_model == 6) && ((c->x86_mask == 0) || - (c->x86_mask == 1))) - goto valid_k7; - - /* Duron 670 is valid */ - if ((c->x86_model == 7) && (c->x86_mask == 0)) - goto valid_k7; - - /* - * Athlon 662, Duron 671, and Athlon >model 7 have capability - * bit. It's worth noting that the A5 stepping (662) of some - * Athlon XP's have the MP bit set. - * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for - * more. - */ - if (((c->x86_model == 6) && (c->x86_mask >= 2)) || - ((c->x86_model == 7) && (c->x86_mask >= 1)) || - (c->x86_model > 7)) - if (cpu_has_mp) - goto valid_k7; - - /* If we get here, not a certified SMP capable AMD system. */ - unsafe_smp = 1; - } - -valid_k7: - ; -} - -static void __cpuinit smp_checks(void) -{ - if (smp_b_stepping) - printk(KERN_WARNING "WARNING: SMP operation may be unreliable" - "with B stepping processors.\n"); - - /* - * Don't taint if we are running SMP kernel on a single non-MP - * approved Athlon - */ - if (unsafe_smp && num_online_cpus() > 1) { - printk(KERN_INFO "WARNING: This combination of AMD" - "processors is not suitable for SMP.\n"); - add_taint(TAINT_UNSAFE_SMP); - } -} - /* * The bootstrap kernel entry code has set these up. Save them for * a given CPU @@ -423,7 +347,6 @@ void __cpuinit smp_store_cpu_info(int id) c->cpu_index = id; if (id != 0) identify_secondary_cpu(c); - smp_apply_quirks(c); } @@ -1193,7 +1116,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus) pr_debug("Boot done.\n"); impress_friends(); - smp_checks(); #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); #endif diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index f04549afcfe..d038b9c45cf 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -314,8 +314,6 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, int locals = 0; struct bau_desc *bau_desc; - WARN_ON(!in_atomic()); - cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); uv_cpu = uv_blade_processor_id(); diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c new file mode 100644 index 00000000000..2ffb6c53326 --- /dev/null +++ b/arch/x86/kernel/uv_time.c @@ -0,0 +1,393 @@ +/* + * SGI RTC clock/timer routines. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Dimitri Sivanich + */ +#include <linux/clockchips.h> + +#include <asm/uv/uv_mmrs.h> +#include <asm/uv/uv_hub.h> +#include <asm/uv/bios.h> +#include <asm/uv/uv.h> +#include <asm/apic.h> +#include <asm/cpu.h> + +#define RTC_NAME "sgi_rtc" + +static cycle_t uv_read_rtc(void); +static int uv_rtc_next_event(unsigned long, struct clock_event_device *); +static void uv_rtc_timer_setup(enum clock_event_mode, + struct clock_event_device *); + +static struct clocksource clocksource_uv = { + .name = RTC_NAME, + .rating = 400, + .read = uv_read_rtc, + .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, + .shift = 10, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static struct clock_event_device clock_event_device_uv = { + .name = RTC_NAME, + .features = CLOCK_EVT_FEAT_ONESHOT, + .shift = 20, + .rating = 400, + .irq = -1, + .set_next_event = uv_rtc_next_event, + .set_mode = uv_rtc_timer_setup, + .event_handler = NULL, +}; + +static DEFINE_PER_CPU(struct clock_event_device, cpu_ced); + +/* There is one of these allocated per node */ +struct uv_rtc_timer_head { + spinlock_t lock; + /* next cpu waiting for timer, local node relative: */ + int next_cpu; + /* number of cpus on this node: */ + int ncpus; + struct { + int lcpu; /* systemwide logical cpu number */ + u64 expires; /* next timer expiration for this cpu */ + } cpu[1]; +}; + +/* + * Access to uv_rtc_timer_head via blade id. + */ +static struct uv_rtc_timer_head **blade_info __read_mostly; + +static int uv_rtc_enable; + +/* + * Hardware interface routines + */ + +/* Send IPIs to another node */ +static void uv_rtc_send_IPI(int cpu) +{ + unsigned long apicid, val; + int pnode; + + apicid = cpu_physical_id(cpu); + pnode = uv_apicid_to_pnode(apicid); + val = (1UL << UVH_IPI_INT_SEND_SHFT) | + (apicid << UVH_IPI_INT_APIC_ID_SHFT) | + (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT); + + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); +} + +/* Check for an RTC interrupt pending */ +static int uv_intr_pending(int pnode) +{ + return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) & + UVH_EVENT_OCCURRED0_RTC1_MASK; +} + +/* Setup interrupt and return non-zero if early expiration occurred. */ +static int uv_setup_intr(int cpu, u64 expires) +{ + u64 val; + int pnode = uv_cpu_to_pnode(cpu); + + uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, + UVH_RTC1_INT_CONFIG_M_MASK); + uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L); + + uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, + UVH_EVENT_OCCURRED0_RTC1_MASK); + + val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | + ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); + + /* Set configuration */ + uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val); + /* Initialize comparator value */ + uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); + + return (expires < uv_read_rtc() && !uv_intr_pending(pnode)); +} + +/* + * Per-cpu timer tracking routines + */ + +static __init void uv_rtc_deallocate_timers(void) +{ + int bid; + + for_each_possible_blade(bid) { + kfree(blade_info[bid]); + } + kfree(blade_info); +} + +/* Allocate per-node list of cpu timer expiration times. */ +static __init int uv_rtc_allocate_timers(void) +{ + int cpu; + + blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL); + if (!blade_info) + return -ENOMEM; + memset(blade_info, 0, uv_possible_blades * sizeof(void *)); + + for_each_present_cpu(cpu) { + int nid = cpu_to_node(cpu); + int bid = uv_cpu_to_blade_id(cpu); + int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + struct uv_rtc_timer_head *head = blade_info[bid]; + + if (!head) { + head = kmalloc_node(sizeof(struct uv_rtc_timer_head) + + (uv_blade_nr_possible_cpus(bid) * + 2 * sizeof(u64)), + GFP_KERNEL, nid); + if (!head) { + uv_rtc_deallocate_timers(); + return -ENOMEM; + } + spin_lock_init(&head->lock); + head->ncpus = uv_blade_nr_possible_cpus(bid); + head->next_cpu = -1; + blade_info[bid] = head; + } + + head->cpu[bcpu].lcpu = cpu; + head->cpu[bcpu].expires = ULLONG_MAX; + } + + return 0; +} + +/* Find and set the next expiring timer. */ +static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode) +{ + u64 lowest = ULLONG_MAX; + int c, bcpu = -1; + + head->next_cpu = -1; + for (c = 0; c < head->ncpus; c++) { + u64 exp = head->cpu[c].expires; + if (exp < lowest) { + bcpu = c; + lowest = exp; + } + } + if (bcpu >= 0) { + head->next_cpu = bcpu; + c = head->cpu[bcpu].lcpu; + if (uv_setup_intr(c, lowest)) + /* If we didn't set it up in time, trigger */ + uv_rtc_send_IPI(c); + } else { + uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, + UVH_RTC1_INT_CONFIG_M_MASK); + } +} + +/* + * Set expiration time for current cpu. + * + * Returns 1 if we missed the expiration time. + */ +static int uv_rtc_set_timer(int cpu, u64 expires) +{ + int pnode = uv_cpu_to_pnode(cpu); + int bid = uv_cpu_to_blade_id(cpu); + struct uv_rtc_timer_head *head = blade_info[bid]; + int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + u64 *t = &head->cpu[bcpu].expires; + unsigned long flags; + int next_cpu; + + spin_lock_irqsave(&head->lock, flags); + + next_cpu = head->next_cpu; + *t = expires; + /* Will this one be next to go off? */ + if (next_cpu < 0 || bcpu == next_cpu || + expires < head->cpu[next_cpu].expires) { + head->next_cpu = bcpu; + if (uv_setup_intr(cpu, expires)) { + *t = ULLONG_MAX; + uv_rtc_find_next_timer(head, pnode); + spin_unlock_irqrestore(&head->lock, flags); + return 1; + } + } + + spin_unlock_irqrestore(&head->lock, flags); + return 0; +} + +/* + * Unset expiration time for current cpu. + * + * Returns 1 if this timer was pending. + */ +static int uv_rtc_unset_timer(int cpu) +{ + int pnode = uv_cpu_to_pnode(cpu); + int bid = uv_cpu_to_blade_id(cpu); + struct uv_rtc_timer_head *head = blade_info[bid]; + int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + u64 *t = &head->cpu[bcpu].expires; + unsigned long flags; + int rc = 0; + + spin_lock_irqsave(&head->lock, flags); + + if (head->next_cpu == bcpu && uv_read_rtc() >= *t) + rc = 1; + + *t = ULLONG_MAX; + + /* Was the hardware setup for this timer? */ + if (head->next_cpu == bcpu) + uv_rtc_find_next_timer(head, pnode); + + spin_unlock_irqrestore(&head->lock, flags); + + return rc; +} + + +/* + * Kernel interface routines. + */ + +/* + * Read the RTC. + */ +static cycle_t uv_read_rtc(void) +{ + return (cycle_t)uv_read_local_mmr(UVH_RTC); +} + +/* + * Program the next event, relative to now + */ +static int uv_rtc_next_event(unsigned long delta, + struct clock_event_device *ced) +{ + int ced_cpu = cpumask_first(ced->cpumask); + + return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc()); +} + +/* + * Setup the RTC timer in oneshot mode + */ +static void uv_rtc_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + int ced_cpu = cpumask_first(evt->cpumask); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here yet */ + break; + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + uv_rtc_unset_timer(ced_cpu); + break; + } +} + +static void uv_rtc_interrupt(void) +{ + struct clock_event_device *ced = &__get_cpu_var(cpu_ced); + int cpu = smp_processor_id(); + + if (!ced || !ced->event_handler) + return; + + if (uv_rtc_unset_timer(cpu) != 1) + return; + + ced->event_handler(ced); +} + +static int __init uv_enable_rtc(char *str) +{ + uv_rtc_enable = 1; + + return 1; +} +__setup("uvrtc", uv_enable_rtc); + +static __init void uv_rtc_register_clockevents(struct work_struct *dummy) +{ + struct clock_event_device *ced = &__get_cpu_var(cpu_ced); + + *ced = clock_event_device_uv; + ced->cpumask = cpumask_of(smp_processor_id()); + clockevents_register_device(ced); +} + +static __init int uv_rtc_setup_clock(void) +{ + int rc; + + if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension) + return -ENODEV; + + generic_interrupt_extension = uv_rtc_interrupt; + + clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, + clocksource_uv.shift); + + rc = clocksource_register(&clocksource_uv); + if (rc) { + generic_interrupt_extension = NULL; + return rc; + } + + /* Setup and register clockevents */ + rc = uv_rtc_allocate_timers(); + if (rc) { + clocksource_unregister(&clocksource_uv); + generic_interrupt_extension = NULL; + return rc; + } + + clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, + NSEC_PER_SEC, clock_event_device_uv.shift); + + clock_event_device_uv.min_delta_ns = NSEC_PER_SEC / + sn_rtc_cycles_per_second; + + clock_event_device_uv.max_delta_ns = clocksource_uv.mask * + (NSEC_PER_SEC / sn_rtc_cycles_per_second); + + rc = schedule_on_each_cpu(uv_rtc_register_clockevents); + if (rc) { + clocksource_unregister(&clocksource_uv); + generic_interrupt_extension = NULL; + uv_rtc_deallocate_timers(); + } + + return rc; +} +arch_initcall(uv_rtc_setup_clock); diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index fbfced6f680..5bf54e40c6e 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -275,3 +275,10 @@ ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), ASSERT((per_cpu__irq_stack_union == 0), "irq_stack_union is not at start of per-cpu area"); #endif + +#ifdef CONFIG_KEXEC +#include <asm/kexec.h> + +ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big") +#endif diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index f3a5305b8ad..9fe4ddaa8f6 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -348,6 +348,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, * flush_tlb_user() for both user and kernel mappings unless * the Page Global Enable (PGE) feature bit is set. */ *dx |= 0x00002000; + /* We also lie, and say we're family id 5. 6 or greater + * leads to a rdmsr in early_init_intel which we can't handle. + * Family ID is returned as bits 8-12 in ax. */ + *ax &= 0xFFFFF0FF; + *ax |= 0x00000500; break; case 0x80000000: /* Futureproof this a little: if they ask how much extended @@ -594,19 +599,21 @@ static void __init lguest_init_IRQ(void) /* Some systems map "vectors" to interrupts weirdly. Lguest has * a straightforward 1 to 1 mapping, so force that here. */ __get_cpu_var(vector_irq)[vector] = i; - if (vector != SYSCALL_VECTOR) { - set_intr_gate(vector, - interrupt[vector-FIRST_EXTERNAL_VECTOR]); - set_irq_chip_and_handler_name(i, &lguest_irq_controller, - handle_level_irq, - "level"); - } + if (vector != SYSCALL_VECTOR) + set_intr_gate(vector, interrupt[i]); } /* This call is required to set up for 4k stacks, where we have * separate stacks for hard and soft interrupts. */ irq_ctx_init(smp_processor_id()); } +void lguest_setup_irq(unsigned int irq) +{ + irq_to_desc_alloc_cpu(irq, 0); + set_irq_chip_and_handler_name(irq, &lguest_irq_controller, + handle_level_irq, "level"); +} + /* * Time. * diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c index 491e737ce54..aa098708877 100644 --- a/arch/x86/math-emu/fpu_aux.c +++ b/arch/x86/math-emu/fpu_aux.c @@ -30,20 +30,29 @@ static void fclex(void) } /* Needs to be externally visible */ -void finit(void) +void finit_task(struct task_struct *tsk) { - control_word = 0x037f; - partial_status = 0; - top = 0; /* We don't keep top in the status word internally. */ - fpu_tag_word = 0xffff; + struct i387_soft_struct *soft = &tsk->thread.xstate->soft; + struct address *oaddr, *iaddr; + soft->cwd = 0x037f; + soft->swd = 0; + soft->ftop = 0; /* We don't keep top in the status word internally. */ + soft->twd = 0xffff; /* The behaviour is different from that detailed in Section 15.1.6 of the Intel manual */ - operand_address.offset = 0; - operand_address.selector = 0; - instruction_address.offset = 0; - instruction_address.selector = 0; - instruction_address.opcode = 0; - no_ip_update = 1; + oaddr = (struct address *)&soft->foo; + oaddr->offset = 0; + oaddr->selector = 0; + iaddr = (struct address *)&soft->fip; + iaddr->offset = 0; + iaddr->selector = 0; + iaddr->opcode = 0; + soft->no_update = 1; +} + +void finit(void) +{ + finit_task(current); } /* diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 00f127c80b0..d11745334a6 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -158,7 +158,6 @@ EXPORT_SYMBOL(kunmap); EXPORT_SYMBOL(kmap_atomic); EXPORT_SYMBOL(kunmap_atomic); -#ifdef CONFIG_NUMA void __init set_highmem_pages_init(void) { struct zone *zone; @@ -182,11 +181,3 @@ void __init set_highmem_pages_init(void) } totalram_pages += totalhigh_pages; } -#else -void __init set_highmem_pages_init(void) -{ - add_highpages_with_active_regions(0, highstart_pfn, highend_pfn); - - totalram_pages += totalhigh_pages; -} -#endif /* CONFIG_NUMA */ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index ce6a722587d..15219e0d124 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1,8 +1,345 @@ +#include <linux/ioport.h> #include <linux/swap.h> + #include <asm/cacheflush.h> +#include <asm/e820.h> +#include <asm/init.h> #include <asm/page.h> +#include <asm/page_types.h> #include <asm/sections.h> #include <asm/system.h> +#include <asm/tlbflush.h> + +unsigned long __initdata e820_table_start; +unsigned long __meminitdata e820_table_end; +unsigned long __meminitdata e820_table_top; + +int after_bootmem; + +int direct_gbpages +#ifdef CONFIG_DIRECT_GBPAGES + = 1 +#endif +; + +static void __init find_early_table_space(unsigned long end, int use_pse, + int use_gbpages) +{ + unsigned long puds, pmds, ptes, tables, start; + + puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; + tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); + + if (use_gbpages) { + unsigned long extra; + + extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); + pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; + } else + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); + + if (use_pse) { + unsigned long extra; + + extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); +#ifdef CONFIG_X86_32 + extra += PMD_SIZE; +#endif + ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; + } else + ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + + tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); + +#ifdef CONFIG_X86_32 + /* for fixmap */ + tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); +#endif + + /* + * RED-PEN putting page tables only on node 0 could + * cause a hotspot and fill up ZONE_DMA. The page tables + * need roughly 0.5KB per GB. + */ +#ifdef CONFIG_X86_32 + start = 0x7000; + e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT, + tables, PAGE_SIZE); +#else /* CONFIG_X86_64 */ + start = 0x8000; + e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE); +#endif + if (e820_table_start == -1UL) + panic("Cannot find space for the kernel page tables"); + + e820_table_start >>= PAGE_SHIFT; + e820_table_end = e820_table_start; + e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); + + printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", + end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); +} + +struct map_range { + unsigned long start; + unsigned long end; + unsigned page_size_mask; +}; + +#ifdef CONFIG_X86_32 +#define NR_RANGE_MR 3 +#else /* CONFIG_X86_64 */ +#define NR_RANGE_MR 5 +#endif + +static int save_mr(struct map_range *mr, int nr_range, + unsigned long start_pfn, unsigned long end_pfn, + unsigned long page_size_mask) +{ + if (start_pfn < end_pfn) { + if (nr_range >= NR_RANGE_MR) + panic("run out of range for init_memory_mapping\n"); + mr[nr_range].start = start_pfn<<PAGE_SHIFT; + mr[nr_range].end = end_pfn<<PAGE_SHIFT; + mr[nr_range].page_size_mask = page_size_mask; + nr_range++; + } + + return nr_range; +} + +#ifdef CONFIG_X86_64 +static void __init init_gbpages(void) +{ + if (direct_gbpages && cpu_has_gbpages) + printk(KERN_INFO "Using GB pages for direct mapping\n"); + else + direct_gbpages = 0; +} +#else +static inline void init_gbpages(void) +{ +} +#endif + +/* + * Setup the direct mapping of the physical memory at PAGE_OFFSET. + * This runs before bootmem is initialized and gets pages directly from + * the physical memory. To access them they are temporarily mapped. + */ +unsigned long __init_refok init_memory_mapping(unsigned long start, + unsigned long end) +{ + unsigned long page_size_mask = 0; + unsigned long start_pfn, end_pfn; + unsigned long ret = 0; + unsigned long pos; + + struct map_range mr[NR_RANGE_MR]; + int nr_range, i; + int use_pse, use_gbpages; + + printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); + + if (!after_bootmem) + init_gbpages(); + +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. + * This will simplify cpa(), which otherwise needs to support splitting + * large pages into small in interrupt context, etc. + */ + use_pse = use_gbpages = 0; +#else + use_pse = cpu_has_pse; + use_gbpages = direct_gbpages; +#endif + +#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_PAE + set_nx(); + if (nx_enabled) + printk(KERN_INFO "NX (Execute Disable) protection: active\n"); +#endif + + /* Enable PSE if available */ + if (cpu_has_pse) + set_in_cr4(X86_CR4_PSE); + + /* Enable PGE if available */ + if (cpu_has_pge) { + set_in_cr4(X86_CR4_PGE); + __supported_pte_mask |= _PAGE_GLOBAL; + } +#endif + + if (use_gbpages) + page_size_mask |= 1 << PG_LEVEL_1G; + if (use_pse) + page_size_mask |= 1 << PG_LEVEL_2M; + + memset(mr, 0, sizeof(mr)); + nr_range = 0; + + /* head if not big page alignment ? */ + start_pfn = start >> PAGE_SHIFT; + pos = start_pfn << PAGE_SHIFT; +#ifdef CONFIG_X86_32 + /* + * Don't use a large page for the first 2/4MB of memory + * because there are often fixed size MTRRs in there + * and overlapping MTRRs into large pages can cause + * slowdowns. + */ + if (pos == 0) + end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); + else + end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); +#else /* CONFIG_X86_64 */ + end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); +#endif + if (end_pfn > (end >> PAGE_SHIFT)) + end_pfn = end >> PAGE_SHIFT; + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + pos = end_pfn << PAGE_SHIFT; + } + + /* big page (2M) range */ + start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); +#ifdef CONFIG_X86_32 + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); +#else /* CONFIG_X86_64 */ + end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) + << (PUD_SHIFT - PAGE_SHIFT); + if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) + end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); +#endif + + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1<<PG_LEVEL_2M)); + pos = end_pfn << PAGE_SHIFT; + } + +#ifdef CONFIG_X86_64 + /* big page (1G) range */ + start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) + << (PUD_SHIFT - PAGE_SHIFT); + end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & + ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); + pos = end_pfn << PAGE_SHIFT; + } + + /* tail is not big page (1G) alignment */ + start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1<<PG_LEVEL_2M)); + pos = end_pfn << PAGE_SHIFT; + } +#endif + + /* tail is not big page (2M) alignment */ + start_pfn = pos>>PAGE_SHIFT; + end_pfn = end>>PAGE_SHIFT; + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + + /* try to merge same page size and continuous */ + for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { + unsigned long old_start; + if (mr[i].end != mr[i+1].start || + mr[i].page_size_mask != mr[i+1].page_size_mask) + continue; + /* move it */ + old_start = mr[i].start; + memmove(&mr[i], &mr[i+1], + (nr_range - 1 - i) * sizeof(struct map_range)); + mr[i--].start = old_start; + nr_range--; + } + + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG " %010lx - %010lx page %s\n", + mr[i].start, mr[i].end, + (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":( + (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); + + /* + * Find space for the kernel direct mapping tables. + * + * Later we should allocate these tables in the local node of the + * memory mapped. Unfortunately this is done currently before the + * nodes are discovered. + */ + if (!after_bootmem) + find_early_table_space(end, use_pse, use_gbpages); + +#ifdef CONFIG_X86_32 + for (i = 0; i < nr_range; i++) + kernel_physical_mapping_init(mr[i].start, mr[i].end, + mr[i].page_size_mask); + ret = end; +#else /* CONFIG_X86_64 */ + for (i = 0; i < nr_range; i++) + ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, + mr[i].page_size_mask); +#endif + +#ifdef CONFIG_X86_32 + early_ioremap_page_table_range_init(); + + load_cr3(swapper_pg_dir); +#endif + +#ifdef CONFIG_X86_64 + if (!after_bootmem) + mmu_cr4_features = read_cr4(); +#endif + __flush_tlb_all(); + + if (!after_bootmem && e820_table_end > e820_table_start) + reserve_early(e820_table_start << PAGE_SHIFT, + e820_table_end << PAGE_SHIFT, "PGTABLE"); + + if (!after_bootmem) + early_memtest(start, end); + + return ret >> PAGE_SHIFT; +} + + +/* + * devmem_is_allowed() checks to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * + * On x86, access has to be given to the first megabyte of ram because that area + * contains bios code and data regions used by X and dosemu and similar apps. + * Access has to be given to non-kernel-ram areas as well, these contain the PCI + * mmio resources as well as potential bios/acpi data regions. + */ +int devmem_is_allowed(unsigned long pagenr) +{ + if (pagenr <= 256) + return 1; + if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) + return 0; + if (!page_is_ram(pagenr)) + return 1; + return 0; +} void free_init_pages(char *what, unsigned long begin, unsigned long end) { @@ -47,3 +384,10 @@ void free_initmem(void) (unsigned long)(&__init_begin), (unsigned long)(&__init_end)); } + +#ifdef CONFIG_BLK_DEV_INITRD +void free_initrd_mem(unsigned long start, unsigned long end) +{ + free_init_pages("initrd memory", start, end); +} +#endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 0b087dcd2c1..db81e9a8556 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -49,6 +49,7 @@ #include <asm/paravirt.h> #include <asm/setup.h> #include <asm/cacheflush.h> +#include <asm/init.h> unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; @@ -58,19 +59,14 @@ unsigned long highstart_pfn, highend_pfn; static noinline int do_test_wp_bit(void); - -static unsigned long __initdata table_start; -static unsigned long __meminitdata table_end; -static unsigned long __meminitdata table_top; - -static int __initdata after_init_bootmem; +bool __read_mostly __vmalloc_start_set = false; static __init void *alloc_low_page(void) { - unsigned long pfn = table_end++; + unsigned long pfn = e820_table_end++; void *adr; - if (pfn >= table_top) + if (pfn >= e820_table_top) panic("alloc_low_page: ran out of memory"); adr = __va(pfn * PAGE_SIZE); @@ -90,7 +86,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) #ifdef CONFIG_X86_PAE if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { - if (after_init_bootmem) + if (after_bootmem) pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); else pmd_table = (pmd_t *)alloc_low_page(); @@ -117,7 +113,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { pte_t *page_table = NULL; - if (after_init_bootmem) { + if (after_bootmem) { #ifdef CONFIG_DEBUG_PAGEALLOC page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); #endif @@ -135,6 +131,23 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) return pte_offset_kernel(pmd, 0); } +pmd_t * __init populate_extra_pmd(unsigned long vaddr) +{ + int pgd_idx = pgd_index(vaddr); + int pmd_idx = pmd_index(vaddr); + + return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx; +} + +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + int pte_idx = pte_index(vaddr); + pmd_t *pmd; + + pmd = populate_extra_pmd(vaddr); + return one_page_table_init(pmd) + pte_idx; +} + static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, unsigned long vaddr, pte_t *lastpte) { @@ -151,12 +164,12 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, if (pmd_idx_kmap_begin != pmd_idx_kmap_end && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end - && ((__pa(pte) >> PAGE_SHIFT) < table_start - || (__pa(pte) >> PAGE_SHIFT) >= table_end)) { + && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start + || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) { pte_t *newpte; int i; - BUG_ON(after_init_bootmem); + BUG_ON(after_bootmem); newpte = alloc_low_page(); for (i = 0; i < PTRS_PER_PTE; i++) set_pte(newpte + i, pte[i]); @@ -225,11 +238,14 @@ static inline int is_kernel_text(unsigned long addr) * of max_low_pfn pages, by creating page tables starting from address * PAGE_OFFSET: */ -static void __init kernel_physical_mapping_init(pgd_t *pgd_base, - unsigned long start_pfn, - unsigned long end_pfn, - int use_pse) +unsigned long __init +kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask) { + int use_pse = page_size_mask == (1<<PG_LEVEL_2M); + unsigned long start_pfn, end_pfn; + pgd_t *pgd_base = swapper_pg_dir; int pgd_idx, pmd_idx, pte_ofs; unsigned long pfn; pgd_t *pgd; @@ -238,6 +254,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, unsigned pages_2m, pages_4k; int mapping_iter; + start_pfn = start >> PAGE_SHIFT; + end_pfn = end >> PAGE_SHIFT; + /* * First iteration will setup identity mapping using large/small pages * based on use_pse, with other attributes same as set by @@ -352,26 +371,6 @@ repeat: mapping_iter = 2; goto repeat; } -} - -/* - * devmem_is_allowed() checks to see if /dev/mem access to a certain address - * is valid. The argument is a physical page number. - * - * - * On x86, access has to be given to the first megabyte of ram because that area - * contains bios code and data regions used by X and dosemu and similar apps. - * Access has to be given to non-kernel-ram areas as well, these contain the PCI - * mmio resources as well as potential bios/acpi data regions. - */ -int devmem_is_allowed(unsigned long pagenr) -{ - if (pagenr <= 256) - return 1; - if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) - return 0; - if (!page_is_ram(pagenr)) - return 1; return 0; } @@ -528,8 +527,9 @@ void __init native_pagetable_setup_done(pgd_t *base) * be partially populated, and so it avoids stomping on any existing * mappings. */ -static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base) +void __init early_ioremap_page_table_range_init(void) { + pgd_t *pgd_base = swapper_pg_dir; unsigned long vaddr, end; /* @@ -624,7 +624,7 @@ static int __init noexec_setup(char *str) } early_param("noexec", noexec_setup); -static void __init set_nx(void) +void __init set_nx(void) { unsigned int v[4], l, h; @@ -776,6 +776,8 @@ void __init initmem_init(unsigned long start_pfn, #ifdef CONFIG_FLATMEM max_mapnr = num_physpages; #endif + __vmalloc_start_set = true; + printk(KERN_NOTICE "%ldMB LOWMEM available.\n", pages_to_mb(max_low_pfn)); @@ -797,176 +799,66 @@ static void __init zone_sizes_init(void) free_area_init_nodes(max_zone_pfns); } +static unsigned long __init setup_node_bootmem(int nodeid, + unsigned long start_pfn, + unsigned long end_pfn, + unsigned long bootmap) +{ + unsigned long bootmap_size; + + /* don't touch min_low_pfn */ + bootmap_size = init_bootmem_node(NODE_DATA(nodeid), + bootmap >> PAGE_SHIFT, + start_pfn, end_pfn); + printk(KERN_INFO " node %d low ram: %08lx - %08lx\n", + nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); + printk(KERN_INFO " node %d bootmap %08lx - %08lx\n", + nodeid, bootmap, bootmap + bootmap_size); + free_bootmem_with_active_regions(nodeid, end_pfn); + early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); + + return bootmap + bootmap_size; +} + void __init setup_bootmem_allocator(void) { - int i; + int nodeid; unsigned long bootmap_size, bootmap; /* * Initialize the boot-time allocator (with low memory only): */ bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT; - bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT, - max_pfn_mapped<<PAGE_SHIFT, bootmap_size, + bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size, PAGE_SIZE); if (bootmap == -1L) panic("Cannot find bootmem map of size %ld\n", bootmap_size); reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); - /* don't touch min_low_pfn */ - bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT, - min_low_pfn, max_low_pfn); printk(KERN_INFO " mapped low ram: 0 - %08lx\n", max_pfn_mapped<<PAGE_SHIFT); - printk(KERN_INFO " low ram: %08lx - %08lx\n", - min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT); - printk(KERN_INFO " bootmap %08lx - %08lx\n", - bootmap, bootmap + bootmap_size); - for_each_online_node(i) - free_bootmem_with_active_regions(i, max_low_pfn); - early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT); - - after_init_bootmem = 1; -} - -static void __init find_early_table_space(unsigned long end, int use_pse) -{ - unsigned long puds, pmds, ptes, tables, start; - - puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); - - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); - - if (use_pse) { - unsigned long extra; + printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); - extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); - extra += PMD_SIZE; - ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; - } else - ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + for_each_online_node(nodeid) { + unsigned long start_pfn, end_pfn; - tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); - - /* for fixmap */ - tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); - - /* - * RED-PEN putting page tables only on node 0 could - * cause a hotspot and fill up ZONE_DMA. The page tables - * need roughly 0.5KB per GB. - */ - start = 0x7000; - table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT, - tables, PAGE_SIZE); - if (table_start == -1UL) - panic("Cannot find space for the kernel page tables"); - - table_start >>= PAGE_SHIFT; - table_end = table_start; - table_top = table_start + (tables>>PAGE_SHIFT); - - printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, table_start << PAGE_SHIFT, - (table_start << PAGE_SHIFT) + tables); -} - -unsigned long __init_refok init_memory_mapping(unsigned long start, - unsigned long end) -{ - pgd_t *pgd_base = swapper_pg_dir; - unsigned long start_pfn, end_pfn; - unsigned long big_page_start; -#ifdef CONFIG_DEBUG_PAGEALLOC - /* - * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. - * This will simplify cpa(), which otherwise needs to support splitting - * large pages into small in interrupt context, etc. - */ - int use_pse = 0; +#ifdef CONFIG_NEED_MULTIPLE_NODES + start_pfn = node_start_pfn[nodeid]; + end_pfn = node_end_pfn[nodeid]; + if (start_pfn > max_low_pfn) + continue; + if (end_pfn > max_low_pfn) + end_pfn = max_low_pfn; #else - int use_pse = cpu_has_pse; -#endif - - /* - * Find space for the kernel direct mapping tables. - */ - if (!after_init_bootmem) - find_early_table_space(end, use_pse); - -#ifdef CONFIG_X86_PAE - set_nx(); - if (nx_enabled) - printk(KERN_INFO "NX (Execute Disable) protection: active\n"); + start_pfn = 0; + end_pfn = max_low_pfn; #endif - - /* Enable PSE if available */ - if (cpu_has_pse) - set_in_cr4(X86_CR4_PSE); - - /* Enable PGE if available */ - if (cpu_has_pge) { - set_in_cr4(X86_CR4_PGE); - __supported_pte_mask |= _PAGE_GLOBAL; - } - - /* - * Don't use a large page for the first 2/4MB of memory - * because there are often fixed size MTRRs in there - * and overlapping MTRRs into large pages can cause - * slowdowns. - */ - big_page_start = PMD_SIZE; - - if (start < big_page_start) { - start_pfn = start >> PAGE_SHIFT; - end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT); - } else { - /* head is not big page alignment ? */ - start_pfn = start >> PAGE_SHIFT; - end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); - } - if (start_pfn < end_pfn) - kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0); - - /* big page range */ - start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); - if (start_pfn < (big_page_start >> PAGE_SHIFT)) - start_pfn = big_page_start >> PAGE_SHIFT; - end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); - if (start_pfn < end_pfn) - kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, - use_pse); - - /* tail is not big page alignment ? */ - start_pfn = end_pfn; - if (start_pfn > (big_page_start>>PAGE_SHIFT)) { - end_pfn = end >> PAGE_SHIFT; - if (start_pfn < end_pfn) - kernel_physical_mapping_init(pgd_base, start_pfn, - end_pfn, 0); + bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn, + bootmap); } - early_ioremap_page_table_range_init(pgd_base); - - load_cr3(swapper_pg_dir); - - __flush_tlb_all(); - - if (!after_init_bootmem) - reserve_early(table_start << PAGE_SHIFT, - table_end << PAGE_SHIFT, "PGTABLE"); - - if (!after_init_bootmem) - early_memtest(start, end); - - return end >> PAGE_SHIFT; + after_bootmem = 1; } - /* * paging_init() sets up the page tables - note that the first 8MB are * already mapped by head.S. @@ -1200,13 +1092,6 @@ void mark_rodata_ro(void) } #endif -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_init_pages("initrd memory", start, end); -} -#endif - int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, int flags) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 724e537432e..54efa57d1c0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -48,6 +48,7 @@ #include <asm/kdebug.h> #include <asm/numa.h> #include <asm/cacheflush.h> +#include <asm/init.h> /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. @@ -61,12 +62,6 @@ static unsigned long dma_reserve __initdata; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); -int direct_gbpages -#ifdef CONFIG_DIRECT_GBPAGES - = 1 -#endif -; - static int __init parse_direct_gbpages_off(char *arg) { direct_gbpages = 0; @@ -87,12 +82,10 @@ early_param("gbpages", parse_direct_gbpages_on); * around without checking the pgd every time. */ -int after_bootmem; - pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; EXPORT_SYMBOL_GPL(__supported_pte_mask); -static int do_not_nx __cpuinitdata; +static int disable_nx __cpuinitdata; /* * noexec=on|off @@ -107,9 +100,9 @@ static int __init nonx_setup(char *str) return -EINVAL; if (!strncmp(str, "on", 2)) { __supported_pte_mask |= _PAGE_NX; - do_not_nx = 0; + disable_nx = 0; } else if (!strncmp(str, "off", 3)) { - do_not_nx = 1; + disable_nx = 1; __supported_pte_mask &= ~_PAGE_NX; } return 0; @@ -121,7 +114,7 @@ void __cpuinit check_efer(void) unsigned long efer; rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || do_not_nx) + if (!(efer & EFER_NX) || disable_nx) __supported_pte_mask &= ~_PAGE_NX; } @@ -168,34 +161,51 @@ static __ref void *spp_getpage(void) return ptr; } -void -set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) { - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + if (pgd_none(*pgd)) { + pud_t *pud = (pud_t *)spp_getpage(); + pgd_populate(&init_mm, pgd, pud); + if (pud != pud_offset(pgd, 0)) + printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", + pud, pud_offset(pgd, 0)); + } + return pud_offset(pgd, vaddr); +} - pud = pud_page + pud_index(vaddr); +static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) +{ if (pud_none(*pud)) { - pmd = (pmd_t *) spp_getpage(); + pmd_t *pmd = (pmd_t *) spp_getpage(); pud_populate(&init_mm, pud, pmd); - if (pmd != pmd_offset(pud, 0)) { + if (pmd != pmd_offset(pud, 0)) printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", - pmd, pmd_offset(pud, 0)); - return; - } + pmd, pmd_offset(pud, 0)); } - pmd = pmd_offset(pud, vaddr); + return pmd_offset(pud, vaddr); +} + +static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) +{ if (pmd_none(*pmd)) { - pte = (pte_t *) spp_getpage(); + pte_t *pte = (pte_t *) spp_getpage(); pmd_populate_kernel(&init_mm, pmd, pte); - if (pte != pte_offset_kernel(pmd, 0)) { + if (pte != pte_offset_kernel(pmd, 0)) printk(KERN_ERR "PAGETABLE BUG #02!\n"); - return; - } } + return pte_offset_kernel(pmd, vaddr); +} + +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pud = pud_page + pud_index(vaddr); + pmd = fill_pmd(pud, vaddr); + pte = fill_pte(pmd, vaddr); - pte = pte_offset_kernel(pmd, vaddr); set_pte(pte, new_pte); /* @@ -205,8 +215,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) __flush_tlb_one(vaddr); } -void -set_pte_vaddr(unsigned long vaddr, pte_t pteval) +void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; pud_t *pud_page; @@ -223,6 +232,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval) set_pte_vaddr_pud(pud_page, vaddr, pteval); } +pmd_t * __init populate_extra_pmd(unsigned long vaddr) +{ + pgd_t *pgd; + pud_t *pud; + + pgd = pgd_offset_k(vaddr); + pud = fill_pud(pgd, vaddr); + return fill_pmd(pud, vaddr); +} + +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + pmd_t *pmd; + + pmd = populate_extra_pmd(vaddr); + return fill_pte(pmd, vaddr); +} + /* * Create large page table mappings for a range of physical addresses. */ @@ -291,13 +318,9 @@ void __init cleanup_highmap(void) } } -static unsigned long __initdata table_start; -static unsigned long __meminitdata table_end; -static unsigned long __meminitdata table_top; - static __ref void *alloc_low_page(unsigned long *phys) { - unsigned long pfn = table_end++; + unsigned long pfn = e820_table_end++; void *adr; if (after_bootmem) { @@ -307,7 +330,7 @@ static __ref void *alloc_low_page(unsigned long *phys) return adr; } - if (pfn >= table_top) + if (pfn >= e820_table_top) panic("alloc_low_page: ran out of memory"); adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); @@ -547,58 +570,10 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, return phys_pud_init(pud, addr, end, page_size_mask); } -static void __init find_early_table_space(unsigned long end, int use_pse, - int use_gbpages) -{ - unsigned long puds, pmds, ptes, tables, start; - - puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); - if (use_gbpages) { - unsigned long extra; - extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); - pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; - } else - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); - - if (use_pse) { - unsigned long extra; - extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); - ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; - } else - ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; - tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); - - /* - * RED-PEN putting page tables only on node 0 could - * cause a hotspot and fill up ZONE_DMA. The page tables - * need roughly 0.5KB per GB. - */ - start = 0x8000; - table_start = find_e820_area(start, end, tables, PAGE_SIZE); - if (table_start == -1UL) - panic("Cannot find space for the kernel page tables"); - - table_start >>= PAGE_SHIFT; - table_end = table_start; - table_top = table_start + (tables >> PAGE_SHIFT); - - printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT); -} - -static void __init init_gbpages(void) -{ - if (direct_gbpages && cpu_has_gbpages) - printk(KERN_INFO "Using GB pages for direct mapping\n"); - else - direct_gbpages = 0; -} - -static unsigned long __meminit kernel_physical_mapping_init(unsigned long start, - unsigned long end, - unsigned long page_size_mask) +unsigned long __init +kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask) { unsigned long next, last_map_addr = end; @@ -635,176 +610,6 @@ static unsigned long __meminit kernel_physical_mapping_init(unsigned long start, return last_map_addr; } -struct map_range { - unsigned long start; - unsigned long end; - unsigned page_size_mask; -}; - -#define NR_RANGE_MR 5 - -static int save_mr(struct map_range *mr, int nr_range, - unsigned long start_pfn, unsigned long end_pfn, - unsigned long page_size_mask) -{ - - if (start_pfn < end_pfn) { - if (nr_range >= NR_RANGE_MR) - panic("run out of range for init_memory_mapping\n"); - mr[nr_range].start = start_pfn<<PAGE_SHIFT; - mr[nr_range].end = end_pfn<<PAGE_SHIFT; - mr[nr_range].page_size_mask = page_size_mask; - nr_range++; - } - - return nr_range; -} - -/* - * Setup the direct mapping of the physical memory at PAGE_OFFSET. - * This runs before bootmem is initialized and gets pages directly from - * the physical memory. To access them they are temporarily mapped. - */ -unsigned long __init_refok init_memory_mapping(unsigned long start, - unsigned long end) -{ - unsigned long last_map_addr = 0; - unsigned long page_size_mask = 0; - unsigned long start_pfn, end_pfn; - unsigned long pos; - - struct map_range mr[NR_RANGE_MR]; - int nr_range, i; - int use_pse, use_gbpages; - - printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); - - /* - * Find space for the kernel direct mapping tables. - * - * Later we should allocate these tables in the local node of the - * memory mapped. Unfortunately this is done currently before the - * nodes are discovered. - */ - if (!after_bootmem) - init_gbpages(); - -#ifdef CONFIG_DEBUG_PAGEALLOC - /* - * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. - * This will simplify cpa(), which otherwise needs to support splitting - * large pages into small in interrupt context, etc. - */ - use_pse = use_gbpages = 0; -#else - use_pse = cpu_has_pse; - use_gbpages = direct_gbpages; -#endif - - if (use_gbpages) - page_size_mask |= 1 << PG_LEVEL_1G; - if (use_pse) - page_size_mask |= 1 << PG_LEVEL_2M; - - memset(mr, 0, sizeof(mr)); - nr_range = 0; - - /* head if not big page alignment ?*/ - start_pfn = start >> PAGE_SHIFT; - pos = start_pfn << PAGE_SHIFT; - end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); - if (end_pfn > (end >> PAGE_SHIFT)) - end_pfn = end >> PAGE_SHIFT; - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); - pos = end_pfn << PAGE_SHIFT; - } - - /* big page (2M) range*/ - start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); - end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) - end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, - page_size_mask & (1<<PG_LEVEL_2M)); - pos = end_pfn << PAGE_SHIFT; - } - - /* big page (1G) range */ - start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, - page_size_mask & - ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); - pos = end_pfn << PAGE_SHIFT; - } - - /* tail is not big page (1G) alignment */ - start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, - page_size_mask & (1<<PG_LEVEL_2M)); - pos = end_pfn << PAGE_SHIFT; - } - - /* tail is not big page (2M) alignment */ - start_pfn = pos>>PAGE_SHIFT; - end_pfn = end>>PAGE_SHIFT; - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); - - /* try to merge same page size and continuous */ - for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { - unsigned long old_start; - if (mr[i].end != mr[i+1].start || - mr[i].page_size_mask != mr[i+1].page_size_mask) - continue; - /* move it */ - old_start = mr[i].start; - memmove(&mr[i], &mr[i+1], - (nr_range - 1 - i) * sizeof (struct map_range)); - mr[i--].start = old_start; - nr_range--; - } - - for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG " %010lx - %010lx page %s\n", - mr[i].start, mr[i].end, - (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":( - (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); - - if (!after_bootmem) - find_early_table_space(end, use_pse, use_gbpages); - - for (i = 0; i < nr_range; i++) - last_map_addr = kernel_physical_mapping_init( - mr[i].start, mr[i].end, - mr[i].page_size_mask); - - if (!after_bootmem) - mmu_cr4_features = read_cr4(); - __flush_tlb_all(); - - if (!after_bootmem && table_end > table_start) - reserve_early(table_start << PAGE_SHIFT, - table_end << PAGE_SHIFT, "PGTABLE"); - - printk(KERN_INFO "last_map_addr: %lx end: %lx\n", - last_map_addr, end); - - if (!after_bootmem) - early_memtest(start, end); - - return last_map_addr >> PAGE_SHIFT; -} - #ifndef CONFIG_NUMA void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) { @@ -876,28 +681,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif /* CONFIG_MEMORY_HOTPLUG */ -/* - * devmem_is_allowed() checks to see if /dev/mem access to a certain address - * is valid. The argument is a physical page number. - * - * - * On x86, access has to be given to the first megabyte of ram because that area - * contains bios code and data regions used by X and dosemu and similar apps. - * Access has to be given to non-kernel-ram areas as well, these contain the PCI - * mmio resources as well as potential bios/acpi data regions. - */ -int devmem_is_allowed(unsigned long pagenr) -{ - if (pagenr <= 256) - return 1; - if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) - return 0; - if (!page_is_ram(pagenr)) - return 1; - return 0; -} - - static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, kcore_vsyscall; @@ -985,13 +768,6 @@ void mark_rodata_ro(void) #endif -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_init_pages("initrd memory", start, end); -} -#endif - int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, int flags) { diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 433f7bd4648..aca924a30ee 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -38,8 +38,7 @@ unsigned long __phys_addr(unsigned long x) } else { VIRTUAL_BUG_ON(x < PAGE_OFFSET); x -= PAGE_OFFSET; - VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM : - !phys_addr_valid(x)); + VIRTUAL_BUG_ON(!phys_addr_valid(x)); } return x; } @@ -56,10 +55,8 @@ bool __virt_addr_valid(unsigned long x) if (x < PAGE_OFFSET) return false; x -= PAGE_OFFSET; - if (system_state == SYSTEM_BOOTING ? - x > MAXMEM : !phys_addr_valid(x)) { + if (!phys_addr_valid(x)) return false; - } } return pfn_valid(x >> PAGE_SHIFT); @@ -76,10 +73,9 @@ static inline int phys_addr_valid(unsigned long addr) #ifdef CONFIG_DEBUG_VIRTUAL unsigned long __phys_addr(unsigned long x) { - /* VMALLOC_* aren't constants; not available at the boot time */ + /* VMALLOC_* aren't constants */ VIRTUAL_BUG_ON(x < PAGE_OFFSET); - VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING && - is_vmalloc_addr((void *) x)); + VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); return x - PAGE_OFFSET; } EXPORT_SYMBOL(__phys_addr); @@ -89,7 +85,9 @@ bool __virt_addr_valid(unsigned long x) { if (x < PAGE_OFFSET) return false; - if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x)) + if (__vmalloc_start_set && is_vmalloc_addr((void *) x)) + return false; + if (x >= FIXADDR_START) return false; return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); } @@ -508,13 +506,19 @@ static inline pte_t * __init early_ioremap_pte(unsigned long addr) return &bm_pte[pte_index(addr)]; } +static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; + void __init early_ioremap_init(void) { pmd_t *pmd; + int i; if (early_ioremap_debug) printk(KERN_INFO "early_ioremap_init()\n"); + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + slot_virt[i] = fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); memset(bm_pte, 0, sizeof(bm_pte)); pmd_populate_kernel(&init_mm, pmd, bm_pte); @@ -581,6 +585,7 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx) static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; + static int __init check_early_ioremap_leak(void) { int count = 0; @@ -602,7 +607,8 @@ static int __init check_early_ioremap_leak(void) } late_initcall(check_early_ioremap_leak); -static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) +static void __init __iomem * +__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) { unsigned long offset, last_addr; unsigned int nrpages; @@ -668,9 +674,9 @@ static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned lo --nrpages; } if (early_ioremap_debug) - printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); + printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]); - prev_map[slot] = (void __iomem *)(offset + fix_to_virt(idx0)); + prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); return prev_map[slot]; } @@ -738,8 +744,3 @@ void __init early_iounmap(void __iomem *addr, unsigned long size) } prev_map[slot] = NULL; } - -void __this_fixmap_does_not_exist(void) -{ - WARN_ON(1); -} diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 93d82038af4..6a518dd08a3 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -32,11 +32,14 @@ struct kmmio_fault_page { struct list_head list; struct kmmio_fault_page *release_next; unsigned long page; /* location of the fault page */ + bool old_presence; /* page presence prior to arming */ + bool armed; /* * Number of times this page has been registered as a part * of a probe. If zero, page is disarmed and this may be freed. - * Used only by writers (RCU). + * Used only by writers (RCU) and post_kmmio_handler(). + * Protected by kmmio_lock, when linked into kmmio_page_table. */ int count; }; @@ -105,57 +108,85 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) return NULL; } -static void set_page_present(unsigned long addr, bool present, - unsigned int *pglevel) +static void set_pmd_presence(pmd_t *pmd, bool present, bool *old) +{ + pmdval_t v = pmd_val(*pmd); + *old = !!(v & _PAGE_PRESENT); + v &= ~_PAGE_PRESENT; + if (present) + v |= _PAGE_PRESENT; + set_pmd(pmd, __pmd(v)); +} + +static void set_pte_presence(pte_t *pte, bool present, bool *old) +{ + pteval_t v = pte_val(*pte); + *old = !!(v & _PAGE_PRESENT); + v &= ~_PAGE_PRESENT; + if (present) + v |= _PAGE_PRESENT; + set_pte_atomic(pte, __pte(v)); +} + +static int set_page_presence(unsigned long addr, bool present, bool *old) { - pteval_t pteval; - pmdval_t pmdval; unsigned int level; - pmd_t *pmd; pte_t *pte = lookup_address(addr, &level); if (!pte) { pr_err("kmmio: no pte for page 0x%08lx\n", addr); - return; + return -1; } - if (pglevel) - *pglevel = level; - switch (level) { case PG_LEVEL_2M: - pmd = (pmd_t *)pte; - pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT; - if (present) - pmdval |= _PAGE_PRESENT; - set_pmd(pmd, __pmd(pmdval)); + set_pmd_presence((pmd_t *)pte, present, old); break; - case PG_LEVEL_4K: - pteval = pte_val(*pte) & ~_PAGE_PRESENT; - if (present) - pteval |= _PAGE_PRESENT; - set_pte_atomic(pte, __pte(pteval)); + set_pte_presence(pte, present, old); break; - default: pr_err("kmmio: unexpected page level 0x%x.\n", level); - return; + return -1; } __flush_tlb_one(addr); + return 0; } -/** Mark the given page as not present. Access to it will trigger a fault. */ -static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) +/* + * Mark the given page as not present. Access to it will trigger a fault. + * + * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the + * protection is ignored here. RCU read lock is assumed held, so the struct + * will not disappear unexpectedly. Furthermore, the caller must guarantee, + * that double arming the same virtual address (page) cannot occur. + * + * Double disarming on the other hand is allowed, and may occur when a fault + * and mmiotrace shutdown happen simultaneously. + */ +static int arm_kmmio_fault_page(struct kmmio_fault_page *f) { - set_page_present(page & PAGE_MASK, false, pglevel); + int ret; + WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n"); + if (f->armed) { + pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n", + f->page, f->count, f->old_presence); + } + ret = set_page_presence(f->page, false, &f->old_presence); + WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page); + f->armed = true; + return ret; } -/** Mark the given page as present. */ -static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) +/** Restore the given page to saved presence state. */ +static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) { - set_page_present(page & PAGE_MASK, true, pglevel); + bool tmp; + int ret = set_page_presence(f->page, f->old_presence, &tmp); + WARN_ONCE(ret < 0, + KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page); + f->armed = false; } /* @@ -202,28 +233,32 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) ctx = &get_cpu_var(kmmio_ctx); if (ctx->active) { - disarm_kmmio_fault_page(faultpage->page, NULL); if (addr == ctx->addr) { /* - * On SMP we sometimes get recursive probe hits on the - * same address. Context is already saved, fall out. + * A second fault on the same page means some other + * condition needs handling by do_page_fault(), the + * page really not being present is the most common. */ - pr_debug("kmmio: duplicate probe hit on CPU %d, for " - "address 0x%08lx.\n", - smp_processor_id(), addr); - ret = 1; - goto no_kmmio_ctx; - } - /* - * Prevent overwriting already in-flight context. - * This should not happen, let's hope disarming at least - * prevents a panic. - */ - pr_emerg("kmmio: recursive probe hit on CPU %d, " + pr_debug("kmmio: secondary hit for 0x%08lx CPU %d.\n", + addr, smp_processor_id()); + + if (!faultpage->old_presence) + pr_info("kmmio: unexpected secondary hit for " + "address 0x%08lx on CPU %d.\n", addr, + smp_processor_id()); + } else { + /* + * Prevent overwriting already in-flight context. + * This should not happen, let's hope disarming at + * least prevents a panic. + */ + pr_emerg("kmmio: recursive probe hit on CPU %d, " "for address 0x%08lx. Ignoring.\n", smp_processor_id(), addr); - pr_emerg("kmmio: previous hit was at 0x%08lx.\n", - ctx->addr); + pr_emerg("kmmio: previous hit was at 0x%08lx.\n", + ctx->addr); + disarm_kmmio_fault_page(faultpage); + } goto no_kmmio_ctx; } ctx->active++; @@ -244,7 +279,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) regs->flags &= ~X86_EFLAGS_IF; /* Now we set present bit in PTE and single step. */ - disarm_kmmio_fault_page(ctx->fpage->page, NULL); + disarm_kmmio_fault_page(ctx->fpage); /* * If another cpu accesses the same page while we are stepping, @@ -275,7 +310,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); if (!ctx->active) { - pr_debug("kmmio: spurious debug trap on CPU %d.\n", + pr_warning("kmmio: spurious debug trap on CPU %d.\n", smp_processor_id()); goto out; } @@ -283,7 +318,11 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) if (ctx->probe && ctx->probe->post_handler) ctx->probe->post_handler(ctx->probe, condition, regs); - arm_kmmio_fault_page(ctx->fpage->page, NULL); + /* Prevent racing against release_kmmio_fault_page(). */ + spin_lock(&kmmio_lock); + if (ctx->fpage->count) + arm_kmmio_fault_page(ctx->fpage); + spin_unlock(&kmmio_lock); regs->flags &= ~X86_EFLAGS_TF; regs->flags |= ctx->saved_flags; @@ -315,20 +354,24 @@ static int add_kmmio_fault_page(unsigned long page) f = get_kmmio_fault_page(page); if (f) { if (!f->count) - arm_kmmio_fault_page(f->page, NULL); + arm_kmmio_fault_page(f); f->count++; return 0; } - f = kmalloc(sizeof(*f), GFP_ATOMIC); + f = kzalloc(sizeof(*f), GFP_ATOMIC); if (!f) return -1; f->count = 1; f->page = page; - list_add_rcu(&f->list, kmmio_page_list(f->page)); - arm_kmmio_fault_page(f->page, NULL); + if (arm_kmmio_fault_page(f)) { + kfree(f); + return -1; + } + + list_add_rcu(&f->list, kmmio_page_list(f->page)); return 0; } @@ -347,7 +390,7 @@ static void release_kmmio_fault_page(unsigned long page, f->count--; BUG_ON(f->count < 0); if (!f->count) { - disarm_kmmio_fault_page(f->page, NULL); + disarm_kmmio_fault_page(f); f->release_next = *release_list; *release_list = f; } @@ -408,23 +451,24 @@ static void rcu_free_kmmio_fault_pages(struct rcu_head *head) static void remove_kmmio_fault_pages(struct rcu_head *head) { - struct kmmio_delayed_release *dr = container_of( - head, - struct kmmio_delayed_release, - rcu); + struct kmmio_delayed_release *dr = + container_of(head, struct kmmio_delayed_release, rcu); struct kmmio_fault_page *p = dr->release_list; struct kmmio_fault_page **prevp = &dr->release_list; unsigned long flags; + spin_lock_irqsave(&kmmio_lock, flags); while (p) { - if (!p->count) + if (!p->count) { list_del_rcu(&p->list); - else + prevp = &p->release_next; + } else { *prevp = p->release_next; - prevp = &p->release_next; + } p = p->release_next; } spin_unlock_irqrestore(&kmmio_lock, flags); + /* This is the real RCU destroy call. */ call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages); } diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 0bcd7883d03..605c8be0621 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -100,6 +100,9 @@ static int __init parse_memtest(char *arg) { if (arg) memtest_pattern = simple_strtoul(arg, NULL, 0); + else + memtest_pattern = ARRAY_SIZE(patterns); + return 0; } diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 451fe95a035..3daefa04ace 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -416,10 +416,11 @@ void __init initmem_init(unsigned long start_pfn, for_each_online_node(nid) propagate_e820_map_node(nid); - for_each_online_node(nid) + for_each_online_node(nid) { memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); + NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; + } - NODE_DATA(0)->bdata = &bootmem_node_data[0]; setup_bootmem_allocator(); } diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index ab50a8d7402..427fd1b56df 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -1,5 +1,5 @@ /* - * Written by Pekka Paalanen, 2008 <pq@iki.fi> + * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi> */ #include <linux/module.h> #include <linux/io.h> @@ -9,35 +9,74 @@ static unsigned long mmio_address; module_param(mmio_address, ulong, 0); -MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); +MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB " + "(or 8 MB if read_far is non-zero)."); + +static unsigned long read_far = 0x400100; +module_param(read_far, ulong, 0); +MODULE_PARM_DESC(read_far, " Offset of a 32-bit read within 8 MB " + "(default: 0x400100)."); + +static unsigned v16(unsigned i) +{ + return i * 12 + 7; +} + +static unsigned v32(unsigned i) +{ + return i * 212371 + 13; +} static void do_write_test(void __iomem *p) { unsigned int i; + pr_info(MODULE_NAME ": write test.\n"); mmiotrace_printk("Write test.\n"); + for (i = 0; i < 256; i++) iowrite8(i, p + i); + for (i = 1024; i < (5 * 1024); i += 2) - iowrite16(i * 12 + 7, p + i); + iowrite16(v16(i), p + i); + for (i = (5 * 1024); i < (16 * 1024); i += 4) - iowrite32(i * 212371 + 13, p + i); + iowrite32(v32(i), p + i); } static void do_read_test(void __iomem *p) { unsigned int i; + unsigned errs[3] = { 0 }; + pr_info(MODULE_NAME ": read test.\n"); mmiotrace_printk("Read test.\n"); + for (i = 0; i < 256; i++) - ioread8(p + i); + if (ioread8(p + i) != i) + ++errs[0]; + for (i = 1024; i < (5 * 1024); i += 2) - ioread16(p + i); + if (ioread16(p + i) != v16(i)) + ++errs[1]; + for (i = (5 * 1024); i < (16 * 1024); i += 4) - ioread32(p + i); + if (ioread32(p + i) != v32(i)) + ++errs[2]; + + mmiotrace_printk("Read errors: 8-bit %d, 16-bit %d, 32-bit %d.\n", + errs[0], errs[1], errs[2]); } -static void do_test(void) +static void do_read_far_test(void __iomem *p) { - void __iomem *p = ioremap_nocache(mmio_address, 0x4000); + pr_info(MODULE_NAME ": read far test.\n"); + mmiotrace_printk("Read far test.\n"); + + ioread32(p + read_far); +} + +static void do_test(unsigned long size) +{ + void __iomem *p = ioremap_nocache(mmio_address, size); if (!p) { pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); return; @@ -45,11 +84,15 @@ static void do_test(void) mmiotrace_printk("ioremap returned %p.\n", p); do_write_test(p); do_read_test(p); + if (read_far && read_far < size - 4) + do_read_far_test(p); iounmap(p); } static int __init init(void) { + unsigned long size = (read_far) ? (8 << 20) : (16 << 10); + if (mmio_address == 0) { pr_err(MODULE_NAME ": you have to use the module argument " "mmio_address.\n"); @@ -58,10 +101,11 @@ static int __init init(void) return -ENXIO; } - pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " - "in PCI address space, and writing " - "rubbish in there.\n", mmio_address); - do_test(); + pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI " + "address space, and writing 16 kB of rubbish in there.\n", + size >> 10, mmio_address); + do_test(size); + pr_info(MODULE_NAME ": All done.\n"); return 0; } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c52f4034c7f..82cd39a6cbd 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -103,7 +103,7 @@ static void xen_vcpu_setup(int cpu) vcpup = &per_cpu(xen_vcpu_info, cpu); - info.mfn = virt_to_mfn(vcpup); + info.mfn = arbitrary_virt_to_mfn(vcpup); info.offset = offset_in_page(vcpup); printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", @@ -301,8 +301,10 @@ static void xen_load_gdt(const struct desc_ptr *dtr) frames = mcs.args; for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { - frames[f] = virt_to_mfn(va); + frames[f] = arbitrary_virt_to_mfn((void *)va); + make_lowmem_page_readonly((void *)va); + make_lowmem_page_readonly(mfn_to_virt(frames[f])); } MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); @@ -314,7 +316,7 @@ static void load_TLS_descriptor(struct thread_struct *t, unsigned int cpu, unsigned int i) { struct desc_struct *gdt = get_cpu_gdt_table(cpu); - xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); + xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); struct multicall_space mc = __xen_mc_entry(0); MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); @@ -488,7 +490,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, break; default: { - xmaddr_t maddr = virt_to_machine(&dt[entry]); + xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]); xen_mc_flush(); if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 319bd40a57c..cb6afa4ec95 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -276,6 +276,13 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn) p2m_top[topidx][idx] = mfn; } +unsigned long arbitrary_virt_to_mfn(void *vaddr) +{ + xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); + + return PFN_DOWN(maddr.maddr); +} + xmaddr_t arbitrary_virt_to_machine(void *vaddr) { unsigned long address = (unsigned long)vaddr; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 035582ae815..8d470562ffc 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -219,6 +219,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) { struct vcpu_guest_context *ctxt; struct desc_struct *gdt; + unsigned long gdt_mfn; if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) return 0; @@ -248,9 +249,12 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->ldt_ents = 0; BUG_ON((unsigned long)gdt & ~PAGE_MASK); + + gdt_mfn = arbitrary_virt_to_mfn(gdt); make_lowmem_page_readonly(gdt); + make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); - ctxt->gdt_frames[0] = virt_to_mfn(gdt); + ctxt->gdt_frames[0] = gdt_mfn; ctxt->gdt_ents = GDT_ENTRIES; ctxt->user_regs.cs = __KERNEL_CS; |