From 7c9f8861e6c9c839f913e49b98c3854daca18f27 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 22 Apr 2008 16:38:23 -0500 Subject: stackprotector: use canary at end of stack to indicate overruns at oops time (Updated with a common max-stack-used checker that knows about the canary, as suggested by Joe Perches) Use a canary at the end of the stack to clearly indicate at oops time whether the stack has ever overflowed. This is a very simple implementation with a couple of drawbacks: 1) a thread may legitimately use exactly up to the last word on the stack -- but the chances of doing this and then oopsing later seem slim 2) it's possible that the stack usage isn't dense enough that the canary location could get skipped over -- but the worst that happens is that we don't flag the overrun -- though this happens fairly often in my testing :( With the code in place, an intentionally-bloated stack oops might do: BUG: unable to handle kernel paging request at ffff8103f84cc680 IP: [] update_curr+0x9a/0xa8 PGD 8063 PUD 0 Thread overran stack or stack corrupted Oops: 0000 [1] SMP CPU 0 ... ... unless the stack overrun is so bad that it corrupts some other thread. Signed-off-by: Eric Sandeen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/fault.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fd7e1798c75..1f524df68b9 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -581,6 +582,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) unsigned long address; int write, si_code; int fault; + unsigned long *stackend; + #ifdef CONFIG_X86_64 unsigned long flags; #endif @@ -850,6 +853,10 @@ no_context: show_fault_oops(regs, error_code, address); + stackend = end_of_stack(tsk); + if (*stackend != STACK_END_MAGIC) + printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); + tsk->thread.cr2 = address; tsk->thread.trap_no = 14; tsk->thread.error_code = error_code; -- cgit v1.2.3-70-g09d2 From dacf7333571d770366bff74d10b56aa545434605 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 7 Jan 2009 17:26:35 +0530 Subject: x86: smp.h move zap_low_mappings declartion to tlbflush.h Impact: cleanup, moving NON-SMP stuff from smp.h Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/include/asm/smp.h | 2 -- arch/x86/include/asm/tlbflush.h | 2 ++ arch/x86/mm/init_32.c | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 83a4cc07431..64c9e848f13 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -22,8 +22,6 @@ extern cpumask_t cpu_callout_map; extern cpumask_t cpu_initialized; extern cpumask_t cpu_callin_map; -extern void zap_low_mappings(void); - extern int __cpuinit get_local_pda(int cpu); extern int smp_num_siblings; diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 0e7bbb54911..aed0b700b83 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -175,4 +175,6 @@ static inline void flush_tlb_kernel_range(unsigned long start, flush_tlb_all(); } +extern void zap_low_mappings(void); + #endif /* _ASM_X86_TLBFLUSH_H */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index f99a6c6c432..a9dd0b7ad61 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -49,7 +49,6 @@ #include #include #include -#include unsigned int __VMALLOC_RESERVE = 128 << 20; -- cgit v1.2.3-70-g09d2 From 92181f190b649f7ef2b79cbf5c00f26ccc66da2a Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 20 Jan 2009 04:24:26 +0100 Subject: x86: optimise x86's do_page_fault (C entry point for the page fault path) Impact: cleanup, restructure code to improve assembly gcc isn't _all_ that smart about spilling registers to stack or reusing stack slots, even with branch annotations. do_page_fault contained a lot of functionality, so split unlikely paths into their own functions, and mark them as noinline just to be sure. I consider this actually to be somewhat of a cleanup too: the main function now contains about half the number of lines so the normal path is easier to read, while the error cases are also nicely split away. Also, ensure the order of arguments to functions is always the same: regs, addr, error_code. This can reduce code size a tiny bit, and just looks neater too. And add a couple of branch annotations. Before: do_page_fault: subq $360, %rsp #, After: do_page_fault: subq $56, %rsp #, bloat-o-meter: add/remove: 8/0 grow/shrink: 0/1 up/down: 2222/-1680 (542) function old new delta __bad_area_nosemaphore - 506 +506 no_context - 474 +474 vmalloc_fault - 424 +424 spurious_fault - 358 +358 mm_fault_error - 272 +272 bad_area_access_error - 89 +89 bad_area - 89 +89 bad_area_nosemaphore - 10 +10 do_page_fault 2464 784 -1680 Yes, the total size increases by 542 bytes, due to the extra function calls. But these will very rarely be called (except for vmalloc_fault) in a normal workload. Importantly, do_page_fault is less than 1/3rd it's original size, and touches far less stack. Existing gotos and branch hints did move a lot of the infrequently used text out of the fastpath, but that's even further improved after this patch. Signed-off-by: Nick Piggin Acked-by: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 438 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 256 insertions(+), 182 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 90dfae511a4..033292dc9e2 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -91,8 +91,8 @@ static inline int notify_page_fault(struct pt_regs *regs) * * Opcode checker based on code by Richard Brunner */ -static int is_prefetch(struct pt_regs *regs, unsigned long addr, - unsigned long error_code) +static int is_prefetch(struct pt_regs *regs, unsigned long error_code, + unsigned long addr) { unsigned char *instr; int scan_more = 1; @@ -409,15 +409,15 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, } #ifdef CONFIG_X86_64 -static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, - unsigned long error_code) +static noinline void pgtable_bad(struct pt_regs *regs, + unsigned long error_code, unsigned long address) { unsigned long flags = oops_begin(); int sig = SIGKILL; - struct task_struct *tsk; + struct task_struct *tsk = current; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", - current->comm, address); + tsk->comm, address); dump_pagetable(address); tsk = current; tsk->thread.cr2 = address; @@ -429,6 +429,190 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, } #endif +static noinline void no_context(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + struct task_struct *tsk = current; +#ifdef CONFIG_X86_64 + unsigned long flags; + int sig; +#endif + + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs)) + return; + + /* + * X86_32 + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. + * + * X86_64 + * Hall of shame of CPU/BIOS bugs. + */ + if (is_prefetch(regs, error_code, address)) + return; + + if (is_errata93(regs, address)) + return; + + /* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ +#ifdef CONFIG_X86_32 + bust_spinlocks(1); +#else + flags = oops_begin(); +#endif + + show_fault_oops(regs, error_code, address); + + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + +#ifdef CONFIG_X86_32 + die("Oops", regs, error_code); + bust_spinlocks(0); + do_exit(SIGKILL); +#else + sig = SIGKILL; + if (__die("Oops", regs, error_code)) + sig = 0; + /* Executive summary in case the body of the oops scrolled away */ + printk(KERN_EMERG "CR2: %016lx\n", address); + oops_end(flags, regs, sig); +#endif +} + +static void __bad_area_nosemaphore(struct pt_regs *regs, + unsigned long error_code, unsigned long address, + int si_code) +{ + struct task_struct *tsk = current; + + /* User mode accesses just cause a SIGSEGV */ + if (error_code & PF_USER) { + /* + * It's possible to have interrupts off here. + */ + local_irq_enable(); + + /* + * Valid to do another page fault here because this one came + * from user space. + */ + if (is_prefetch(regs, error_code, address)) + return; + + if (is_errata100(regs, address)) + return; + + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + printk( + "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), address, + (void *) regs->ip, (void *) regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } + + tsk->thread.cr2 = address; + /* Kernel addresses are always protection faults */ + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGSEGV, si_code, address, tsk); + return; + } + + if (is_f00f_bug(regs, address)) + return; + + no_context(regs, error_code, address); +} + +static noinline void bad_area_nosemaphore(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); +} + +static void __bad_area(struct pt_regs *regs, + unsigned long error_code, unsigned long address, + int si_code) +{ + struct mm_struct *mm = current->mm; + + /* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ + up_read(&mm->mmap_sem); + + __bad_area_nosemaphore(regs, error_code, address, si_code); +} + +static noinline void bad_area(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + __bad_area(regs, error_code, address, SEGV_MAPERR); +} + +static noinline void bad_area_access_error(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + __bad_area(regs, error_code, address, SEGV_ACCERR); +} + +/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ +static void out_of_memory(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + /* + * We ran out of memory, call the OOM killer, and return the userspace + * (which will retry the fault, or kill us if we got oom-killed). + */ + up_read(¤t->mm->mmap_sem); + pagefault_out_of_memory(); +} + +static void do_sigbus(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + struct task_struct *tsk = current; + struct mm_struct *mm = tsk->mm; + + up_read(&mm->mmap_sem); + + /* Kernel mode? Handle exceptions or die */ + if (!(error_code & PF_USER)) + no_context(regs, error_code, address); +#ifdef CONFIG_X86_32 + /* User space => ok to do another page fault */ + if (is_prefetch(regs, error_code, address)) + return; +#endif + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); +} + +static noinline void mm_fault_error(struct pt_regs *regs, + unsigned long error_code, unsigned long address, unsigned int fault) +{ + if (fault & VM_FAULT_OOM) + out_of_memory(regs, error_code, address); + else if (fault & VM_FAULT_SIGBUS) + do_sigbus(regs, error_code, address); + else + BUG(); +} + static int spurious_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & PF_WRITE) && !pte_write(*pte)) @@ -448,8 +632,8 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. */ -static int spurious_fault(unsigned long address, - unsigned long error_code) +static noinline int spurious_fault(unsigned long error_code, + unsigned long address) { pgd_t *pgd; pud_t *pud; @@ -494,7 +678,7 @@ static int spurious_fault(unsigned long address, * * This assumes no large pages in there. */ -static int vmalloc_fault(unsigned long address) +static noinline int vmalloc_fault(unsigned long address) { #ifdef CONFIG_X86_32 unsigned long pgd_paddr; @@ -573,6 +757,25 @@ static int vmalloc_fault(unsigned long address) int show_unhandled_signals = 1; +static inline int access_error(unsigned long error_code, int write, + struct vm_area_struct *vma) +{ + if (write) { + /* write, present and write, not present */ + if (unlikely(!(vma->vm_flags & VM_WRITE))) + return 1; + } else if (unlikely(error_code & PF_PROT)) { + /* read, present */ + return 1; + } else { + /* read, not present */ + if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + return 1; + } + + return 0; +} + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -583,16 +786,12 @@ asmlinkage #endif void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) { + unsigned long address; struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct *vma; - unsigned long address; - int write, si_code; + int write; int fault; -#ifdef CONFIG_X86_64 - unsigned long flags; - int sig; -#endif tsk = current; mm = tsk->mm; @@ -601,9 +800,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) /* get the address */ address = read_cr2(); - si_code = SEGV_MAPERR; - - if (notify_page_fault(regs)) + if (unlikely(notify_page_fault(regs))) return; if (unlikely(kmmio_fault(regs, address))) return; @@ -631,17 +828,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) return; /* Can handle a stale RO->RW TLB */ - if (spurious_fault(address, error_code)) + if (spurious_fault(error_code, address)) return; /* * Don't take the mm semaphore here. If we fixup a prefetch * fault we could otherwise deadlock. */ - goto bad_area_nosemaphore; + bad_area_nosemaphore(regs, error_code, address); + return; } - /* * It's safe to allow irq's after cr2 has been saved and the * vmalloc fault has been handled. @@ -657,15 +854,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) #ifdef CONFIG_X86_64 if (unlikely(error_code & PF_RSVD)) - pgtable_bad(address, regs, error_code); + pgtable_bad(regs, error_code, address); #endif /* * If we're in an interrupt, have no user context or are running in an * atomic region then we must not take the fault. */ - if (unlikely(in_atomic() || !mm)) - goto bad_area_nosemaphore; + if (unlikely(in_atomic() || !mm)) { + bad_area_nosemaphore(regs, error_code, address); + return; + } /* * When running in the kernel we expect faults to occur only to @@ -683,20 +882,26 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) * source. If this is invalid we can skip the address space check, * thus avoiding the deadlock. */ - if (!down_read_trylock(&mm->mmap_sem)) { + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { if ((error_code & PF_USER) == 0 && - !search_exception_tables(regs->ip)) - goto bad_area_nosemaphore; + !search_exception_tables(regs->ip)) { + bad_area_nosemaphore(regs, error_code, address); + return; + } down_read(&mm->mmap_sem); } vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (vma->vm_start <= address) + if (unlikely(!vma)) { + bad_area(regs, error_code, address); + return; + } + if (likely(vma->vm_start <= address)) goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; + if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { + bad_area(regs, error_code, address); + return; + } if (error_code & PF_USER) { /* * Accessing the stack below %sp is always a bug. @@ -704,31 +909,25 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) * and pusha to work. ("enter $65535,$31" pushes * 32 pointers and then decrements %sp by 65535.) */ - if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) - goto bad_area; + if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { + bad_area(regs, error_code, address); + return; + } } - if (expand_stack(vma, address)) - goto bad_area; -/* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ + if (unlikely(expand_stack(vma, address))) { + bad_area(regs, error_code, address); + return; + } + + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ good_area: - si_code = SEGV_ACCERR; - write = 0; - switch (error_code & (PF_PROT|PF_WRITE)) { - default: /* 3: write, present */ - /* fall through */ - case PF_WRITE: /* write, not present */ - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - write++; - break; - case PF_PROT: /* read, present */ - goto bad_area; - case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto bad_area; + write = error_code & PF_WRITE; + if (unlikely(access_error(error_code, write, vma))) { + bad_area_access_error(regs, error_code, address); + return; } /* @@ -738,11 +937,8 @@ good_area: */ fault = handle_mm_fault(mm, vma, address, write); if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) - goto out_of_memory; - else if (fault & VM_FAULT_SIGBUS) - goto do_sigbus; - BUG(); + mm_fault_error(regs, error_code, address, fault); + return; } if (fault & VM_FAULT_MAJOR) tsk->maj_flt++; @@ -760,128 +956,6 @@ good_area: } #endif up_read(&mm->mmap_sem); - return; - -/* - * Something tried to access memory that isn't in our memory map.. - * Fix it, but check if it's kernel or user first.. - */ -bad_area: - up_read(&mm->mmap_sem); - -bad_area_nosemaphore: - /* User mode accesses just cause a SIGSEGV */ - if (error_code & PF_USER) { - /* - * It's possible to have interrupts off here. - */ - local_irq_enable(); - - /* - * Valid to do another page fault here because this one came - * from user space. - */ - if (is_prefetch(regs, address, error_code)) - return; - - if (is_errata100(regs, address)) - return; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk( - "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", - task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, task_pid_nr(tsk), address, - (void *) regs->ip, (void *) regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } - - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; - force_sig_info_fault(SIGSEGV, si_code, address, tsk); - return; - } - - if (is_f00f_bug(regs, address)) - return; - -no_context: - /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs)) - return; - - /* - * X86_32 - * Valid to do another page fault here, because if this fault - * had been triggered by is_prefetch fixup_exception would have - * handled it. - * - * X86_64 - * Hall of shame of CPU/BIOS bugs. - */ - if (is_prefetch(regs, address, error_code)) - return; - - if (is_errata93(regs, address)) - return; - -/* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ -#ifdef CONFIG_X86_32 - bust_spinlocks(1); -#else - flags = oops_begin(); -#endif - - show_fault_oops(regs, error_code, address); - - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - -#ifdef CONFIG_X86_32 - die("Oops", regs, error_code); - bust_spinlocks(0); - do_exit(SIGKILL); -#else - sig = SIGKILL; - if (__die("Oops", regs, error_code)) - sig = 0; - /* Executive summary in case the body of the oops scrolled away */ - printk(KERN_EMERG "CR2: %016lx\n", address); - oops_end(flags, regs, sig); -#endif - -out_of_memory: - /* - * We ran out of memory, call the OOM killer, and return the userspace - * (which will retry the fault, or kill us if we got oom-killed). - */ - up_read(&mm->mmap_sem); - pagefault_out_of_memory(); - return; - -do_sigbus: - up_read(&mm->mmap_sem); - - /* Kernel mode? Handle exceptions or die */ - if (!(error_code & PF_USER)) - goto no_context; -#ifdef CONFIG_X86_32 - /* User space => ok to do another page fault */ - if (is_prefetch(regs, address, error_code)) - return; -#endif - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; - force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); } DEFINE_SPINLOCK(pgd_lock); -- cgit v1.2.3-70-g09d2 From 55f4949f5765e7a29863b6d17a774601810732f5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 21 Jan 2009 10:08:53 +0100 Subject: x86, mm: move tlb.c to arch/x86/mm/ Impact: cleanup Now that it's unified, move the (SMP) TLB flushing code from arch/x86/kernel/ to arch/x86/mm/, where it belongs logically. Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/tlb.c | 296 ----------------------------------------------- arch/x86/mm/Makefile | 2 + arch/x86/mm/tlb.c | 296 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 299 insertions(+), 297 deletions(-) delete mode 100644 arch/x86/kernel/tlb.c create mode 100644 arch/x86/mm/tlb.c (limited to 'arch/x86/mm') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0626a88fbb4..0b3272f58bd 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -58,7 +58,7 @@ obj-$(CONFIG_PCI) += early-quirks.o apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_X86_SMP) += smp.o -obj-$(CONFIG_X86_SMP) += smpboot.o tsc_sync.o ipi.o tlb.o +obj-$(CONFIG_X86_SMP) += smpboot.o tsc_sync.o ipi.o obj-$(CONFIG_X86_32_SMP) += smpcommon.o obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o diff --git a/arch/x86/kernel/tlb.c b/arch/x86/kernel/tlb.c deleted file mode 100644 index b3ca1b94065..00000000000 --- a/arch/x86/kernel/tlb.c +++ /dev/null @@ -1,296 +0,0 @@ -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) - = { &init_mm, 0, }; - -#include -/* - * Smarter SMP flushing macros. - * c/o Linus Torvalds. - * - * These mean you can really definitely utterly forget about - * writing to user space from interrupts. (Its not allowed anyway). - * - * Optimizations Manfred Spraul - * - * More scalable flush, from Andi Kleen - * - * To avoid global state use 8 different call vectors. - * Each CPU uses a specific vector to trigger flushes on other - * CPUs. Depending on the received vector the target CPUs look into - * the right per cpu variable for the flush data. - * - * With more than 8 CPUs they are hashed to the 8 available - * vectors. The limited global vector space forces us to this right now. - * In future when interrupts are split into per CPU domains this could be - * fixed, at the cost of triggering multiple IPIs in some cases. - */ - -union smp_flush_state { - struct { - struct mm_struct *flush_mm; - unsigned long flush_va; - spinlock_t tlbstate_lock; - DECLARE_BITMAP(flush_cpumask, NR_CPUS); - }; - char pad[SMP_CACHE_BYTES]; -} ____cacheline_aligned; - -/* State is put into the per CPU data section, but padded - to a full cache line because other CPUs can access it and we don't - want false sharing in the per cpu data segment. */ -static DEFINE_PER_CPU(union smp_flush_state, flush_state); - -/* - * We cannot call mmdrop() because we are in interrupt context, - * instead update mm->cpu_vm_mask. - */ -void leave_mm(int cpu) -{ - if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) - BUG(); - cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask); - load_cr3(swapper_pg_dir); -} -EXPORT_SYMBOL_GPL(leave_mm); - -/* - * - * The flush IPI assumes that a thread switch happens in this order: - * [cpu0: the cpu that switches] - * 1) switch_mm() either 1a) or 1b) - * 1a) thread switch to a different mm - * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); - * Stop ipi delivery for the old mm. This is not synchronized with - * the other cpus, but smp_invalidate_interrupt ignore flush ipis - * for the wrong mm, and in the worst case we perform a superfluous - * tlb flush. - * 1a2) set cpu mmu_state to TLBSTATE_OK - * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 - * was in lazy tlb mode. - * 1a3) update cpu active_mm - * Now cpu0 accepts tlb flushes for the new mm. - * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); - * Now the other cpus will send tlb flush ipis. - * 1a4) change cr3. - * 1b) thread switch without mm change - * cpu active_mm is correct, cpu0 already handles - * flush ipis. - * 1b1) set cpu mmu_state to TLBSTATE_OK - * 1b2) test_and_set the cpu bit in cpu_vm_mask. - * Atomically set the bit [other cpus will start sending flush ipis], - * and test the bit. - * 1b3) if the bit was 0: leave_mm was called, flush the tlb. - * 2) switch %%esp, ie current - * - * The interrupt must handle 2 special cases: - * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. - * - the cpu performs speculative tlb reads, i.e. even if the cpu only - * runs in kernel space, the cpu could load tlb entries for user space - * pages. - * - * The good news is that cpu mmu_state is local to each cpu, no - * write/read ordering problems. - */ - -/* - * TLB flush IPI: - * - * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. - * 2) Leave the mm if we are in the lazy tlb mode. - * - * Interrupts are disabled. - */ - -/* - * FIXME: use of asmlinkage is not consistent. On x86_64 it's noop - * but still used for documentation purpose but the usage is slightly - * inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt - * entry calls in with the first parameter in %eax. Maybe define - * intrlinkage? - */ -#ifdef CONFIG_X86_64 -asmlinkage -#endif -void smp_invalidate_interrupt(struct pt_regs *regs) -{ - unsigned int cpu; - unsigned int sender; - union smp_flush_state *f; - - cpu = smp_processor_id(); - /* - * orig_rax contains the negated interrupt vector. - * Use that to determine where the sender put the data. - */ - sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; - f = &per_cpu(flush_state, sender); - - if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask))) - goto out; - /* - * This was a BUG() but until someone can quote me the - * line from the intel manual that guarantees an IPI to - * multiple CPUs is retried _only_ on the erroring CPUs - * its staying as a return - * - * BUG(); - */ - - if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) { - if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { - if (f->flush_va == TLB_FLUSH_ALL) - local_flush_tlb(); - else - __flush_tlb_one(f->flush_va); - } else - leave_mm(cpu); - } -out: - ack_APIC_irq(); - smp_mb__before_clear_bit(); - cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask)); - smp_mb__after_clear_bit(); - inc_irq_stat(irq_tlb_count); -} - -static void flush_tlb_others_ipi(const struct cpumask *cpumask, - struct mm_struct *mm, unsigned long va) -{ - unsigned int sender; - union smp_flush_state *f; - - /* Caller has disabled preemption */ - sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; - f = &per_cpu(flush_state, sender); - - /* - * Could avoid this lock when - * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is - * probably not worth checking this for a cache-hot lock. - */ - spin_lock(&f->tlbstate_lock); - - f->flush_mm = mm; - f->flush_va = va; - cpumask_andnot(to_cpumask(f->flush_cpumask), - cpumask, cpumask_of(smp_processor_id())); - - /* - * Make the above memory operations globally visible before - * sending the IPI. - */ - smp_mb(); - /* - * We have to send the IPI only to - * CPUs affected. - */ - send_IPI_mask(to_cpumask(f->flush_cpumask), - INVALIDATE_TLB_VECTOR_START + sender); - - while (!cpumask_empty(to_cpumask(f->flush_cpumask))) - cpu_relax(); - - f->flush_mm = NULL; - f->flush_va = 0; - spin_unlock(&f->tlbstate_lock); -} - -void native_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, unsigned long va) -{ - if (is_uv_system()) { - unsigned int cpu; - - cpu = get_cpu(); - cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); - if (cpumask) - flush_tlb_others_ipi(cpumask, mm, va); - put_cpu(); - return; - } - flush_tlb_others_ipi(cpumask, mm, va); -} - -static int __cpuinit init_smp_flush(void) -{ - int i; - - for_each_possible_cpu(i) - spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); - - return 0; -} -core_initcall(init_smp_flush); - -void flush_tlb_current_task(void) -{ - struct mm_struct *mm = current->mm; - - preempt_disable(); - - local_flush_tlb(); - if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); - preempt_enable(); -} - -void flush_tlb_mm(struct mm_struct *mm) -{ - preempt_disable(); - - if (current->active_mm == mm) { - if (current->mm) - local_flush_tlb(); - else - leave_mm(smp_processor_id()); - } - if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); - - preempt_enable(); -} - -void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) -{ - struct mm_struct *mm = vma->vm_mm; - - preempt_disable(); - - if (current->active_mm == mm) { - if (current->mm) - __flush_tlb_one(va); - else - leave_mm(smp_processor_id()); - } - - if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(&mm->cpu_vm_mask, mm, va); - - preempt_enable(); -} - -static void do_flush_tlb_all(void *info) -{ - unsigned long cpu = smp_processor_id(); - - __flush_tlb_all(); - if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) - leave_mm(cpu); -} - -void flush_tlb_all(void) -{ - on_each_cpu(do_flush_tlb_all, NULL, 1); -} diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index d8cc96a2738..9f05157220f 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,6 +1,8 @@ obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ pat.o pgtable.o gup.o +obj-$(CONFIG_X86_SMP) += tlb.o + obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c new file mode 100644 index 00000000000..b3ca1b94065 --- /dev/null +++ b/arch/x86/mm/tlb.c @@ -0,0 +1,296 @@ +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) + = { &init_mm, 0, }; + +#include +/* + * Smarter SMP flushing macros. + * c/o Linus Torvalds. + * + * These mean you can really definitely utterly forget about + * writing to user space from interrupts. (Its not allowed anyway). + * + * Optimizations Manfred Spraul + * + * More scalable flush, from Andi Kleen + * + * To avoid global state use 8 different call vectors. + * Each CPU uses a specific vector to trigger flushes on other + * CPUs. Depending on the received vector the target CPUs look into + * the right per cpu variable for the flush data. + * + * With more than 8 CPUs they are hashed to the 8 available + * vectors. The limited global vector space forces us to this right now. + * In future when interrupts are split into per CPU domains this could be + * fixed, at the cost of triggering multiple IPIs in some cases. + */ + +union smp_flush_state { + struct { + struct mm_struct *flush_mm; + unsigned long flush_va; + spinlock_t tlbstate_lock; + DECLARE_BITMAP(flush_cpumask, NR_CPUS); + }; + char pad[SMP_CACHE_BYTES]; +} ____cacheline_aligned; + +/* State is put into the per CPU data section, but padded + to a full cache line because other CPUs can access it and we don't + want false sharing in the per cpu data segment. */ +static DEFINE_PER_CPU(union smp_flush_state, flush_state); + +/* + * We cannot call mmdrop() because we are in interrupt context, + * instead update mm->cpu_vm_mask. + */ +void leave_mm(int cpu) +{ + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) + BUG(); + cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask); + load_cr3(swapper_pg_dir); +} +EXPORT_SYMBOL_GPL(leave_mm); + +/* + * + * The flush IPI assumes that a thread switch happens in this order: + * [cpu0: the cpu that switches] + * 1) switch_mm() either 1a) or 1b) + * 1a) thread switch to a different mm + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); + * Stop ipi delivery for the old mm. This is not synchronized with + * the other cpus, but smp_invalidate_interrupt ignore flush ipis + * for the wrong mm, and in the worst case we perform a superfluous + * tlb flush. + * 1a2) set cpu mmu_state to TLBSTATE_OK + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 + * was in lazy tlb mode. + * 1a3) update cpu active_mm + * Now cpu0 accepts tlb flushes for the new mm. + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); + * Now the other cpus will send tlb flush ipis. + * 1a4) change cr3. + * 1b) thread switch without mm change + * cpu active_mm is correct, cpu0 already handles + * flush ipis. + * 1b1) set cpu mmu_state to TLBSTATE_OK + * 1b2) test_and_set the cpu bit in cpu_vm_mask. + * Atomically set the bit [other cpus will start sending flush ipis], + * and test the bit. + * 1b3) if the bit was 0: leave_mm was called, flush the tlb. + * 2) switch %%esp, ie current + * + * The interrupt must handle 2 special cases: + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. + * - the cpu performs speculative tlb reads, i.e. even if the cpu only + * runs in kernel space, the cpu could load tlb entries for user space + * pages. + * + * The good news is that cpu mmu_state is local to each cpu, no + * write/read ordering problems. + */ + +/* + * TLB flush IPI: + * + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. + * 2) Leave the mm if we are in the lazy tlb mode. + * + * Interrupts are disabled. + */ + +/* + * FIXME: use of asmlinkage is not consistent. On x86_64 it's noop + * but still used for documentation purpose but the usage is slightly + * inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt + * entry calls in with the first parameter in %eax. Maybe define + * intrlinkage? + */ +#ifdef CONFIG_X86_64 +asmlinkage +#endif +void smp_invalidate_interrupt(struct pt_regs *regs) +{ + unsigned int cpu; + unsigned int sender; + union smp_flush_state *f; + + cpu = smp_processor_id(); + /* + * orig_rax contains the negated interrupt vector. + * Use that to determine where the sender put the data. + */ + sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; + f = &per_cpu(flush_state, sender); + + if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask))) + goto out; + /* + * This was a BUG() but until someone can quote me the + * line from the intel manual that guarantees an IPI to + * multiple CPUs is retried _only_ on the erroring CPUs + * its staying as a return + * + * BUG(); + */ + + if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) { + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { + if (f->flush_va == TLB_FLUSH_ALL) + local_flush_tlb(); + else + __flush_tlb_one(f->flush_va); + } else + leave_mm(cpu); + } +out: + ack_APIC_irq(); + smp_mb__before_clear_bit(); + cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask)); + smp_mb__after_clear_bit(); + inc_irq_stat(irq_tlb_count); +} + +static void flush_tlb_others_ipi(const struct cpumask *cpumask, + struct mm_struct *mm, unsigned long va) +{ + unsigned int sender; + union smp_flush_state *f; + + /* Caller has disabled preemption */ + sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; + f = &per_cpu(flush_state, sender); + + /* + * Could avoid this lock when + * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is + * probably not worth checking this for a cache-hot lock. + */ + spin_lock(&f->tlbstate_lock); + + f->flush_mm = mm; + f->flush_va = va; + cpumask_andnot(to_cpumask(f->flush_cpumask), + cpumask, cpumask_of(smp_processor_id())); + + /* + * Make the above memory operations globally visible before + * sending the IPI. + */ + smp_mb(); + /* + * We have to send the IPI only to + * CPUs affected. + */ + send_IPI_mask(to_cpumask(f->flush_cpumask), + INVALIDATE_TLB_VECTOR_START + sender); + + while (!cpumask_empty(to_cpumask(f->flush_cpumask))) + cpu_relax(); + + f->flush_mm = NULL; + f->flush_va = 0; + spin_unlock(&f->tlbstate_lock); +} + +void native_flush_tlb_others(const struct cpumask *cpumask, + struct mm_struct *mm, unsigned long va) +{ + if (is_uv_system()) { + unsigned int cpu; + + cpu = get_cpu(); + cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); + if (cpumask) + flush_tlb_others_ipi(cpumask, mm, va); + put_cpu(); + return; + } + flush_tlb_others_ipi(cpumask, mm, va); +} + +static int __cpuinit init_smp_flush(void) +{ + int i; + + for_each_possible_cpu(i) + spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); + + return 0; +} +core_initcall(init_smp_flush); + +void flush_tlb_current_task(void) +{ + struct mm_struct *mm = current->mm; + + preempt_disable(); + + local_flush_tlb(); + if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); + preempt_enable(); +} + +void flush_tlb_mm(struct mm_struct *mm) +{ + preempt_disable(); + + if (current->active_mm == mm) { + if (current->mm) + local_flush_tlb(); + else + leave_mm(smp_processor_id()); + } + if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); + + preempt_enable(); +} + +void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) +{ + struct mm_struct *mm = vma->vm_mm; + + preempt_disable(); + + if (current->active_mm == mm) { + if (current->mm) + __flush_tlb_one(va); + else + leave_mm(smp_processor_id()); + } + + if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(&mm->cpu_vm_mask, mm, va); + + preempt_enable(); +} + +static void do_flush_tlb_all(void *info) +{ + unsigned long cpu = smp_processor_id(); + + __flush_tlb_all(); + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) + leave_mm(cpu); +} + +void flush_tlb_all(void) +{ + on_each_cpu(do_flush_tlb_all, NULL, 1); +} -- cgit v1.2.3-70-g09d2 From 4ec71fa2d2c3f1040348f2604f4b8ccc833d1c2e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 21 Jan 2009 10:24:27 +0100 Subject: x86: uv cleanup, build fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix: arch/x86/mm/srat_64.c: In function ‘acpi_numa_processor_affinity_init’: arch/x86/mm/srat_64.c:141: error: implicit declaration of function ‘get_uv_system_type’ arch/x86/mm/srat_64.c:141: error: ‘UV_X2APIC’ undeclared (first use in this function) arch/x86/mm/srat_64.c:141: error: (Each undeclared identifier is reported only once arch/x86/mm/srat_64.c:141: error: for each function it appears in.) A couple of UV definitions were moved to asm/uv/uv.h, but srat_64.c did not include that header. Add it. Signed-off-by: Ingo Molnar --- arch/x86/mm/srat_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 09737c8af07..15df1baee10 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -21,6 +21,7 @@ #include #include #include +#include int acpi_numa __initdata; -- cgit v1.2.3-70-g09d2 From fb746d0e1365b7472ccc4c3d5b0672b34a092d0b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 21 Jan 2009 20:45:41 +0100 Subject: x86: optimise page fault entry, cleanup tsk is already assigned to current, drop the redundant second assignment. Signed-off-by: Johannes Weiner Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 033292dc9e2..8f4b859a04b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -419,7 +419,6 @@ static noinline void pgtable_bad(struct pt_regs *regs, printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", tsk->comm, address); dump_pagetable(address); - tsk = current; tsk->thread.cr2 = address; tsk->thread.trap_no = 14; tsk->thread.error_code = error_code; -- cgit v1.2.3-70-g09d2 From d639bab8da86d330493487e8c0fea8ca31f53427 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 9 Jan 2009 16:13:13 -0800 Subject: x86 PAT: ioremap_wc should take resource_size_t parameter Impact: fix/extend ioremap_wc() beyond 4GB aperture on 32-bit ioremap_wc() was taking in unsigned long parameter, where as it should take 64-bit resource_size_t parameter like other ioremap variants. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 2 +- arch/x86/mm/ioremap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 05cfed4485f..bdbb4b96160 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -91,7 +91,7 @@ extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr); extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, unsigned long prot_val); -extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size); +extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); /* * early_ioremap() and early_iounmap() are for temporary early boot-time diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index bd85d42819e..2ddb1e79a19 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -367,7 +367,7 @@ EXPORT_SYMBOL(ioremap_nocache); * * Must be freed with iounmap. */ -void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) +void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) { if (pat_enabled) return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, -- cgit v1.2.3-70-g09d2 From fe40c0af3cff3ea461cf25bddb979abc7279d4df Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 23 Jan 2009 15:49:41 -0800 Subject: x86: uaccess: introduce try and catch framework Impact: introduce new uaccess exception handling framework Introduce {get|put}_user_try and {get|put}_user_catch as new uaccess exception handling framework. {get|put}_user_try begins exception block and {get|put}_user_catch(err) ends the block and gets err if an exception occured in {get|put}_user_ex() in the block. The exception is stored thread_info->uaccess_err. The example usage of this framework is below; int func() { int err = 0; get_user_try { get_user_ex(...); get_user_ex(...); : } get_user_catch(err); return err; } Note: get_user_ex() is not clear the value when an exception occurs, it's different from the behavior of __get_user(), but I think it doesn't matter. Signed-off-by: Hiroshi Shimamoto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/thread_info.h | 1 + arch/x86/include/asm/uaccess.h | 103 +++++++++++++++++++++++++++++++++++++ arch/x86/mm/extable.c | 6 +++ 3 files changed, 110 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 98789647baa..3f90aeb456b 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -40,6 +40,7 @@ struct thread_info { */ __u8 supervisor_stack[0]; #endif + int uaccess_err; }; #define INIT_THREAD_INFO(tsk) \ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 69d2757cca9..0ec6de4bcb0 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -199,12 +199,22 @@ extern int __get_user_bad(void); : "=r" (err) \ : "A" (x), "r" (addr), "i" (-EFAULT), "0" (err)) +#define __put_user_asm_ex_u64(x, addr) \ + asm volatile("1: movl %%eax,0(%1)\n" \ + "2: movl %%edx,4(%1)\n" \ + "3:\n" \ + _ASM_EXTABLE(1b, 2b - 1b) \ + _ASM_EXTABLE(2b, 3b - 2b) \ + : : "A" (x), "r" (addr)) + #define __put_user_x8(x, ptr, __ret_pu) \ asm volatile("call __put_user_8" : "=a" (__ret_pu) \ : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") #else #define __put_user_asm_u64(x, ptr, retval) \ __put_user_asm(x, ptr, retval, "q", "", "Zr", -EFAULT) +#define __put_user_asm_ex_u64(x, addr) \ + __put_user_asm_ex(x, addr, "q", "", "Zr") #define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu) #endif @@ -286,6 +296,27 @@ do { \ } \ } while (0) +#define __put_user_size_ex(x, ptr, size) \ +do { \ + __chk_user_ptr(ptr); \ + switch (size) { \ + case 1: \ + __put_user_asm_ex(x, ptr, "b", "b", "iq"); \ + break; \ + case 2: \ + __put_user_asm_ex(x, ptr, "w", "w", "ir"); \ + break; \ + case 4: \ + __put_user_asm_ex(x, ptr, "l", "k", "ir"); \ + break; \ + case 8: \ + __put_user_asm_ex_u64((__typeof__(*ptr))(x), ptr); \ + break; \ + default: \ + __put_user_bad(); \ + } \ +} while (0) + #else #define __put_user_size(x, ptr, size, retval, errret) \ @@ -311,9 +342,12 @@ do { \ #ifdef CONFIG_X86_32 #define __get_user_asm_u64(x, ptr, retval, errret) (x) = __get_user_bad() +#define __get_user_asm_ex_u64(x, ptr) (x) = __get_user_bad() #else #define __get_user_asm_u64(x, ptr, retval, errret) \ __get_user_asm(x, ptr, retval, "q", "", "=r", errret) +#define __get_user_asm_ex_u64(x, ptr) \ + __get_user_asm_ex(x, ptr, "q", "", "=r") #endif #define __get_user_size(x, ptr, size, retval, errret) \ @@ -350,6 +384,33 @@ do { \ : "=r" (err), ltype(x) \ : "m" (__m(addr)), "i" (errret), "0" (err)) +#define __get_user_size_ex(x, ptr, size) \ +do { \ + __chk_user_ptr(ptr); \ + switch (size) { \ + case 1: \ + __get_user_asm_ex(x, ptr, "b", "b", "=q"); \ + break; \ + case 2: \ + __get_user_asm_ex(x, ptr, "w", "w", "=r"); \ + break; \ + case 4: \ + __get_user_asm_ex(x, ptr, "l", "k", "=r"); \ + break; \ + case 8: \ + __get_user_asm_ex_u64(x, ptr); \ + break; \ + default: \ + (x) = __get_user_bad(); \ + } \ +} while (0) + +#define __get_user_asm_ex(x, addr, itype, rtype, ltype) \ + asm volatile("1: mov"itype" %1,%"rtype"0\n" \ + "2:\n" \ + _ASM_EXTABLE(1b, 2b - 1b) \ + : ltype(x) : "m" (__m(addr))) + #define __put_user_nocheck(x, ptr, size) \ ({ \ int __pu_err; \ @@ -385,6 +446,26 @@ struct __large_struct { unsigned long buf[100]; }; _ASM_EXTABLE(1b, 3b) \ : "=r"(err) \ : ltype(x), "m" (__m(addr)), "i" (errret), "0" (err)) + +#define __put_user_asm_ex(x, addr, itype, rtype, ltype) \ + asm volatile("1: mov"itype" %"rtype"0,%1\n" \ + "2:\n" \ + _ASM_EXTABLE(1b, 2b - 1b) \ + : : ltype(x), "m" (__m(addr))) + +/* + * uaccess_try and catch + */ +#define uaccess_try do { \ + int prev_err = current_thread_info()->uaccess_err; \ + current_thread_info()->uaccess_err = 0; \ + barrier(); + +#define uaccess_catch(err) \ + (err) |= current_thread_info()->uaccess_err; \ + current_thread_info()->uaccess_err = prev_err; \ +} while (0) + /** * __get_user: - Get a simple variable from user space, with less checking. * @x: Variable to store result. @@ -408,6 +489,7 @@ struct __large_struct { unsigned long buf[100]; }; #define __get_user(x, ptr) \ __get_user_nocheck((x), (ptr), sizeof(*(ptr))) + /** * __put_user: - Write a simple value into user space, with less checking. * @x: Value to copy to user space. @@ -434,6 +516,27 @@ struct __large_struct { unsigned long buf[100]; }; #define __get_user_unaligned __get_user #define __put_user_unaligned __put_user +/* + * {get|put}_user_try and catch + * + * get_user_try { + * get_user_ex(...); + * } get_user_catch(err) + */ +#define get_user_try uaccess_try +#define get_user_catch(err) uaccess_catch(err) +#define put_user_try uaccess_try +#define put_user_catch(err) uaccess_catch(err) + +#define get_user_ex(x, ptr) do { \ + unsigned long __gue_val; \ + __get_user_size_ex((__gue_val), (ptr), (sizeof(*(ptr)))); \ + (x) = (__force __typeof__(*(ptr)))__gue_val; \ +} while (0) + +#define put_user_ex(x, ptr) \ + __put_user_size_ex((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) + /* * movsl can be slow when source and dest are not both 8-byte aligned */ diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 7e8db53528a..61b41ca3b5a 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -23,6 +23,12 @@ int fixup_exception(struct pt_regs *regs) fixup = search_exception_tables(regs->ip); if (fixup) { + /* If fixup is less than 16, it means uaccess error */ + if (fixup->fixup < 16) { + current_thread_info()->uaccess_err = -EFAULT; + regs->ip += fixup->fixup; + return 1; + } regs->ip = fixup->fixup; return 1; } -- cgit v1.2.3-70-g09d2 From 75a048119e76540d73132cfc8e0fa0c0a8bb6c83 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 22 Jan 2009 16:17:05 -0800 Subject: x86: handle PAT more like other CPU features Impact: Cleanup When PAT was originally introduced, it was handled specially for a few reasons: - PAT bugs are hard to track down, so we wanted to maintain a whitelist of CPUs. - The i386 and x86-64 CPUID code was not yet unified. Both of these are now obsolete, so handle PAT like any other features, including ordinary feature blacklisting due to known bugs. Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pat.h | 4 ---- arch/x86/kernel/cpu/addon_cpuid_features.c | 34 ------------------------------ arch/x86/kernel/cpu/common.c | 2 -- arch/x86/kernel/cpu/intel.c | 12 +++++++++++ arch/x86/mm/pat.c | 31 +++++++++++++++++---------- 5 files changed, 32 insertions(+), 51 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index b8493b3b989..9709fdff661 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -5,10 +5,8 @@ #ifdef CONFIG_X86_PAT extern int pat_enabled; -extern void validate_pat_support(struct cpuinfo_x86 *c); #else static const int pat_enabled; -static inline void validate_pat_support(struct cpuinfo_x86 *c) { } #endif extern void pat_init(void); @@ -17,6 +15,4 @@ extern int reserve_memtype(u64 start, u64 end, unsigned long req_type, unsigned long *ret_type); extern int free_memtype(u64 start, u64 end); -extern void pat_disable(char *reason); - #endif /* _ASM_X86_PAT_H */ diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 2cf23634b6d..4e581fdc0a5 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -143,37 +143,3 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) return; #endif } - -#ifdef CONFIG_X86_PAT -void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) -{ - if (!cpu_has_pat) - pat_disable("PAT not supported by CPU."); - - switch (c->x86_vendor) { - case X86_VENDOR_INTEL: - /* - * There is a known erratum on Pentium III and Core Solo - * and Core Duo CPUs. - * " Page with PAT set to WC while associated MTRR is UC - * may consolidate to UC " - * Because of this erratum, it is better to stick with - * setting WC in MTRR rather than using PAT on these CPUs. - * - * Enable PAT WC only on P4, Core 2 or later CPUs. - */ - if (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 15)) - return; - - pat_disable("PAT WC disabled due to known CPU erratum."); - return; - - case X86_VENDOR_AMD: - case X86_VENDOR_CENTAUR: - case X86_VENDOR_TRANSMETA: - return; - } - - pat_disable("PAT disabled. Not yet verified on this CPU type."); -} -#endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 83492b1f93b..0f8656361e0 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -570,8 +570,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_early_init) this_cpu->c_early_init(c); - validate_pat_support(c); - #ifdef CONFIG_SMP c->cpu_index = boot_cpu_id; #endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 8ea6929e974..20ce03acf04 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -50,6 +50,18 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); } + /* + * There is a known erratum on Pentium III and Core Solo + * and Core Duo CPUs. + * " Page with PAT set to WC while associated MTRR is UC + * may consolidate to UC " + * Because of this erratum, it is better to stick with + * setting WC in MTRR rather than using PAT on these CPUs. + * + * Enable PAT WC only on P4, Core 2 or later CPUs. + */ + if (c->x86 == 6 && c->x86_model < 15) + clear_cpu_cap(c, X86_FEATURE_PAT); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 8b08fb95527..430cb44dd3f 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -30,7 +30,7 @@ #ifdef CONFIG_X86_PAT int __read_mostly pat_enabled = 1; -void __cpuinit pat_disable(char *reason) +void __cpuinit pat_disable(const char *reason) { pat_enabled = 0; printk(KERN_INFO "%s\n", reason); @@ -42,6 +42,11 @@ static int __init nopat(char *str) return 0; } early_param("nopat", nopat); +#else +static inline void pat_disable(const char *reason) +{ + (void)reason; +} #endif @@ -78,16 +83,20 @@ void pat_init(void) if (!pat_enabled) return; - /* Paranoia check. */ - if (!cpu_has_pat && boot_pat_state) { - /* - * If this happens we are on a secondary CPU, but - * switched to PAT on the boot CPU. We have no way to - * undo PAT. - */ - printk(KERN_ERR "PAT enabled, " - "but not supported by secondary CPU\n"); - BUG(); + if (!cpu_has_pat) { + if (!boot_pat_state) { + pat_disable("PAT not supported by CPU."); + return; + } else { + /* + * If this happens we are on a secondary CPU, but + * switched to PAT on the boot CPU. We have no way to + * undo PAT. + */ + printk(KERN_ERR "PAT enabled, " + "but not supported by secondary CPU\n"); + BUG(); + } } /* Set PWT to Write-Combining. All other bits stay the same */ -- cgit v1.2.3-70-g09d2 From 6470aff619fbb9dff8dfe8afa5033084cd55ca20 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Tue, 27 Jan 2009 12:56:47 +0900 Subject: x86: move 64-bit NUMA code Impact: Code movement, no functional change. Move the 64-bit NUMA code from setup_percpu.c to numa_64.c Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/include/asm/topology.h | 6 + arch/x86/kernel/setup_percpu.c | 237 +--------------------------------------- arch/x86/mm/numa_64.c | 217 ++++++++++++++++++++++++++++++++++++ 3 files changed, 228 insertions(+), 232 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 10022ed3a4b..77cfb2cfb38 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -74,6 +74,8 @@ static inline const struct cpumask *cpumask_of_node(int node) return &node_to_cpumask_map[node]; } +static inline void setup_node_to_cpumask_map(void) { } + #else /* CONFIG_X86_64 */ /* Mappings between node number and cpus on that node. */ @@ -120,6 +122,8 @@ static inline cpumask_t node_to_cpumask(int node) #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ +extern void setup_node_to_cpumask_map(void); + /* * Replace default node_to_cpumask_ptr with optimized version * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)" @@ -218,6 +222,8 @@ static inline int node_to_first_cpu(int node) return first_cpu(cpu_online_map); } +static inline void setup_node_to_cpumask_map(void) { } + /* * Replace default node_to_cpumask_ptr with optimized version * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)" diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index d0b1476490a..cb6d622520b 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -51,32 +51,6 @@ DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) -#define X86_64_NUMA 1 /* (used later) */ -DEFINE_PER_CPU(int, node_number) = 0; -EXPORT_PER_CPU_SYMBOL(node_number); - -/* - * Map cpu index to node index - */ -DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); - -/* - * Which logical CPUs are on which nodes - */ -cpumask_t *node_to_cpumask_map; -EXPORT_SYMBOL(node_to_cpumask_map); - -/* - * Setup node_to_cpumask_map - */ -static void __init setup_node_to_cpumask_map(void); - -#else -static inline void setup_node_to_cpumask_map(void) { } -#endif - #ifdef CONFIG_X86_64 /* correctly size the local cpu masks */ @@ -163,13 +137,13 @@ void __init setup_per_cpu_areas(void) early_per_cpu_map(x86_cpu_to_apicid, cpu); per_cpu(x86_bios_cpu_apicid, cpu) = early_per_cpu_map(x86_bios_cpu_apicid, cpu); -#ifdef X86_64_NUMA - per_cpu(x86_cpu_to_node_map, cpu) = - early_per_cpu_map(x86_cpu_to_node_map, cpu); -#endif #ifdef CONFIG_X86_64 per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack_union.irq_stack, cpu) + IRQ_STACK_SIZE - 64; +#ifdef CONFIG_NUMA + per_cpu(x86_cpu_to_node_map, cpu) = + early_per_cpu_map(x86_cpu_to_node_map, cpu); +#endif /* * Up to this point, CPU0 has been using .data.init * area. Reload %gs offset for CPU0. @@ -184,7 +158,7 @@ void __init setup_per_cpu_areas(void) /* indicate the early static arrays will soon be gone */ early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; -#ifdef X86_64_NUMA +#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; #endif @@ -197,204 +171,3 @@ void __init setup_per_cpu_areas(void) #endif -#ifdef X86_64_NUMA - -/* - * Allocate node_to_cpumask_map based on number of available nodes - * Requires node_possible_map to be valid. - * - * Note: node_to_cpumask() is not valid until after this is done. - * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) - */ -static void __init setup_node_to_cpumask_map(void) -{ - unsigned int node, num = 0; - cpumask_t *map; - - /* setup nr_node_ids if not done yet */ - if (nr_node_ids == MAX_NUMNODES) { - for_each_node_mask(node, node_possible_map) - num = node; - nr_node_ids = num + 1; - } - - /* allocate the map */ - map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); - DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids); - - pr_debug("Node to cpumask map at %p for %d nodes\n", - map, nr_node_ids); - - /* node_to_cpumask() will now work */ - node_to_cpumask_map = map; -} - -void __cpuinit numa_set_node(int cpu, int node) -{ - int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); - - /* early setting, no percpu area yet */ - if (cpu_to_node_map) { - cpu_to_node_map[cpu] = node; - return; - } - -#ifdef CONFIG_DEBUG_PER_CPU_MAPS - if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) { - printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); - dump_stack(); - return; - } -#endif - per_cpu(x86_cpu_to_node_map, cpu) = node; - - if (node != NUMA_NO_NODE) - per_cpu(node_number, cpu) = node; -} - -void __cpuinit numa_clear_node(int cpu) -{ - numa_set_node(cpu, NUMA_NO_NODE); -} - -#ifndef CONFIG_DEBUG_PER_CPU_MAPS - -void __cpuinit numa_add_cpu(int cpu) -{ - cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); -} - -void __cpuinit numa_remove_cpu(int cpu) -{ - cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); -} - -#else /* CONFIG_DEBUG_PER_CPU_MAPS */ - -/* - * --------- debug versions of the numa functions --------- - */ -static void __cpuinit numa_set_cpumask(int cpu, int enable) -{ - int node = early_cpu_to_node(cpu); - cpumask_t *mask; - char buf[64]; - - if (node_to_cpumask_map == NULL) { - printk(KERN_ERR "node_to_cpumask_map NULL\n"); - dump_stack(); - return; - } - - mask = &node_to_cpumask_map[node]; - if (enable) - cpu_set(cpu, *mask); - else - cpu_clear(cpu, *mask); - - cpulist_scnprintf(buf, sizeof(buf), mask); - printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", - enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); -} - -void __cpuinit numa_add_cpu(int cpu) -{ - numa_set_cpumask(cpu, 1); -} - -void __cpuinit numa_remove_cpu(int cpu) -{ - numa_set_cpumask(cpu, 0); -} - -int cpu_to_node(int cpu) -{ - if (early_per_cpu_ptr(x86_cpu_to_node_map)) { - printk(KERN_WARNING - "cpu_to_node(%d): usage too early!\n", cpu); - dump_stack(); - return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; - } - return per_cpu(x86_cpu_to_node_map, cpu); -} -EXPORT_SYMBOL(cpu_to_node); - -/* - * Same function as cpu_to_node() but used if called before the - * per_cpu areas are setup. - */ -int early_cpu_to_node(int cpu) -{ - if (early_per_cpu_ptr(x86_cpu_to_node_map)) - return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; - - if (!per_cpu_offset(cpu)) { - printk(KERN_WARNING - "early_cpu_to_node(%d): no per_cpu area!\n", cpu); - dump_stack(); - return NUMA_NO_NODE; - } - return per_cpu(x86_cpu_to_node_map, cpu); -} - - -/* empty cpumask */ -static const cpumask_t cpu_mask_none; - -/* - * Returns a pointer to the bitmask of CPUs on Node 'node'. - */ -const cpumask_t *cpumask_of_node(int node) -{ - if (node_to_cpumask_map == NULL) { - printk(KERN_WARNING - "cpumask_of_node(%d): no node_to_cpumask_map!\n", - node); - dump_stack(); - return (const cpumask_t *)&cpu_online_map; - } - if (node >= nr_node_ids) { - printk(KERN_WARNING - "cpumask_of_node(%d): node > nr_node_ids(%d)\n", - node, nr_node_ids); - dump_stack(); - return &cpu_mask_none; - } - return &node_to_cpumask_map[node]; -} -EXPORT_SYMBOL(cpumask_of_node); - -/* - * Returns a bitmask of CPUs on Node 'node'. - * - * Side note: this function creates the returned cpumask on the stack - * so with a high NR_CPUS count, excessive stack space is used. The - * node_to_cpumask_ptr function should be used whenever possible. - */ -cpumask_t node_to_cpumask(int node) -{ - if (node_to_cpumask_map == NULL) { - printk(KERN_WARNING - "node_to_cpumask(%d): no node_to_cpumask_map!\n", node); - dump_stack(); - return cpu_online_map; - } - if (node >= nr_node_ids) { - printk(KERN_WARNING - "node_to_cpumask(%d): node > nr_node_ids(%d)\n", - node, nr_node_ids); - dump_stack(); - return cpu_mask_none; - } - return node_to_cpumask_map[node]; -} -EXPORT_SYMBOL(node_to_cpumask); - -/* - * --------- end of debug versions of the numa functions --------- - */ - -#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ - -#endif /* X86_64_NUMA */ - diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 71a14f89f89..08d140fbc31 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -20,6 +20,12 @@ #include #include +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +# define DBG(x...) printk(KERN_DEBUG x) +#else +# define DBG(x...) +#endif + struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); @@ -33,6 +39,21 @@ int numa_off __initdata; static unsigned long __initdata nodemap_addr; static unsigned long __initdata nodemap_size; +DEFINE_PER_CPU(int, node_number) = 0; +EXPORT_PER_CPU_SYMBOL(node_number); + +/* + * Map cpu index to node index + */ +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); + +/* + * Which logical CPUs are on which nodes + */ +cpumask_t *node_to_cpumask_map; +EXPORT_SYMBOL(node_to_cpumask_map); + /* * Given a shift value, try to populate memnodemap[] * Returns : @@ -640,3 +661,199 @@ void __init init_cpu_to_node(void) #endif +/* + * Allocate node_to_cpumask_map based on number of available nodes + * Requires node_possible_map to be valid. + * + * Note: node_to_cpumask() is not valid until after this is done. + * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) + */ +void __init setup_node_to_cpumask_map(void) +{ + unsigned int node, num = 0; + cpumask_t *map; + + /* setup nr_node_ids if not done yet */ + if (nr_node_ids == MAX_NUMNODES) { + for_each_node_mask(node, node_possible_map) + num = node; + nr_node_ids = num + 1; + } + + /* allocate the map */ + map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); + DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids); + + pr_debug("Node to cpumask map at %p for %d nodes\n", + map, nr_node_ids); + + /* node_to_cpumask() will now work */ + node_to_cpumask_map = map; +} + +void __cpuinit numa_set_node(int cpu, int node) +{ + int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); + + /* early setting, no percpu area yet */ + if (cpu_to_node_map) { + cpu_to_node_map[cpu] = node; + return; + } + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS + if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) { + printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); + dump_stack(); + return; + } +#endif + per_cpu(x86_cpu_to_node_map, cpu) = node; + + if (node != NUMA_NO_NODE) + per_cpu(node_number, cpu) = node; +} + +void __cpuinit numa_clear_node(int cpu) +{ + numa_set_node(cpu, NUMA_NO_NODE); +} + +#ifndef CONFIG_DEBUG_PER_CPU_MAPS + +void __cpuinit numa_add_cpu(int cpu) +{ + cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} + +#else /* CONFIG_DEBUG_PER_CPU_MAPS */ + +/* + * --------- debug versions of the numa functions --------- + */ +static void __cpuinit numa_set_cpumask(int cpu, int enable) +{ + int node = early_cpu_to_node(cpu); + cpumask_t *mask; + char buf[64]; + + if (node_to_cpumask_map == NULL) { + printk(KERN_ERR "node_to_cpumask_map NULL\n"); + dump_stack(); + return; + } + + mask = &node_to_cpumask_map[node]; + if (enable) + cpu_set(cpu, *mask); + else + cpu_clear(cpu, *mask); + + cpulist_scnprintf(buf, sizeof(buf), mask); + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", + enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); +} + +void __cpuinit numa_add_cpu(int cpu) +{ + numa_set_cpumask(cpu, 1); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + numa_set_cpumask(cpu, 0); +} + +int cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) { + printk(KERN_WARNING + "cpu_to_node(%d): usage too early!\n", cpu); + dump_stack(); + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} +EXPORT_SYMBOL(cpu_to_node); + +/* + * Same function as cpu_to_node() but used if called before the + * per_cpu areas are setup. + */ +int early_cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + + if (!per_cpu_offset(cpu)) { + printk(KERN_WARNING + "early_cpu_to_node(%d): no per_cpu area!\n", cpu); + dump_stack(); + return NUMA_NO_NODE; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} + + +/* empty cpumask */ +static const cpumask_t cpu_mask_none; + +/* + * Returns a pointer to the bitmask of CPUs on Node 'node'. + */ +const cpumask_t *cpumask_of_node(int node) +{ + if (node_to_cpumask_map == NULL) { + printk(KERN_WARNING + "cpumask_of_node(%d): no node_to_cpumask_map!\n", + node); + dump_stack(); + return (const cpumask_t *)&cpu_online_map; + } + if (node >= nr_node_ids) { + printk(KERN_WARNING + "cpumask_of_node(%d): node > nr_node_ids(%d)\n", + node, nr_node_ids); + dump_stack(); + return &cpu_mask_none; + } + return &node_to_cpumask_map[node]; +} +EXPORT_SYMBOL(cpumask_of_node); + +/* + * Returns a bitmask of CPUs on Node 'node'. + * + * Side note: this function creates the returned cpumask on the stack + * so with a high NR_CPUS count, excessive stack space is used. The + * node_to_cpumask_ptr function should be used whenever possible. + */ +cpumask_t node_to_cpumask(int node) +{ + if (node_to_cpumask_map == NULL) { + printk(KERN_WARNING + "node_to_cpumask(%d): no node_to_cpumask_map!\n", node); + dump_stack(); + return cpu_online_map; + } + if (node >= nr_node_ids) { + printk(KERN_WARNING + "node_to_cpumask(%d): node > nr_node_ids(%d)\n", + node, nr_node_ids); + dump_stack(); + return cpu_mask_none; + } + return node_to_cpumask_map[node]; +} +EXPORT_SYMBOL(node_to_cpumask); + +/* + * --------- end of debug versions of the numa functions --------- + */ + +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ -- cgit v1.2.3-70-g09d2 From dac5f4121df3c39fdb2ea57acd669a0ae19e46f8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 Jan 2009 15:42:24 +0100 Subject: x86, apic: untangle the send_IPI_*() jungle Our send_IPI_*() methods and definitions are a twisted mess: the same symbol is defined to different things depending on .config details, in a non-transparent way. - spread out the quirks into separately named per apic driver methods - prefix the standard PC methods with default_ - get rid of wrapper macro obfuscation - clean up various details Signed-off-by: Ingo Molnar --- arch/x86/include/asm/bigsmp/ipi.h | 16 +++++------ arch/x86/include/asm/es7000/ipi.h | 16 +++++------ arch/x86/include/asm/hw_irq.h | 4 +-- arch/x86/include/asm/ipi.h | 38 +++++++++++++------------ arch/x86/include/asm/mach-default/mach_apic.h | 1 - arch/x86/include/asm/mach-default/mach_ipi.h | 40 ++++++++++++--------------- arch/x86/include/asm/mach-generic/mach_ipi.h | 4 --- arch/x86/include/asm/numaq/ipi.h | 16 +++++------ arch/x86/include/asm/summit/ipi.h | 16 +++++------ arch/x86/kernel/apic.c | 2 +- arch/x86/kernel/genapic_64.c | 2 +- arch/x86/kernel/genapic_flat_64.c | 24 +++++++++------- arch/x86/kernel/genx2apic_cluster.c | 38 +++++++++++++------------ arch/x86/kernel/genx2apic_phys.c | 36 ++++++++++-------------- arch/x86/kernel/genx2apic_uv_x.c | 21 ++++++++------ arch/x86/kernel/io_apic.c | 10 +++---- arch/x86/kernel/ipi.c | 28 ++++++++++--------- arch/x86/kernel/kgdb.c | 2 +- arch/x86/kernel/reboot.c | 2 +- arch/x86/kernel/smp.c | 10 +++---- arch/x86/mach-generic/bigsmp.c | 6 ++-- arch/x86/mach-generic/default.c | 6 ++-- arch/x86/mach-generic/es7000.c | 6 ++-- arch/x86/mach-generic/numaq.c | 6 ++-- arch/x86/mach-generic/summit.c | 6 ++-- arch/x86/mm/tlb.c | 2 +- 26 files changed, 178 insertions(+), 180 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/bigsmp/ipi.h b/arch/x86/include/asm/bigsmp/ipi.h index 27fcd01b3ae..a91db69cda6 100644 --- a/arch/x86/include/asm/bigsmp/ipi.h +++ b/arch/x86/include/asm/bigsmp/ipi.h @@ -1,22 +1,22 @@ #ifndef __ASM_MACH_IPI_H #define __ASM_MACH_IPI_H -void send_IPI_mask_sequence(const struct cpumask *mask, int vector); -void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); +void default_send_IPI_mask_sequence(const struct cpumask *mask, int vector); +void default_send_IPI_mask_allbutself(const struct cpumask *mask, int vector); -static inline void send_IPI_mask(const struct cpumask *mask, int vector) +static inline void default_send_IPI_mask(const struct cpumask *mask, int vector) { - send_IPI_mask_sequence(mask, vector); + default_send_IPI_mask_sequence(mask, vector); } -static inline void send_IPI_allbutself(int vector) +static inline void bigsmp_send_IPI_allbutself(int vector) { - send_IPI_mask_allbutself(cpu_online_mask, vector); + default_send_IPI_mask_allbutself(cpu_online_mask, vector); } -static inline void send_IPI_all(int vector) +static inline void bigsmp_send_IPI_all(int vector) { - send_IPI_mask(cpu_online_mask, vector); + default_send_IPI_mask(cpu_online_mask, vector); } #endif /* __ASM_MACH_IPI_H */ diff --git a/arch/x86/include/asm/es7000/ipi.h b/arch/x86/include/asm/es7000/ipi.h index 7e8ed24d4b8..81e77c812ba 100644 --- a/arch/x86/include/asm/es7000/ipi.h +++ b/arch/x86/include/asm/es7000/ipi.h @@ -1,22 +1,22 @@ #ifndef __ASM_ES7000_IPI_H #define __ASM_ES7000_IPI_H -void send_IPI_mask_sequence(const struct cpumask *mask, int vector); -void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); +void default_send_IPI_mask_sequence(const struct cpumask *mask, int vector); +void default_send_IPI_mask_allbutself(const struct cpumask *mask, int vector); -static inline void send_IPI_mask(const struct cpumask *mask, int vector) +static inline void es7000_send_IPI_mask(const struct cpumask *mask, int vector) { - send_IPI_mask_sequence(mask, vector); + default_send_IPI_mask_sequence(mask, vector); } -static inline void send_IPI_allbutself(int vector) +static inline void es7000_send_IPI_allbutself(int vector) { - send_IPI_mask_allbutself(cpu_online_mask, vector); + default_send_IPI_mask_allbutself(cpu_online_mask, vector); } -static inline void send_IPI_all(int vector) +static inline void es7000_send_IPI_all(int vector) { - send_IPI_mask(cpu_online_mask, vector); + es7000_send_IPI_mask(cpu_online_mask, vector); } #endif /* __ASM_ES7000_IPI_H */ diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 8de644b6b95..bfa921fad13 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -73,9 +73,9 @@ extern void enable_IO_APIC(void); /* IPI functions */ #ifdef CONFIG_X86_32 -extern void send_IPI_self(int vector); +extern void default_send_IPI_self(int vector); #endif -extern void send_IPI(int dest, int vector); +extern void default_send_IPI(int dest, int vector); /* Statistics */ extern atomic_t irq_err_count; diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index c745a306f7d..a8d717f2c7e 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -55,8 +55,9 @@ static inline void __xapic_wait_icr_idle(void) cpu_relax(); } -static inline void __send_IPI_shortcut(unsigned int shortcut, int vector, - unsigned int dest) +static inline void +__default_send_IPI_shortcut(unsigned int shortcut, + int vector, unsigned int dest) { /* * Subtle. In the case of the 'never do double writes' workaround @@ -87,8 +88,8 @@ static inline void __send_IPI_shortcut(unsigned int shortcut, int vector, * This is used to send an IPI with no shorthand notation (the destination is * specified in bits 56 to 63 of the ICR). */ -static inline void __send_IPI_dest_field(unsigned int mask, int vector, - unsigned int dest) +static inline void + __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest) { unsigned long cfg; @@ -117,11 +118,11 @@ static inline void __send_IPI_dest_field(unsigned int mask, int vector, native_apic_mem_write(APIC_ICR, cfg); } -static inline void send_IPI_mask_sequence(const struct cpumask *mask, - int vector) +static inline void +default_send_IPI_mask_sequence(const struct cpumask *mask, int vector) { - unsigned long flags; unsigned long query_cpu; + unsigned long flags; /* * Hack. The clustered APIC addressing mode doesn't allow us to send @@ -130,27 +131,28 @@ static inline void send_IPI_mask_sequence(const struct cpumask *mask, */ local_irq_save(flags); for_each_cpu(query_cpu, mask) { - __send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu), - vector, APIC_DEST_PHYSICAL); + __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, + query_cpu), vector, APIC_DEST_PHYSICAL); } local_irq_restore(flags); } -static inline void send_IPI_mask_allbutself(const struct cpumask *mask, - int vector) +static inline void +default_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) { - unsigned long flags; - unsigned int query_cpu; unsigned int this_cpu = smp_processor_id(); + unsigned int query_cpu; + unsigned long flags; /* See Hack comment above */ local_irq_save(flags); - for_each_cpu(query_cpu, mask) - if (query_cpu != this_cpu) - __send_IPI_dest_field( - per_cpu(x86_cpu_to_apicid, query_cpu), - vector, APIC_DEST_PHYSICAL); + for_each_cpu(query_cpu, mask) { + if (query_cpu == this_cpu) + continue; + __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, + query_cpu), vector, APIC_DEST_PHYSICAL); + } local_irq_restore(flags); } diff --git a/arch/x86/include/asm/mach-default/mach_apic.h b/arch/x86/include/asm/mach-default/mach_apic.h index 8972f843414..2e4104cf348 100644 --- a/arch/x86/include/asm/mach-default/mach_apic.h +++ b/arch/x86/include/asm/mach-default/mach_apic.h @@ -20,7 +20,6 @@ static inline const struct cpumask *default_target_cpus(void) #ifdef CONFIG_X86_64 #include #define read_apic_id() (apic->get_apic_id(apic_read(APIC_ID))) -#define send_IPI_self (apic->send_IPI_self) #define wakeup_secondary_cpu (apic->wakeup_cpu) extern void default_setup_apic_routing(void); #else diff --git a/arch/x86/include/asm/mach-default/mach_ipi.h b/arch/x86/include/asm/mach-default/mach_ipi.h index 089399643df..85dec630c69 100644 --- a/arch/x86/include/asm/mach-default/mach_ipi.h +++ b/arch/x86/include/asm/mach-default/mach_ipi.h @@ -4,45 +4,40 @@ /* Avoid include hell */ #define NMI_VECTOR 0x02 -void send_IPI_mask_bitmask(const struct cpumask *mask, int vector); -void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); -void __send_IPI_shortcut(unsigned int shortcut, int vector); +void default_send_IPI_mask_bitmask(const struct cpumask *mask, int vector); +void default_send_IPI_mask_allbutself(const struct cpumask *mask, int vector); +void __default_send_IPI_shortcut(unsigned int shortcut, int vector); extern int no_broadcast; #ifdef CONFIG_X86_64 #include -#define send_IPI_mask (apic->send_IPI_mask) -#define send_IPI_mask_allbutself (apic->send_IPI_mask_allbutself) #else -static inline void send_IPI_mask(const struct cpumask *mask, int vector) +static inline void default_send_IPI_mask(const struct cpumask *mask, int vector) { - send_IPI_mask_bitmask(mask, vector); + default_send_IPI_mask_bitmask(mask, vector); } -void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); +void default_send_IPI_mask_allbutself(const struct cpumask *mask, int vector); #endif -static inline void __local_send_IPI_allbutself(int vector) +static inline void __default_local_send_IPI_allbutself(int vector) { if (no_broadcast || vector == NMI_VECTOR) - send_IPI_mask_allbutself(cpu_online_mask, vector); + apic->send_IPI_mask_allbutself(cpu_online_mask, vector); else - __send_IPI_shortcut(APIC_DEST_ALLBUT, vector); + __default_send_IPI_shortcut(APIC_DEST_ALLBUT, vector); } -static inline void __local_send_IPI_all(int vector) +static inline void __default_local_send_IPI_all(int vector) { if (no_broadcast || vector == NMI_VECTOR) - send_IPI_mask(cpu_online_mask, vector); + apic->send_IPI_mask(cpu_online_mask, vector); else - __send_IPI_shortcut(APIC_DEST_ALLINC, vector); + __default_send_IPI_shortcut(APIC_DEST_ALLINC, vector); } -#ifdef CONFIG_X86_64 -#define send_IPI_allbutself (apic->send_IPI_allbutself) -#define send_IPI_all (apic->send_IPI_all) -#else -static inline void send_IPI_allbutself(int vector) +#ifdef CONFIG_X86_32 +static inline void default_send_IPI_allbutself(int vector) { /* * if there are no other CPUs in the system then we get an APIC send @@ -51,13 +46,12 @@ static inline void send_IPI_allbutself(int vector) if (!(num_online_cpus() > 1)) return; - __local_send_IPI_allbutself(vector); - return; + __default_local_send_IPI_allbutself(vector); } -static inline void send_IPI_all(int vector) +static inline void default_send_IPI_all(int vector) { - __local_send_IPI_all(vector); + __default_local_send_IPI_all(vector); } #endif diff --git a/arch/x86/include/asm/mach-generic/mach_ipi.h b/arch/x86/include/asm/mach-generic/mach_ipi.h index 75e54bd6cbd..5691c09645c 100644 --- a/arch/x86/include/asm/mach-generic/mach_ipi.h +++ b/arch/x86/include/asm/mach-generic/mach_ipi.h @@ -3,8 +3,4 @@ #include -#define send_IPI_mask (apic->send_IPI_mask) -#define send_IPI_allbutself (apic->send_IPI_allbutself) -#define send_IPI_all (apic->send_IPI_all) - #endif /* _ASM_X86_MACH_GENERIC_MACH_IPI_H */ diff --git a/arch/x86/include/asm/numaq/ipi.h b/arch/x86/include/asm/numaq/ipi.h index a8374c65277..5dbc4b4cd5e 100644 --- a/arch/x86/include/asm/numaq/ipi.h +++ b/arch/x86/include/asm/numaq/ipi.h @@ -1,22 +1,22 @@ #ifndef __ASM_NUMAQ_IPI_H #define __ASM_NUMAQ_IPI_H -void send_IPI_mask_sequence(const struct cpumask *mask, int vector); -void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); +void default_send_IPI_mask_sequence(const struct cpumask *mask, int vector); +void default_send_IPI_mask_allbutself(const struct cpumask *mask, int vector); -static inline void send_IPI_mask(const struct cpumask *mask, int vector) +static inline void numaq_send_IPI_mask(const struct cpumask *mask, int vector) { - send_IPI_mask_sequence(mask, vector); + default_send_IPI_mask_sequence(mask, vector); } -static inline void send_IPI_allbutself(int vector) +static inline void numaq_send_IPI_allbutself(int vector) { - send_IPI_mask_allbutself(cpu_online_mask, vector); + default_send_IPI_mask_allbutself(cpu_online_mask, vector); } -static inline void send_IPI_all(int vector) +static inline void numaq_send_IPI_all(int vector) { - send_IPI_mask(cpu_online_mask, vector); + numaq_send_IPI_mask(cpu_online_mask, vector); } #endif /* __ASM_NUMAQ_IPI_H */ diff --git a/arch/x86/include/asm/summit/ipi.h b/arch/x86/include/asm/summit/ipi.h index a8a2c24f50c..f87a43fe0ae 100644 --- a/arch/x86/include/asm/summit/ipi.h +++ b/arch/x86/include/asm/summit/ipi.h @@ -1,26 +1,26 @@ #ifndef __ASM_SUMMIT_IPI_H #define __ASM_SUMMIT_IPI_H -void send_IPI_mask_sequence(const cpumask_t *mask, int vector); -void send_IPI_mask_allbutself(const cpumask_t *mask, int vector); +void default_send_IPI_mask_sequence(const cpumask_t *mask, int vector); +void default_send_IPI_mask_allbutself(const cpumask_t *mask, int vector); -static inline void send_IPI_mask(const cpumask_t *mask, int vector) +static inline void summit_send_IPI_mask(const cpumask_t *mask, int vector) { - send_IPI_mask_sequence(mask, vector); + default_send_IPI_mask_sequence(mask, vector); } -static inline void send_IPI_allbutself(int vector) +static inline void summit_send_IPI_allbutself(int vector) { cpumask_t mask = cpu_online_map; cpu_clear(smp_processor_id(), mask); if (!cpus_empty(mask)) - send_IPI_mask(&mask, vector); + summit_send_IPI_mask(&mask, vector); } -static inline void send_IPI_all(int vector) +static inline void summit_send_IPI_all(int vector) { - send_IPI_mask(&cpu_online_map, vector); + summit_send_IPI_mask(&cpu_online_map, vector); } #endif /* __ASM_SUMMIT_IPI_H */ diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 5f7f3a9a47a..84b8e7c57ab 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -475,7 +475,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, static void lapic_timer_broadcast(const struct cpumask *mask) { #ifdef CONFIG_SMP - send_IPI_mask(mask, LOCAL_TIMER_VECTOR); + apic->send_IPI_mask(mask, LOCAL_TIMER_VECTOR); #endif } diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index d57d2138f07..820dea5d0eb 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -65,7 +65,7 @@ void __init default_setup_apic_routing(void) void apic_send_IPI_self(int vector) { - __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); + __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); } int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index b941b112caf..7c648ccea51 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -74,7 +74,7 @@ static inline void _flat_send_IPI_mask(unsigned long mask, int vector) unsigned long flags; local_irq_save(flags); - __send_IPI_dest_field(mask, vector, apic->dest_logical); + __default_send_IPI_dest_field(mask, vector, apic->dest_logical); local_irq_restore(flags); } @@ -85,14 +85,15 @@ static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector) _flat_send_IPI_mask(mask, vector); } -static void flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, - int vector) +static void + flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { unsigned long mask = cpumask_bits(cpumask)[0]; int cpu = smp_processor_id(); if (cpu < BITS_PER_LONG) clear_bit(cpu, &mask); + _flat_send_IPI_mask(mask, vector); } @@ -114,16 +115,19 @@ static void flat_send_IPI_allbutself(int vector) _flat_send_IPI_mask(mask, vector); } } else if (num_online_cpus() > 1) { - __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, apic->dest_logical); + __default_send_IPI_shortcut(APIC_DEST_ALLBUT, + vector, apic->dest_logical); } } static void flat_send_IPI_all(int vector) { - if (vector == NMI_VECTOR) + if (vector == NMI_VECTOR) { flat_send_IPI_mask(cpu_online_mask, vector); - else - __send_IPI_shortcut(APIC_DEST_ALLINC, vector, apic->dest_logical); + } else { + __default_send_IPI_shortcut(APIC_DEST_ALLINC, + vector, apic->dest_logical); + } } static unsigned int flat_get_apic_id(unsigned long x) @@ -265,18 +269,18 @@ static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask) static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector) { - send_IPI_mask_sequence(cpumask, vector); + default_send_IPI_mask_sequence(cpumask, vector); } static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { - send_IPI_mask_allbutself(cpumask, vector); + default_send_IPI_mask_allbutself(cpumask, vector); } static void physflat_send_IPI_allbutself(int vector) { - send_IPI_mask_allbutself(cpu_online_mask, vector); + default_send_IPI_mask_allbutself(cpu_online_mask, vector); } static void physflat_send_IPI_all(int vector) diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c index 62f9fccf01d..2d97726a973 100644 --- a/arch/x86/kernel/genx2apic_cluster.c +++ b/arch/x86/kernel/genx2apic_cluster.c @@ -36,8 +36,8 @@ static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) cpumask_set_cpu(cpu, retmask); } -static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, - unsigned int dest) +static void + __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) { unsigned long cfg; @@ -57,45 +57,50 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, */ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) { - unsigned long flags; unsigned long query_cpu; + unsigned long flags; local_irq_save(flags); - for_each_cpu(query_cpu, mask) + for_each_cpu(query_cpu, mask) { __x2apic_send_IPI_dest( per_cpu(x86_cpu_to_logical_apicid, query_cpu), vector, apic->dest_logical); + } local_irq_restore(flags); } -static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, - int vector) +static void + x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) { - unsigned long flags; - unsigned long query_cpu; unsigned long this_cpu = smp_processor_id(); + unsigned long query_cpu; + unsigned long flags; local_irq_save(flags); - for_each_cpu(query_cpu, mask) - if (query_cpu != this_cpu) - __x2apic_send_IPI_dest( + for_each_cpu(query_cpu, mask) { + if (query_cpu == this_cpu) + continue; + __x2apic_send_IPI_dest( per_cpu(x86_cpu_to_logical_apicid, query_cpu), vector, apic->dest_logical); + } local_irq_restore(flags); } static void x2apic_send_IPI_allbutself(int vector) { - unsigned long flags; - unsigned long query_cpu; unsigned long this_cpu = smp_processor_id(); + unsigned long query_cpu; + unsigned long flags; local_irq_save(flags); - for_each_online_cpu(query_cpu) - if (query_cpu != this_cpu) - __x2apic_send_IPI_dest( + for_each_online_cpu(query_cpu) { + if (query_cpu == this_cpu) + continue; + __x2apic_send_IPI_dest( per_cpu(x86_cpu_to_logical_apicid, query_cpu), vector, apic->dest_logical); + } local_irq_restore(flags); } @@ -175,7 +180,6 @@ static void init_x2apic_ldr(void) int cpu = smp_processor_id(); per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR); - return; } struct genapic apic_x2apic_cluster = { diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c index 3da1675b260..74777c27669 100644 --- a/arch/x86/kernel/genx2apic_phys.c +++ b/arch/x86/kernel/genx2apic_phys.c @@ -55,8 +55,8 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) { - unsigned long flags; unsigned long query_cpu; + unsigned long flags; local_irq_save(flags); for_each_cpu(query_cpu, mask) { @@ -66,12 +66,12 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) local_irq_restore(flags); } -static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, - int vector) +static void + x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) { - unsigned long flags; - unsigned long query_cpu; unsigned long this_cpu = smp_processor_id(); + unsigned long query_cpu; + unsigned long flags; local_irq_save(flags); for_each_cpu(query_cpu, mask) { @@ -85,16 +85,17 @@ static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, static void x2apic_send_IPI_allbutself(int vector) { - unsigned long flags; - unsigned long query_cpu; unsigned long this_cpu = smp_processor_id(); + unsigned long query_cpu; + unsigned long flags; local_irq_save(flags); - for_each_online_cpu(query_cpu) - if (query_cpu != this_cpu) - __x2apic_send_IPI_dest( - per_cpu(x86_cpu_to_apicid, query_cpu), - vector, APIC_DEST_PHYSICAL); + for_each_online_cpu(query_cpu) { + if (query_cpu == this_cpu) + continue; + __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), + vector, APIC_DEST_PHYSICAL); + } local_irq_restore(flags); } @@ -145,18 +146,12 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, static unsigned int x2apic_phys_get_apic_id(unsigned long x) { - unsigned int id; - - id = x; - return id; + return x; } static unsigned long set_apic_id(unsigned int id) { - unsigned long x; - - x = id; - return x; + return id; } static int x2apic_phys_pkg_id(int initial_apicid, int index_msb) @@ -171,7 +166,6 @@ static void x2apic_send_IPI_self(int vector) static void init_x2apic_ldr(void) { - return; } struct genapic apic_x2apic_phys = { diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index f957878c21e..24b9f42db9b 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -118,12 +118,13 @@ static void uv_send_IPI_one(int cpu, int vector) int pnode; apicid = per_cpu(x86_cpu_to_apicid, cpu); - lapicid = apicid & 0x3f; /* ZZZ macro needed */ + lapicid = apicid & 0x3f; /* ZZZ macro needed */ pnode = uv_apicid_to_pnode(apicid); - val = - (1UL << UVH_IPI_INT_SEND_SHFT) | (lapicid << - UVH_IPI_INT_APIC_ID_SHFT) | - (vector << UVH_IPI_INT_VECTOR_SHFT); + + val = ( 1UL << UVH_IPI_INT_SEND_SHFT ) | + ( lapicid << UVH_IPI_INT_APIC_ID_SHFT ) | + ( vector << UVH_IPI_INT_VECTOR_SHFT ); + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } @@ -137,22 +138,24 @@ static void uv_send_IPI_mask(const struct cpumask *mask, int vector) static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) { - unsigned int cpu; unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; - for_each_cpu(cpu, mask) + for_each_cpu(cpu, mask) { if (cpu != this_cpu) uv_send_IPI_one(cpu, vector); + } } static void uv_send_IPI_allbutself(int vector) { - unsigned int cpu; unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { if (cpu != this_cpu) uv_send_IPI_one(cpu, vector); + } } static void uv_send_IPI_all(int vector) diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 01a2505d727..e90970ce21a 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -514,11 +514,11 @@ static void send_cleanup_vector(struct irq_cfg *cfg) for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) cfg->move_cleanup_count++; for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); + apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); } else { cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); cfg->move_cleanup_count = cpumask_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); free_cpumask_var(cleanup_mask); } cfg->move_in_progress = 0; @@ -800,7 +800,7 @@ static void clear_IO_APIC (void) } #if !defined(CONFIG_SMP) && defined(CONFIG_X86_32) -void send_IPI_self(int vector) +void default_send_IPI_self(int vector) { unsigned int cfg; @@ -2297,7 +2297,7 @@ static int ioapic_retrigger_irq(unsigned int irq) unsigned long flags; spin_lock_irqsave(&vector_lock, flags); - send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); + apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); spin_unlock_irqrestore(&vector_lock, flags); return 1; @@ -2305,7 +2305,7 @@ static int ioapic_retrigger_irq(unsigned int irq) #else static int ioapic_retrigger_irq(unsigned int irq) { - send_IPI_self(irq_cfg(irq)->vector); + apic->send_IPI_self(irq_cfg(irq)->vector); return 1; } diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c index 367c5e684fa..e16c41b2e4e 100644 --- a/arch/x86/kernel/ipi.c +++ b/arch/x86/kernel/ipi.c @@ -48,7 +48,7 @@ static inline int __prepare_ICR2(unsigned int mask) return SET_APIC_DEST_FIELD(mask); } -void __send_IPI_shortcut(unsigned int shortcut, int vector) +void __default_send_IPI_shortcut(unsigned int shortcut, int vector) { /* * Subtle. In the case of the 'never do double writes' workaround @@ -75,16 +75,16 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector) apic_write(APIC_ICR, cfg); } -void send_IPI_self(int vector) +void default_send_IPI_self(int vector) { - __send_IPI_shortcut(APIC_DEST_SELF, vector); + __default_send_IPI_shortcut(APIC_DEST_SELF, vector); } /* * This is used to send an IPI with no shorthand notation (the destination is * specified in bits 56 to 63 of the ICR). */ -static inline void __send_IPI_dest_field(unsigned long mask, int vector) +static inline void __default_send_IPI_dest_field(unsigned long mask, int vector) { unsigned long cfg; @@ -116,18 +116,18 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector) /* * This is only used on smaller machines. */ -void send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector) +void default_send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector) { unsigned long mask = cpumask_bits(cpumask)[0]; unsigned long flags; local_irq_save(flags); WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); - __send_IPI_dest_field(mask, vector); + __default_send_IPI_dest_field(mask, vector); local_irq_restore(flags); } -void send_IPI_mask_sequence(const struct cpumask *mask, int vector) +void default_send_IPI_mask_sequence(const struct cpumask *mask, int vector) { unsigned long flags; unsigned int query_cpu; @@ -140,11 +140,11 @@ void send_IPI_mask_sequence(const struct cpumask *mask, int vector) local_irq_save(flags); for_each_cpu(query_cpu, mask) - __send_IPI_dest_field(apic->cpu_to_logical_apicid(query_cpu), vector); + __default_send_IPI_dest_field(apic->cpu_to_logical_apicid(query_cpu), vector); local_irq_restore(flags); } -void send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +void default_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) { unsigned long flags; unsigned int query_cpu; @@ -153,10 +153,12 @@ void send_IPI_mask_allbutself(const struct cpumask *mask, int vector) /* See Hack comment above */ local_irq_save(flags); - for_each_cpu(query_cpu, mask) - if (query_cpu != this_cpu) - __send_IPI_dest_field(apic->cpu_to_logical_apicid(query_cpu), - vector); + for_each_cpu(query_cpu, mask) { + if (query_cpu == this_cpu) + continue; + __default_send_IPI_dest_field( + apic->cpu_to_logical_apicid(query_cpu), vector); + } local_irq_restore(flags); } diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 10435a120d2..b62a3811e01 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -347,7 +347,7 @@ void kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code) */ void kgdb_roundup_cpus(unsigned long flags) { - send_IPI_allbutself(APIC_DM_NMI); + apic->send_IPI_allbutself(APIC_DM_NMI); } #endif diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index f8536fee5c1..38dace28d62 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -651,7 +651,7 @@ static int crash_nmi_callback(struct notifier_block *self, static void smp_send_nmi_allbutself(void) { - send_IPI_allbutself(NMI_VECTOR); + apic->send_IPI_allbutself(NMI_VECTOR); } static struct notifier_block crash_nmi_nb = { diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index e6faa3316bd..c48ba6cc32a 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -118,12 +118,12 @@ static void native_smp_send_reschedule(int cpu) WARN_ON(1); return; } - send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); + apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); } void native_send_call_func_single_ipi(int cpu) { - send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR); + apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR); } void native_send_call_func_ipi(const struct cpumask *mask) @@ -131,7 +131,7 @@ void native_send_call_func_ipi(const struct cpumask *mask) cpumask_var_t allbutself; if (!alloc_cpumask_var(&allbutself, GFP_ATOMIC)) { - send_IPI_mask(mask, CALL_FUNCTION_VECTOR); + apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR); return; } @@ -140,9 +140,9 @@ void native_send_call_func_ipi(const struct cpumask *mask) if (cpumask_equal(mask, allbutself) && cpumask_equal(cpu_online_mask, cpu_callout_mask)) - send_IPI_allbutself(CALL_FUNCTION_VECTOR); + apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR); else - send_IPI_mask(mask, CALL_FUNCTION_VECTOR); + apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR); free_cpumask_var(allbutself); } diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c index 22c2c7b8e4a..4782b554788 100644 --- a/arch/x86/mach-generic/bigsmp.c +++ b/arch/x86/mach-generic/bigsmp.c @@ -97,10 +97,10 @@ struct genapic apic_bigsmp = { .cpu_mask_to_apicid = bigsmp_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = bigsmp_cpu_mask_to_apicid_and, - .send_IPI_mask = send_IPI_mask, + .send_IPI_mask = default_send_IPI_mask, .send_IPI_mask_allbutself = NULL, - .send_IPI_allbutself = send_IPI_allbutself, - .send_IPI_all = send_IPI_all, + .send_IPI_allbutself = bigsmp_send_IPI_allbutself, + .send_IPI_all = bigsmp_send_IPI_all, .send_IPI_self = NULL, .wakeup_cpu = NULL, diff --git a/arch/x86/mach-generic/default.c b/arch/x86/mach-generic/default.c index 477ebec1674..bf4670db25f 100644 --- a/arch/x86/mach-generic/default.c +++ b/arch/x86/mach-generic/default.c @@ -78,10 +78,10 @@ struct genapic apic_default = { .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, - .send_IPI_mask = send_IPI_mask, + .send_IPI_mask = default_send_IPI_mask, .send_IPI_mask_allbutself = NULL, - .send_IPI_allbutself = send_IPI_allbutself, - .send_IPI_all = send_IPI_all, + .send_IPI_allbutself = default_send_IPI_allbutself, + .send_IPI_all = default_send_IPI_all, .send_IPI_self = NULL, .wakeup_cpu = NULL, diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c index d758cf65d73..d36642e6d90 100644 --- a/arch/x86/mach-generic/es7000.c +++ b/arch/x86/mach-generic/es7000.c @@ -133,10 +133,10 @@ struct genapic apic_es7000 = { .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and, - .send_IPI_mask = send_IPI_mask, + .send_IPI_mask = es7000_send_IPI_mask, .send_IPI_mask_allbutself = NULL, - .send_IPI_allbutself = send_IPI_allbutself, - .send_IPI_all = send_IPI_all, + .send_IPI_allbutself = es7000_send_IPI_allbutself, + .send_IPI_all = es7000_send_IPI_all, .send_IPI_self = NULL, .wakeup_cpu = NULL, diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c index eb7d56a521d..135b1832ad8 100644 --- a/arch/x86/mach-generic/numaq.c +++ b/arch/x86/mach-generic/numaq.c @@ -97,10 +97,10 @@ struct genapic apic_numaq = { .cpu_mask_to_apicid = numaq_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and, - .send_IPI_mask = send_IPI_mask, + .send_IPI_mask = numaq_send_IPI_mask, .send_IPI_mask_allbutself = NULL, - .send_IPI_allbutself = send_IPI_allbutself, - .send_IPI_all = send_IPI_all, + .send_IPI_allbutself = numaq_send_IPI_allbutself, + .send_IPI_all = numaq_send_IPI_all, .send_IPI_self = NULL, .wakeup_cpu = NULL, diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c index 8c293055bdf..77196a4a9d2 100644 --- a/arch/x86/mach-generic/summit.c +++ b/arch/x86/mach-generic/summit.c @@ -77,10 +77,10 @@ struct genapic apic_summit = { .cpu_mask_to_apicid = summit_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and, - .send_IPI_mask = send_IPI_mask, + .send_IPI_mask = summit_send_IPI_mask, .send_IPI_mask_allbutself = NULL, - .send_IPI_allbutself = send_IPI_allbutself, - .send_IPI_all = send_IPI_all, + .send_IPI_allbutself = summit_send_IPI_allbutself, + .send_IPI_all = summit_send_IPI_all, .send_IPI_self = NULL, .wakeup_cpu = NULL, diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 72a6d4ebe34..6348e114692 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -196,7 +196,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, * We have to send the IPI only to * CPUs affected. */ - send_IPI_mask(to_cpumask(f->flush_cpumask), + apic->send_IPI_mask(to_cpumask(f->flush_cpumask), INVALIDATE_TLB_VECTOR_START + sender); while (!cpumask_empty(to_cpumask(f->flush_cpumask))) -- cgit v1.2.3-70-g09d2 From d53e2f2855f1c7c2725d550c1ae6b26f4d671c50 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 Jan 2009 19:14:52 +0100 Subject: x86, smp: remove mach_ipi.h Move mach_ipi.h definitions into genapic.h. Signed-off-by: Ingo Molnar --- arch/x86/include/asm/genapic.h | 1 + arch/x86/include/asm/ipi.h | 61 +++++++++++++++++++++++- arch/x86/include/asm/mach-default/mach_ipi.h | 58 ---------------------- arch/x86/include/asm/mach-generic/gpio.h | 15 ------ arch/x86/include/asm/mach-generic/mach_ipi.h | 6 --- arch/x86/include/asm/mach-generic/mach_wakecpu.h | 4 -- arch/x86/kernel/apic.c | 2 +- arch/x86/kernel/crash.c | 2 +- arch/x86/kernel/io_apic.c | 1 - arch/x86/kernel/ipi.c | 1 - arch/x86/kernel/kgdb.c | 2 +- arch/x86/kernel/reboot.c | 2 +- arch/x86/kernel/smp.c | 1 - arch/x86/kernel/visws_quirks.c | 2 +- arch/x86/mach-default/setup.c | 2 +- arch/x86/mach-generic/default.c | 2 +- arch/x86/mm/tlb.c | 2 +- 17 files changed, 68 insertions(+), 96 deletions(-) delete mode 100644 arch/x86/include/asm/mach-default/mach_ipi.h delete mode 100644 arch/x86/include/asm/mach-generic/gpio.h delete mode 100644 arch/x86/include/asm/mach-generic/mach_ipi.h delete mode 100644 arch/x86/include/asm/mach-generic/mach_wakecpu.h (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/genapic.h b/arch/x86/include/asm/genapic.h index ccfcd19d5b7..273b99452ae 100644 --- a/arch/x86/include/asm/genapic.h +++ b/arch/x86/include/asm/genapic.h @@ -259,4 +259,5 @@ static inline physid_mask_t default_apicid_to_cpu_present(int phys_apicid) } #endif /* CONFIG_X86_LOCAL_APIC */ + #endif /* _ASM_X86_GENAPIC_64_H */ diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index a8d717f2c7e..e2e8e4e0a65 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_IPI_H #define _ASM_X86_IPI_H +#ifdef CONFIG_X86_LOCAL_APIC + /* * Copyright 2004 James Cleverdon, IBM. * Subject to the GNU Public License, v.2 @@ -56,8 +58,7 @@ static inline void __xapic_wait_icr_idle(void) } static inline void -__default_send_IPI_shortcut(unsigned int shortcut, - int vector, unsigned int dest) +__default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest) { /* * Subtle. In the case of the 'never do double writes' workaround @@ -156,4 +157,60 @@ default_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) local_irq_restore(flags); } + +/* Avoid include hell */ +#define NMI_VECTOR 0x02 + +void default_send_IPI_mask_bitmask(const struct cpumask *mask, int vector); +void default_send_IPI_mask_allbutself(const struct cpumask *mask, int vector); + +extern int no_broadcast; + +#ifdef CONFIG_X86_64 +#include +#else +static inline void default_send_IPI_mask(const struct cpumask *mask, int vector) +{ + default_send_IPI_mask_bitmask(mask, vector); +} +void default_send_IPI_mask_allbutself(const struct cpumask *mask, int vector); +#endif + +static inline void __default_local_send_IPI_allbutself(int vector) +{ + if (no_broadcast || vector == NMI_VECTOR) + apic->send_IPI_mask_allbutself(cpu_online_mask, vector); + else + __default_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, apic->dest_logical); +} + +static inline void __default_local_send_IPI_all(int vector) +{ + if (no_broadcast || vector == NMI_VECTOR) + apic->send_IPI_mask(cpu_online_mask, vector); + else + __default_send_IPI_shortcut(APIC_DEST_ALLINC, vector, apic->dest_logical); +} + +#ifdef CONFIG_X86_32 +static inline void default_send_IPI_allbutself(int vector) +{ + /* + * if there are no other CPUs in the system then we get an APIC send + * error if we try to broadcast, thus avoid sending IPIs in this case. + */ + if (!(num_online_cpus() > 1)) + return; + + __default_local_send_IPI_allbutself(vector); +} + +static inline void default_send_IPI_all(int vector) +{ + __default_local_send_IPI_all(vector); +} +#endif + +#endif + #endif /* _ASM_X86_IPI_H */ diff --git a/arch/x86/include/asm/mach-default/mach_ipi.h b/arch/x86/include/asm/mach-default/mach_ipi.h deleted file mode 100644 index 85dec630c69..00000000000 --- a/arch/x86/include/asm/mach-default/mach_ipi.h +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef _ASM_X86_MACH_DEFAULT_MACH_IPI_H -#define _ASM_X86_MACH_DEFAULT_MACH_IPI_H - -/* Avoid include hell */ -#define NMI_VECTOR 0x02 - -void default_send_IPI_mask_bitmask(const struct cpumask *mask, int vector); -void default_send_IPI_mask_allbutself(const struct cpumask *mask, int vector); -void __default_send_IPI_shortcut(unsigned int shortcut, int vector); - -extern int no_broadcast; - -#ifdef CONFIG_X86_64 -#include -#else -static inline void default_send_IPI_mask(const struct cpumask *mask, int vector) -{ - default_send_IPI_mask_bitmask(mask, vector); -} -void default_send_IPI_mask_allbutself(const struct cpumask *mask, int vector); -#endif - -static inline void __default_local_send_IPI_allbutself(int vector) -{ - if (no_broadcast || vector == NMI_VECTOR) - apic->send_IPI_mask_allbutself(cpu_online_mask, vector); - else - __default_send_IPI_shortcut(APIC_DEST_ALLBUT, vector); -} - -static inline void __default_local_send_IPI_all(int vector) -{ - if (no_broadcast || vector == NMI_VECTOR) - apic->send_IPI_mask(cpu_online_mask, vector); - else - __default_send_IPI_shortcut(APIC_DEST_ALLINC, vector); -} - -#ifdef CONFIG_X86_32 -static inline void default_send_IPI_allbutself(int vector) -{ - /* - * if there are no other CPUs in the system then we get an APIC send - * error if we try to broadcast, thus avoid sending IPIs in this case. - */ - if (!(num_online_cpus() > 1)) - return; - - __default_local_send_IPI_allbutself(vector); -} - -static inline void default_send_IPI_all(int vector) -{ - __default_local_send_IPI_all(vector); -} -#endif - -#endif /* _ASM_X86_MACH_DEFAULT_MACH_IPI_H */ diff --git a/arch/x86/include/asm/mach-generic/gpio.h b/arch/x86/include/asm/mach-generic/gpio.h deleted file mode 100644 index 995c45efdb3..00000000000 --- a/arch/x86/include/asm/mach-generic/gpio.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _ASM_X86_MACH_GENERIC_GPIO_H -#define _ASM_X86_MACH_GENERIC_GPIO_H - -int gpio_request(unsigned gpio, const char *label); -void gpio_free(unsigned gpio); -int gpio_direction_input(unsigned gpio); -int gpio_direction_output(unsigned gpio, int value); -int gpio_get_value(unsigned gpio); -void gpio_set_value(unsigned gpio, int value); -int gpio_to_irq(unsigned gpio); -int irq_to_gpio(unsigned irq); - -#include /* cansleep wrappers */ - -#endif /* _ASM_X86_MACH_GENERIC_GPIO_H */ diff --git a/arch/x86/include/asm/mach-generic/mach_ipi.h b/arch/x86/include/asm/mach-generic/mach_ipi.h deleted file mode 100644 index 5691c09645c..00000000000 --- a/arch/x86/include/asm/mach-generic/mach_ipi.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_X86_MACH_GENERIC_MACH_IPI_H -#define _ASM_X86_MACH_GENERIC_MACH_IPI_H - -#include - -#endif /* _ASM_X86_MACH_GENERIC_MACH_IPI_H */ diff --git a/arch/x86/include/asm/mach-generic/mach_wakecpu.h b/arch/x86/include/asm/mach-generic/mach_wakecpu.h deleted file mode 100644 index 0b884c03a3f..00000000000 --- a/arch/x86/include/asm/mach-generic/mach_wakecpu.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H -#define _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H - -#endif /* _ASM_X86_MACH_GENERIC_MACH_APIC_H */ diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 41a0ba34d6b..81efe86eca8 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -49,7 +49,7 @@ #include #include -#include +#include /* * Sanity check diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 11b93cabdf7..ad7f2a696f4 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -28,7 +28,7 @@ #include #include -#include +#include #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index e0744ea6d0f..241a01d6fd4 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -62,7 +62,6 @@ #include #include -#include #include #define __apicdebuginit(type) static type __init diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c index 50076d92fbc..0893fa14458 100644 --- a/arch/x86/kernel/ipi.c +++ b/arch/x86/kernel/ipi.c @@ -20,7 +20,6 @@ #ifdef CONFIG_X86_32 #include -#include /* * the following functions deal with sending IPIs between CPUs. diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index b62a3811e01..5c4f5548384 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -46,7 +46,7 @@ #include #include -#include +#include /* * Put the error code here just in case the user cares: diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 38dace28d62..32e8f0af292 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -24,7 +24,7 @@ # include #endif -#include +#include /* * Power off function, if any diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 892e7c389be..0eb32ae9bf1 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -26,7 +26,6 @@ #include #include #include -#include #include /* * Some notes on x86 processor bugs affecting SMP operation: diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 3bd7f47a91b..4fd646e6dd4 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -32,7 +32,7 @@ #include #include -#include +#include #include diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c index df167f26562..b65ff0bf730 100644 --- a/arch/x86/mach-default/setup.c +++ b/arch/x86/mach-default/setup.c @@ -10,7 +10,7 @@ #include #include -#include +#include #ifdef CONFIG_HOTPLUG_CPU #define DEFAULT_SEND_IPI (1) diff --git a/arch/x86/mach-generic/default.c b/arch/x86/mach-generic/default.c index 7d5123e474e..d9d44c8c3db 100644 --- a/arch/x86/mach-generic/default.c +++ b/arch/x86/mach-generic/default.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include static void default_vector_allocation_domain(int cpu, struct cpumask *retmask) { diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6348e114692..14c5af4d11e 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -14,7 +14,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { &init_mm, 0, }; -#include +#include /* * Smarter SMP flushing macros. * c/o Linus Torvalds. -- cgit v1.2.3-70-g09d2 From 3e5095d15276efd14a45393666b1bb7536bf179f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 27 Jan 2009 17:07:08 +0100 Subject: x86: replace CONFIG_X86_SMP with CONFIG_SMP The x86/Voyager subarch used to have this distinction between 'x86 SMP support' and 'Voyager SMP support': config X86_SMP bool depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64) This is a pointless distinction - Voyager can (and already does) use smp_ops to implement various SMP quirks it has - and it can be extended more to cover all the specialities of Voyager. So remove this complication in the Kconfig space. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 7 +------ arch/x86/Kconfig.debug | 2 +- arch/x86/include/asm/entry_arch.h | 2 +- arch/x86/include/asm/hw_irq.h | 2 +- arch/x86/kernel/Makefile | 4 ++-- arch/x86/kernel/apic.c | 2 +- arch/x86/kernel/cpu/addon_cpuid_features.c | 2 +- arch/x86/kernel/process.c | 2 +- arch/x86/kernel/setup.c | 2 +- arch/x86/kernel/tsc.c | 2 +- arch/x86/kernel/vmiclock_32.c | 2 +- arch/x86/mm/Makefile | 2 +- 12 files changed, 13 insertions(+), 18 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8e6413e0e7c..3671506fb8d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -173,11 +173,6 @@ config GENERIC_PENDING_IRQ depends on GENERIC_HARDIRQS && SMP default y -config X86_SMP - bool - depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64) - default y - config USE_GENERIC_SMP_HELPERS def_bool y depends on SMP @@ -203,7 +198,7 @@ config X86_BIOS_REBOOT config X86_TRAMPOLINE bool - depends on X86_SMP || (X86_VOYAGER && SMP) || (64BIT && ACPI_SLEEP) + depends on SMP || (64BIT && ACPI_SLEEP) default y config KTIME_SCALAR diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 28f111461ca..a38dd6064f1 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -83,7 +83,7 @@ config DEBUG_PAGEALLOC config DEBUG_PER_CPU_MAPS bool "Debug access to per_cpu maps" depends on DEBUG_KERNEL - depends on X86_SMP + depends on SMP default n help Say Y to verify that the per_cpu map being accessed has diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index b87b077cc23..854d538ae85 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -9,7 +9,7 @@ * is no hardware IRQ pin equivalent for them, they are triggered * through the ICC by us (IPIs) */ -#ifdef CONFIG_X86_SMP +#ifdef CONFIG_SMP BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index bfa921fad13..41550797396 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -98,7 +98,7 @@ extern asmlinkage void qic_call_function_interrupt(void); extern void smp_apic_timer_interrupt(struct pt_regs *); extern void smp_spurious_interrupt(struct pt_regs *); extern void smp_error_interrupt(struct pt_regs *); -#ifdef CONFIG_X86_SMP +#ifdef CONFIG_SMP extern void smp_reschedule_interrupt(struct pt_regs *); extern void smp_call_function_interrupt(struct pt_regs *); extern void smp_call_function_single_interrupt(struct pt_regs *); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 1bc4cdc797d..dc05a57a474 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -57,8 +57,8 @@ obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_PCI) += early-quirks.o apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o -obj-$(CONFIG_X86_SMP) += smp.o -obj-$(CONFIG_X86_SMP) += smpboot.o tsc_sync.o ipi.o +obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_SMP) += smpboot.o tsc_sync.o ipi.o obj-$(CONFIG_SMP) += setup_percpu.o obj-$(CONFIG_X86_64_SMP) += tsc_sync.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 81efe86eca8..968c817762a 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -1900,7 +1900,7 @@ void __cpuinit generic_processor_info(int apicid, int version) } #endif -#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64) +#if defined(CONFIG_SMP) || defined(CONFIG_X86_64) early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; #endif diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 4a48bb40974..e48640cfac0 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -69,7 +69,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) */ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_SMP +#ifdef CONFIG_SMP unsigned int eax, ebx, ecx, edx, sub_index; unsigned int ht_mask_width, core_plus_mask_width; unsigned int core_select_mask, core_level_siblings; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e68bb9e3086..89537f678b2 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -344,7 +344,7 @@ static void c1e_idle(void) void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_SMP +#ifdef CONFIG_SMP if (pm_idle == poll_idle && smp_num_siblings > 1) { printk(KERN_WARNING "WARNING: polling idle and HT enabled," " performance may degrade.\n"); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e645d4793e4..eeb180b1d7a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -588,7 +588,7 @@ early_param("elfcorehdr", setup_elfcorehdr); static int __init default_update_genapic(void) { -#ifdef CONFIG_X86_SMP +#ifdef CONFIG_SMP if (!apic->wakeup_cpu) apic->wakeup_cpu = wakeup_secondary_cpu_via_init; #endif diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 599e5816863..83d53ce5d4c 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -773,7 +773,7 @@ __cpuinit int unsynchronized_tsc(void) if (!cpu_has_tsc || tsc_unstable) return 1; -#ifdef CONFIG_X86_SMP +#ifdef CONFIG_SMP if (apic_is_clustered_box()) return 1; #endif diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index c4c1f9e0940..a4791ef412d 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -256,7 +256,7 @@ void __devinit vmi_time_bsp_init(void) */ clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); local_irq_disable(); -#ifdef CONFIG_X86_SMP +#ifdef CONFIG_SMP /* * XXX handle_percpu_irq only defined for SMP; we need to switch over * to using it, since this is a local interrupt, which each CPU must diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 9f05157220f..2b938a38491 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,7 +1,7 @@ obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ pat.o pgtable.o gup.o -obj-$(CONFIG_X86_SMP) += tlb.o +obj-$(CONFIG_SMP) += tlb.o obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o -- cgit v1.2.3-70-g09d2 From 010060741ad35eacb504414bc6fb9bb575b15f62 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 29 Jan 2009 16:02:12 +0100 Subject: x86: add might_sleep() to do_page_fault() Impact: widen debug checks VirtualBox calls do_page_fault() from an atomic context but runs into a might_sleep() way pas this point, cure that. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8f4b859a04b..eb4d7fe0593 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -888,6 +888,12 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) return; } down_read(&mm->mmap_sem); + } else { + /* + * The above down_read_trylock() might have succeeded in which + * case we'll have missed the might_sleep() from down_read(). + */ + might_sleep(); } vma = find_vma(mm, address); -- cgit v1.2.3-70-g09d2 From 8f47e16348e8e25eedf639092a8a2f10a66aba34 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 31 Jan 2009 02:03:42 +0100 Subject: x86: update copyrights Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 2 +- arch/x86/kernel/io_apic.c | 2 +- arch/x86/kernel/mpparse.c | 2 +- arch/x86/kernel/smp.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- arch/x86/kernel/stacktrace.c | 2 +- arch/x86/mm/mmap.c | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 29e7186b0e8..d6da6dd2f60 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -1,7 +1,7 @@ /* * Local APIC handling, local APIC timers * - * (c) 1999, 2000 Ingo Molnar + * (c) 1999, 2000, 2009 Ingo Molnar * * Fixes * Maciej W. Rozycki : Bits for genuine 82489DX APICs; diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 3378ffb2140..57d60c741e3 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -1,7 +1,7 @@ /* * Intel IO-APIC support for multi-Pentium hosts. * - * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo + * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * * Many thanks to Stig Venaas for trying out countless experimental * patches and reporting/debugging problems patiently! diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index b46ca7d31fe..66ebb823f39 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -3,7 +3,7 @@ * compliant MP-table parsing routines. * * (c) 1995 Alan Cox, Building #3 - * (c) 1998, 1999, 2000 Ingo Molnar + * (c) 1998, 1999, 2000, 2009 Ingo Molnar * (c) 2008 Alexey Starikovskiy */ diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 0eb32ae9bf1..eaaffae31cc 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -2,7 +2,7 @@ * Intel SMP support routines. * * (c) 1995 Alan Cox, Building #3 - * (c) 1998-99, 2000 Ingo Molnar + * (c) 1998-99, 2000, 2009 Ingo Molnar * (c) 2002,2003 Andi Kleen, SuSE Labs. * * i386 and x86_64 integration by Glauber Costa diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 1268a862abb..f40f86fec2f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -2,7 +2,7 @@ * x86 SMP booting functions * * (c) 1995 Alan Cox, Building #3 - * (c) 1998, 1999, 2000 Ingo Molnar + * (c) 1998, 1999, 2000, 2009 Ingo Molnar * Copyright 2001 Andi Kleen, SuSE Labs. * * Much of the core SMP work is based on previous work by Thomas Radke, to diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 10786af9554..f7bddc2e37d 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -1,7 +1,7 @@ /* * Stack trace management functions * - * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar */ #include #include diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 56fe7124fbe..16582960056 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -4,7 +4,7 @@ * Based on code by Ingo Molnar and Andi Kleen, copyrighted * as follows: * - * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. + * Copyright 2003-2009 Red Hat Inc. * All Rights Reserved. * Copyright 2005 Andi Kleen, SUSE Labs. * Copyright 2007 Jiri Kosina, SUSE Labs. -- cgit v1.2.3-70-g09d2 From 0973a06cde8cc1522fbcf2baacb926f1ee3f4c79 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 4 Feb 2009 15:24:09 -0800 Subject: x86: mm: introduce helper function in fault.c Impact: cleanup Introduce helper function fault_in_kernel_address() to make editors happy. Signed-off-by: Hiroshi Shimamoto Signed-off-by: H. Peter Anvin --- arch/x86/mm/fault.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index eb4d7fe0593..8e9b0f1fd87 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -775,6 +775,15 @@ static inline int access_error(unsigned long error_code, int write, return 0; } +static int fault_in_kernel_space(unsigned long address) +{ +#ifdef CONFIG_X86_32 + return address >= TASK_SIZE; +#else /* !CONFIG_X86_32 */ + return address >= TASK_SIZE64; +#endif /* CONFIG_X86_32 */ +} + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -817,11 +826,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) * (error_code & 4) == 0, and that the fault was not a * protection error (error_code & 9) == 0. */ -#ifdef CONFIG_X86_32 - if (unlikely(address >= TASK_SIZE)) { -#else - if (unlikely(address >= TASK_SIZE64)) { -#endif + if (unlikely(fault_in_kernel_space(address))) { if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && vmalloc_fault(address) >= 0) return; -- cgit v1.2.3-70-g09d2 From 44581a28e805a31661469c4b466b9cd14b36e7b6 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sun, 8 Feb 2009 09:58:40 -0500 Subject: x86: fix abuse of per_cpu_offset Impact: bug fix Don't use per_cpu_offset() to determine if it valid to access a per-cpu variable for a given cpu number. It is not a valid assumption on x86-64 anymore. Use cpu_possible() instead. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 08d140fbc31..deb1c1ab786 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -702,7 +702,7 @@ void __cpuinit numa_set_node(int cpu, int node) } #ifdef CONFIG_DEBUG_PER_CPU_MAPS - if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) { + if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); dump_stack(); return; @@ -790,7 +790,7 @@ int early_cpu_to_node(int cpu) if (early_per_cpu_ptr(x86_cpu_to_node_map)) return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; - if (!per_cpu_offset(cpu)) { + if (!cpu_possible(cpu)) { printk(KERN_WARNING "early_cpu_to_node(%d): no per_cpu area!\n", cpu); dump_stack(); -- cgit v1.2.3-70-g09d2 From 7651194fb715b2d57658c05a710408f6b8448951 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 11 Feb 2009 22:26:52 +0530 Subject: x86: mm/init_32.c fix compilation warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit arch/x86/mm/init_32.c: In function ‘find_low_pfn_range’: arch/x86/mm/init_32.c:696: warning: format ‘%u’ expects type ‘unsigned int’, but Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 2cef0507441..d48f2560364 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -692,7 +692,7 @@ void __init find_low_pfn_range(void) max_pfn = MAXMEM_PFN + highmem_pages; if (highmem_pages + MAXMEM_PFN > max_pfn) { printk(KERN_WARNING "only %luMB highmem pages " - "available, ignoring highmem size of %uMB.\n", + "available, ignoring highmem size of %luMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages)); highmem_pages = 0; -- cgit v1.2.3-70-g09d2 From 3023533de43c5c01c660e1b48d3700b028eb4615 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 12 Feb 2009 13:28:43 +0100 Subject: x86: fix warning in find_low_pfn_range() Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index d48f2560364..e77459dd38a 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -722,7 +722,7 @@ void __init find_low_pfn_range(void) highmem_pages = 0; #ifdef CONFIG_HIGHMEM if (highmem_pages >= max_pfn) { - printk(KERN_ERR "highmem size specified (%uMB) is " + printk(KERN_ERR "highmem size specified (%luMB) is " "bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn)); @@ -731,7 +731,7 @@ void __init find_low_pfn_range(void) if (highmem_pages) { if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE){ - printk(KERN_ERR "highmem size %uMB results in " + printk(KERN_ERR "highmem size %luMB results in " "smaller than 64MB lowmem, ignoring it.\n" , pages_to_mb(highmem_pages)); highmem_pages = 0; -- cgit v1.2.3-70-g09d2 From 4769843bc265a9c24584b98709cf39e1df5c1404 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 12 Feb 2009 13:31:41 +0100 Subject: x86, 32-bit: clean up find_low_pfn_range() Impact: cleanup Split find_low_pfn_range() into two functions: - lowmem_pfn_init() - highmem_pfn_init() The former gets called if all of RAM fits into lowmem, otherwise we call highmem_pfn_init(). Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 137 +++++++++++++++++++++++++++++--------------------- 1 file changed, 79 insertions(+), 58 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index e77459dd38a..9d36eb9ebd5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -675,75 +675,96 @@ static int __init parse_highmem(char *arg) } early_param("highmem", parse_highmem); +#define MSG_HIGHMEM_TOO_BIG \ + "highmem size (%luMB) is bigger than pages available (%luMB)!\n" + +#define MSG_LOWMEM_TOO_SMALL \ + "highmem size (%luMB) results in <64MB lowmem, ignoring it!\n" /* - * Determine low and high memory ranges: + * All of RAM fits into lowmem - but if user wants highmem + * artificially via the highmem=x boot parameter then create + * it: */ -void __init find_low_pfn_range(void) +void __init lowmem_pfn_init(void) { - /* it could update max_pfn */ - - /* max_low_pfn is 0, we already have early_res support */ - - max_low_pfn = max_pfn; - if (max_low_pfn > MAXMEM_PFN) { - if (highmem_pages == -1) - highmem_pages = max_pfn - MAXMEM_PFN; - if (highmem_pages + MAXMEM_PFN < max_pfn) - max_pfn = MAXMEM_PFN + highmem_pages; - if (highmem_pages + MAXMEM_PFN > max_pfn) { - printk(KERN_WARNING "only %luMB highmem pages " - "available, ignoring highmem size of %luMB.\n", - pages_to_mb(max_pfn - MAXMEM_PFN), + if (highmem_pages == -1) + highmem_pages = 0; +#ifdef CONFIG_HIGHMEM + if (highmem_pages >= max_pfn) { + printk(KERN_ERR MSG_HIGHMEM_TOO_BIG, + pages_to_mb(highmem_pages), pages_to_mb(max_pfn)); + highmem_pages = 0; + } + if (highmem_pages) { + if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) { + printk(KERN_ERR MSG_LOWMEM_TOO_SMALL, pages_to_mb(highmem_pages)); highmem_pages = 0; } - max_low_pfn = MAXMEM_PFN; + max_low_pfn -= highmem_pages; + } +#else + if (highmem_pages) + printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n"); +#endif +} + +#define MSG_HIGHMEM_TOO_SMALL \ + "only %luMB highmem pages available, ignoring highmem size of %luMB!\n" + +#define MSG_HIGHMEM_TRIMMED \ + "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n" +/* + * We have more RAM than fits into lowmem - we try to put it into + * highmem, also taking the highmem=x boot parameter into account: + */ +void __init highmem_pfn_init(void) +{ + if (highmem_pages == -1) + highmem_pages = max_pfn - MAXMEM_PFN; + + if (highmem_pages + MAXMEM_PFN < max_pfn) + max_pfn = MAXMEM_PFN + highmem_pages; + + if (highmem_pages + MAXMEM_PFN > max_pfn) { + printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL, + pages_to_mb(max_pfn - MAXMEM_PFN), + pages_to_mb(highmem_pages)); + highmem_pages = 0; + } + max_low_pfn = MAXMEM_PFN; #ifndef CONFIG_HIGHMEM - /* Maximum memory usable is what is directly addressable */ - printk(KERN_WARNING "Warning only %ldMB will be used.\n", - MAXMEM>>20); - if (max_pfn > MAX_NONPAE_PFN) - printk(KERN_WARNING - "Use a HIGHMEM64G enabled kernel.\n"); - else - printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); - max_pfn = MAXMEM_PFN; + /* Maximum memory usable is what is directly addressable */ + printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20); + if (max_pfn > MAX_NONPAE_PFN) + printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); + else + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); + max_pfn = MAXMEM_PFN; #else /* !CONFIG_HIGHMEM */ #ifndef CONFIG_HIGHMEM64G - if (max_pfn > MAX_NONPAE_PFN) { - max_pfn = MAX_NONPAE_PFN; - printk(KERN_WARNING "Warning only 4GB will be used." - "Use a HIGHMEM64G enabled kernel.\n"); - } + if (max_pfn > MAX_NONPAE_PFN) { + max_pfn = MAX_NONPAE_PFN; + printk(KERN_WARNING MSG_HIGHMEM_TRIMMED); + } #endif /* !CONFIG_HIGHMEM64G */ #endif /* !CONFIG_HIGHMEM */ - } else { - if (highmem_pages == -1) - highmem_pages = 0; -#ifdef CONFIG_HIGHMEM - if (highmem_pages >= max_pfn) { - printk(KERN_ERR "highmem size specified (%luMB) is " - "bigger than pages available (%luMB)!.\n", - pages_to_mb(highmem_pages), - pages_to_mb(max_pfn)); - highmem_pages = 0; - } - if (highmem_pages) { - if (max_low_pfn - highmem_pages < - 64*1024*1024/PAGE_SIZE){ - printk(KERN_ERR "highmem size %luMB results in " - "smaller than 64MB lowmem, ignoring it.\n" - , pages_to_mb(highmem_pages)); - highmem_pages = 0; - } - max_low_pfn -= highmem_pages; - } -#else - if (highmem_pages) - printk(KERN_ERR "ignoring highmem size on non-highmem" - " kernel!\n"); -#endif - } +} + +/* + * Determine low and high memory ranges: + */ +void __init find_low_pfn_range(void) +{ + /* it could update max_pfn */ + + /* max_low_pfn is 0, we already have early_res support */ + max_low_pfn = max_pfn; + + if (max_low_pfn > MAXMEM_PFN) + highmem_pfn_init(); + else + lowmem_pfn_init(); } #ifndef CONFIG_NEED_MULTIPLE_NODES -- cgit v1.2.3-70-g09d2 From d88316c243e5458a1888edbe0353c4dec6e61c73 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 12 Feb 2009 15:16:03 +0100 Subject: x86, 32-bit: refactor find_low_pfn_range() Impact: cleanup Make the max_low_pfn logic a bit more standard between lowmem_pfn_init() and highmem_pfn_init(). Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9d36eb9ebd5..1a9612499a3 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -687,6 +687,9 @@ early_param("highmem", parse_highmem); */ void __init lowmem_pfn_init(void) { + /* max_low_pfn is 0, we already have early_res support */ + max_low_pfn = max_pfn; + if (highmem_pages == -1) highmem_pages = 0; #ifdef CONFIG_HIGHMEM @@ -720,6 +723,8 @@ void __init lowmem_pfn_init(void) */ void __init highmem_pfn_init(void) { + max_low_pfn = MAXMEM_PFN; + if (highmem_pages == -1) highmem_pages = max_pfn - MAXMEM_PFN; @@ -732,7 +737,6 @@ void __init highmem_pfn_init(void) pages_to_mb(highmem_pages)); highmem_pages = 0; } - max_low_pfn = MAXMEM_PFN; #ifndef CONFIG_HIGHMEM /* Maximum memory usable is what is directly addressable */ printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20); @@ -758,13 +762,10 @@ void __init find_low_pfn_range(void) { /* it could update max_pfn */ - /* max_low_pfn is 0, we already have early_res support */ - max_low_pfn = max_pfn; - - if (max_low_pfn > MAXMEM_PFN) - highmem_pfn_init(); - else + if (max_pfn <= MAXMEM_PFN) lowmem_pfn_init(); + else + highmem_pfn_init(); } #ifndef CONFIG_NEED_MULTIPLE_NODES -- cgit v1.2.3-70-g09d2 From 7b6aa335ca1a845c2262ec7a595b4521bca0f79d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Feb 2009 13:58:15 +0100 Subject: x86, apic: remove genapic.h Impact: cleanup Remove genapic.h and remove all references to it. Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ipi.h | 2 +- arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/kernel/apic.c | 2 +- arch/x86/kernel/bigsmp_32.c | 2 +- arch/x86/kernel/cpu/addon_cpuid_features.c | 2 +- arch/x86/kernel/cpu/amd.c | 2 +- arch/x86/kernel/cpu/common.c | 4 ++-- arch/x86/kernel/cpu/intel.c | 2 +- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 2 +- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +- arch/x86/kernel/cpu/mcheck/p4.c | 2 +- arch/x86/kernel/crash.c | 2 +- arch/x86/kernel/es7000_32.c | 4 ++-- arch/x86/kernel/genapic_64.c | 2 +- arch/x86/kernel/genapic_flat_64.c | 2 +- arch/x86/kernel/genx2apic_cluster.c | 2 +- arch/x86/kernel/genx2apic_phys.c | 2 +- arch/x86/kernel/genx2apic_uv_x.c | 2 +- arch/x86/kernel/io_apic.c | 2 +- arch/x86/kernel/ipi.c | 2 +- arch/x86/kernel/irq.c | 2 +- arch/x86/kernel/irq_32.c | 2 +- arch/x86/kernel/kgdb.c | 2 +- arch/x86/kernel/mpparse.c | 2 +- arch/x86/kernel/nmi.c | 2 +- arch/x86/kernel/numaq_32.c | 4 ++-- arch/x86/kernel/probe_32.c | 8 ++++---- arch/x86/kernel/reboot.c | 2 +- arch/x86/kernel/setup.c | 2 +- arch/x86/kernel/smp.c | 2 +- arch/x86/kernel/smpboot.c | 4 ++-- arch/x86/kernel/summit_32.c | 2 +- arch/x86/kernel/tlb_uv.c | 4 ++-- arch/x86/kernel/uv_irq.c | 2 +- arch/x86/kernel/visws_quirks.c | 4 ++-- arch/x86/kernel/vmi_32.c | 2 +- arch/x86/kernel/vmiclock_32.c | 2 +- arch/x86/lguest/boot.c | 2 +- arch/x86/mm/srat_64.c | 2 +- arch/x86/mm/tlb.c | 2 +- arch/x86/oprofile/nmi_int.c | 2 +- arch/x86/oprofile/op_model_p4.c | 2 +- arch/x86/oprofile/op_model_ppro.c | 2 +- arch/x86/pci/numaq_32.c | 2 +- arch/x86/xen/enlighten.c | 2 +- 45 files changed, 54 insertions(+), 54 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index 5f2efc5d992..3395c680a97 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -123,7 +123,7 @@ extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector); extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, int vector); -#include +#include extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector); diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 956c1dee6fb..42814152c94 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index af494bad885..7db03a9b61d 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -36,7 +36,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/bigsmp_32.c b/arch/x86/kernel/bigsmp_32.c index 9eeb714c5de..72f4e534051 100644 --- a/arch/x86/kernel/bigsmp_32.c +++ b/arch/x86/kernel/bigsmp_32.c @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index e48640cfac0..6882a735d9c 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -7,7 +7,7 @@ #include #include -#include +#include struct cpuid_bit { u16 feature; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index ff4d7b9e32e..c94ba9311e6 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -12,7 +12,7 @@ # include #endif -#include +#include #include "cpu.h" diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4b5d13e472d..41f3788ec9b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -26,8 +26,8 @@ #ifdef CONFIG_X86_LOCAL_APIC #include #include -#include -#include +#include +#include #include #endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 1f137a87d4b..290f92e2b7c 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -24,7 +24,7 @@ #ifdef CONFIG_X86_LOCAL_APIC #include #include -#include +#include #endif static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index e22d6ed26e6..4772e91e824 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index 42f090702f0..5e8c79e748a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index f9c92b66dfb..9b60fce09f7 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index ad7f2a696f4..3340cc0f244 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -28,7 +28,7 @@ #include #include -#include +#include #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c index 23f1df4ce18..6cdfb3e4dc3 100644 --- a/arch/x86/kernel/es7000_32.c +++ b/arch/x86/kernel/es7000_32.c @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include /* @@ -387,7 +387,7 @@ void __init es7000_enable_apic_mode(void) #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index ef788635324..91cae6f6e73 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index 36ee760fd13..a7d84763648 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #ifdef CONFIG_ACPI diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c index dd6e8d68542..f5e02cffa26 100644 --- a/arch/x86/kernel/genx2apic_cluster.c +++ b/arch/x86/kernel/genx2apic_cluster.c @@ -7,7 +7,7 @@ #include #include -#include +#include #include DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c index eb1486bb002..11eb4cb7ca3 100644 --- a/arch/x86/kernel/genx2apic_phys.c +++ b/arch/x86/kernel/genx2apic_phys.c @@ -7,7 +7,7 @@ #include #include -#include +#include #include static int x2apic_phys; diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 9ae4a92fac8..c1746a198bd 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index a89878e08a4..00e6071cefc 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -62,7 +62,7 @@ #include #include -#include +#include #define __apicdebuginit(type) static type __init diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c index 1326272cae4..dbf5445727a 100644 --- a/arch/x86/kernel/ipi.c +++ b/arch/x86/kernel/ipi.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3957776b193..f13ca1650aa 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 4beb9a13873..e4ac7c80e2d 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -212,7 +212,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) } #ifdef CONFIG_HOTPLUG_CPU -#include +#include /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ void fixup_irqs(void) diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 5c4f5548384..eedfaebe106 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -46,7 +46,7 @@ #include #include -#include +#include /* * Put the error code here just in case the user cares: diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 20076445319..7f4d2586972 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -29,7 +29,7 @@ #include #include -#include +#include /* * Checksum an MP configuration block. */ diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 48b9ca5e088..bdfad80c3cf 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -11,7 +11,7 @@ * Mikael Pettersson : PM converted to driver model. Disable/enable API. */ -#include +#include #include #include diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index f0f0c2f0596..5a2d75d1fd4 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -31,7 +31,7 @@ #include #include -#include +#include #include #include #include @@ -301,7 +301,7 @@ int __init get_memcfg_numaq(void) #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/probe_32.c b/arch/x86/kernel/probe_32.c index b3d5d74e522..be0d554984a 100644 --- a/arch/x86/kernel/probe_32.c +++ b/arch/x86/kernel/probe_32.c @@ -17,20 +17,20 @@ #include #include #include -#include +#include #include #include #include #include -#include +#include #include #include #include #include #include #include -#include +#include #include #include @@ -41,7 +41,7 @@ #include #include -#include +#include #ifdef CONFIG_HOTPLUG_CPU #define DEFAULT_SEND_IPI (1) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 32e8f0af292..7c8cd447d5e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -24,7 +24,7 @@ # include #endif -#include +#include /* * Power off function, if any diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 43d964411c0..deaafd2693e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -97,7 +97,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index eaaffae31cc..13f33ea8cca 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include /* * Some notes on x86 processor bugs affecting SMP operation: * diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b5f2b698973..562a9fc3bc3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -60,12 +60,12 @@ #include #include #include -#include +#include #include #include #include -#include +#include #include #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c index 1cf32c325d1..eb31ba276bb 100644 --- a/arch/x86/kernel/summit_32.c +++ b/arch/x86/kernel/summit_32.c @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index f396e61bcb3..1a7dfa7cb52 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -15,12 +15,12 @@ #include #include #include -#include +#include #include #include #include -#include +#include static struct bau_control **uv_bau_table_bases __read_mostly; static int uv_bau_retry_limit __read_mostly; diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index 75eb5ec5dd2..aeef529917e 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -11,7 +11,7 @@ #include #include -#include +#include #include static void uv_noop(unsigned int irq) diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 4fd646e6dd4..5264fea6c28 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -32,9 +32,9 @@ #include #include -#include +#include -#include +#include #include diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index a1c7b71dc0d..2cc4a90e2cb 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 2a5e0e6a7c0..a4791ef412d 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index bc9893f2c38..f3a5305b8ad 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -55,7 +55,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 15df1baee10..574c8bc95ef 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include int acpi_numa __initdata; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 14c5af4d11e..b641349fe07 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -14,7 +14,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { &init_mm, 0, }; -#include +#include /* * Smarter SMP flushing macros. * c/o Linus Torvalds. diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index a32a5c7a8ef..202864ad49a 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include "op_counter.h" #include "op_x86_model.h" diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 09a237bc9ef..4c4a51c90bc 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include "op_x86_model.h" diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 5ebd8f605d7..e9f80c744cf 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c index 5601e829c38..8eb295e116f 100644 --- a/arch/x86/pci/numaq_32.c +++ b/arch/x86/pci/numaq_32.c @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e3dd3fb6729..86497d5f44c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -36,7 +36,7 @@ #include #include -#include +#include #include #include #include -- cgit v1.2.3-70-g09d2 From e641f5f525acb163ba71d92de79c9c7366deae03 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Feb 2009 14:02:01 +0100 Subject: x86, apic: remove duplicate asm/apic.h inclusions Impact: cleanup Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ipi.h | 2 -- arch/x86/kernel/acpi/boot.c | 1 - arch/x86/kernel/apic.c | 1 - arch/x86/kernel/cpu/amd.c | 2 -- arch/x86/kernel/cpu/common.c | 6 ++---- arch/x86/kernel/cpu/intel.c | 1 - arch/x86/kernel/crash.c | 2 -- arch/x86/kernel/es7000_32.c | 1 - arch/x86/kernel/irq_32.c | 1 - arch/x86/kernel/numaq_32.c | 1 - arch/x86/kernel/probe_32.c | 4 ---- arch/x86/kernel/reboot.c | 2 -- arch/x86/kernel/setup.c | 1 - arch/x86/kernel/smpboot.c | 1 - arch/x86/kernel/summit_32.c | 1 - arch/x86/kernel/tlb_uv.c | 2 -- arch/x86/kernel/visws_quirks.c | 7 +------ arch/x86/mm/tlb.c | 1 - 18 files changed, 3 insertions(+), 34 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index 3395c680a97..0b7228268a6 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -123,8 +123,6 @@ extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector); extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, int vector); -#include - extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector); extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 42814152c94..a18eb7ce223 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 7db03a9b61d..c12823eb55b 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -36,7 +36,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index c94ba9311e6..25423a5b80e 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -12,8 +12,6 @@ # include #endif -#include - #include "cpu.h" #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 41f3788ec9b..826d5c87627 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -23,11 +23,9 @@ #include #include #include -#ifdef CONFIG_X86_LOCAL_APIC -#include -#include -#include #include + +#ifdef CONFIG_X86_LOCAL_APIC #include #endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 290f92e2b7c..7aeef1d327b 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -24,7 +24,6 @@ #ifdef CONFIG_X86_LOCAL_APIC #include #include -#include #endif static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 3340cc0f244..ff958248e61 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -28,8 +28,6 @@ #include #include -#include - #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c index 6cdfb3e4dc3..352d870e5d5 100644 --- a/arch/x86/kernel/es7000_32.c +++ b/arch/x86/kernel/es7000_32.c @@ -387,7 +387,6 @@ void __init es7000_enable_apic_mode(void) #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index e4ac7c80e2d..9dc6b2b2427 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -212,7 +212,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) } #ifdef CONFIG_HOTPLUG_CPU -#include /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ void fixup_irqs(void) diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index 5a2d75d1fd4..40400a58845 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -301,7 +301,6 @@ int __init get_memcfg_numaq(void) #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/probe_32.c b/arch/x86/kernel/probe_32.c index be0d554984a..fd1352ac909 100644 --- a/arch/x86/kernel/probe_32.c +++ b/arch/x86/kernel/probe_32.c @@ -23,14 +23,12 @@ #include #include #include -#include #include #include #include #include #include #include -#include #include #include @@ -41,8 +39,6 @@ #include #include -#include - #ifdef CONFIG_HOTPLUG_CPU #define DEFAULT_SEND_IPI (1) #else diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 7c8cd447d5e..1cc18d439bb 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -24,8 +24,6 @@ # include #endif -#include - /* * Power off function, if any */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index deaafd2693e..b2da0b1d15e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -97,7 +97,6 @@ #include #include -#include #include #include diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 562a9fc3bc3..09e73876a44 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -65,7 +65,6 @@ #include #include -#include #include #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c index eb31ba276bb..577b0bd8e53 100644 --- a/arch/x86/kernel/summit_32.c +++ b/arch/x86/kernel/summit_32.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 1a7dfa7cb52..f04549afcfe 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -20,8 +20,6 @@ #include #include -#include - static struct bau_control **uv_bau_table_bases __read_mostly; static int uv_bau_retry_limit __read_mostly; diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 5264fea6c28..34199d30ff4 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -29,13 +29,10 @@ #include #include #include +#include #include #include -#include - -#include - #include #include @@ -49,8 +46,6 @@ extern int no_broadcast; -#include - char visws_board_type = -1; char visws_board_rev = -1; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index b641349fe07..a654d59e448 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -14,7 +14,6 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { &init_mm, 0, }; -#include /* * Smarter SMP flushing macros. * c/o Linus Torvalds. -- cgit v1.2.3-70-g09d2 From 11124411aa95827404d6bfdfc14c908e1b54513c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:09 +0900 Subject: x86: convert to the new dynamic percpu allocator Impact: use new dynamic allocator, unified access to static/dynamic percpu memory Convert to the new dynamic percpu allocator. * implement populate_extra_pte() for both 32 and 64 * update setup_per_cpu_areas() to use pcpu_setup_static() * define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() * define config HAVE_DYNAMIC_PER_CPU_AREA Signed-off-by: Tejun Heo --- arch/x86/Kconfig | 3 ++ arch/x86/include/asm/percpu.h | 8 ++++++ arch/x86/include/asm/pgtable.h | 1 + arch/x86/kernel/setup_percpu.c | 62 +++++++++++++++++++++++++++--------------- arch/x86/mm/init_32.c | 10 +++++++ arch/x86/mm/init_64.c | 19 +++++++++++++ 6 files changed, 81 insertions(+), 22 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f760a22f95d..d3f6eadfd4b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -135,6 +135,9 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y +config HAVE_DYNAMIC_PER_CPU_AREA + def_bool y + config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index aee103b26d0..8f1d2fbec1d 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -43,6 +43,14 @@ #else /* ...!ASSEMBLY */ #include +#include + +#define __addr_to_pcpu_ptr(addr) \ + (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ + + (unsigned long)__per_cpu_start) +#define __pcpu_ptr_to_addr(ptr) \ + (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ + - (unsigned long)__per_cpu_start) #ifdef CONFIG_SMP #define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 6f7c102018b..dd91c2515c6 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -402,6 +402,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, /* Install a pte for a particular vaddr in kernel space. */ void set_pte_vaddr(unsigned long vaddr, pte_t pte); +void populate_extra_pte(unsigned long vaddr); #ifdef CONFIG_X86_32 extern void native_pagetable_setup_start(pgd_t *base); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index d992e6cff73..2dce4355821 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -61,38 +61,56 @@ static inline void setup_percpu_segment(int cpu) */ void __init setup_per_cpu_areas(void) { - ssize_t size; - char *ptr; - int cpu; - - /* Copy section for each CPU (we discard the original) */ - size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE); + ssize_t size = __per_cpu_end - __per_cpu_start; + unsigned int nr_cpu_pages = DIV_ROUND_UP(size, PAGE_SIZE); + static struct page **pages; + size_t pages_size; + unsigned int cpu, i, j; + unsigned long delta; + size_t pcpu_unit_size; pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); + pr_info("PERCPU: Allocating %zd bytes for static per cpu data\n", size); - pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size); + pages_size = nr_cpu_pages * num_possible_cpus() * sizeof(pages[0]); + pages = alloc_bootmem(pages_size); + j = 0; for_each_possible_cpu(cpu) { + void *ptr; + + for (i = 0; i < nr_cpu_pages; i++) { #ifndef CONFIG_NEED_MULTIPLE_NODES - ptr = alloc_bootmem_pages(size); + ptr = alloc_bootmem_pages(PAGE_SIZE); #else - int node = early_cpu_to_node(cpu); - if (!node_online(node) || !NODE_DATA(node)) { - ptr = alloc_bootmem_pages(size); - pr_info("cpu %d has no node %d or node-local memory\n", - cpu, node); - pr_debug("per cpu data for cpu%d at %016lx\n", - cpu, __pa(ptr)); - } else { - ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); - pr_debug("per cpu data for cpu%d on node%d at %016lx\n", - cpu, node, __pa(ptr)); - } + int node = early_cpu_to_node(cpu); + + if (!node_online(node) || !NODE_DATA(node)) { + ptr = alloc_bootmem_pages(PAGE_SIZE); + pr_info("cpu %d has no node %d or node-local " + "memory\n", cpu, node); + pr_debug("per cpu data for cpu%d at %016lx\n", + cpu, __pa(ptr)); + } else { + ptr = alloc_bootmem_pages_node(NODE_DATA(node), + PAGE_SIZE); + pr_debug("per cpu data for cpu%d on node%d " + "at %016lx\n", cpu, node, __pa(ptr)); + } #endif + memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); + pages[j++] = virt_to_page(ptr); + } + } + + pcpu_unit_size = pcpu_setup_static(populate_extra_pte, pages, size); - memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start); - per_cpu_offset(cpu) = ptr - __per_cpu_start; + free_bootmem(__pa(pages), pages_size); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) { + per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 00263bf07a8..8b1a0ef7f87 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -137,6 +137,16 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) return pte_offset_kernel(pmd, 0); } +void __init populate_extra_pte(unsigned long vaddr) +{ + int pgd_idx = pgd_index(vaddr); + int pmd_idx = pmd_index(vaddr); + pmd_t *pmd; + + pmd = one_md_table_init(swapper_pg_dir + pgd_idx); + one_page_table_init(pmd + pmd_idx); +} + static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, unsigned long vaddr, pte_t *lastpte) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index e6d36b49025..7f91e2cdc4c 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -223,6 +223,25 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval) set_pte_vaddr_pud(pud_page, vaddr, pteval); } +void __init populate_extra_pte(unsigned long vaddr) +{ + pgd_t *pgd; + pud_t *pud; + + pgd = pgd_offset_k(vaddr); + if (pgd_none(*pgd)) { + pud = (pud_t *)spp_getpage(); + pgd_populate(&init_mm, pgd, pud); + if (pud != pud_offset(pgd, 0)) { + printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", + pud, pud_offset(pgd, 0)); + return; + } + } + + set_pte_vaddr_pud((pud_t *)pgd_page_vaddr(*pgd), vaddr, __pte(0)); +} + /* * Create large page table mappings for a range of physical addresses. */ -- cgit v1.2.3-70-g09d2 From 3c3e5694add02e665bbbd0fecfbbdcc0b903097a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Feb 2009 11:46:36 -0500 Subject: x86: check PMD in spurious_fault handler Impact: fix to prevent hard lockup on bad PMD permissions If the PMD does not have the correct permissions for a page access, but the PTE does, the spurious fault handler will mistake the fault as a lazy TLB transaction. This will result in an infinite loop of: fault -> spurious_fault check (pass) -> return to code -> fault This patch adds a check and a warn on if the PTE passes the permissions but the PMD does not. [ Updated: Ingo Molnar suggested using WARN_ONCE with some text ] Signed-off-by: Steven Rostedt --- arch/x86/mm/fault.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c76ef1d701c..278d645d108 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -455,6 +455,7 @@ static int spurious_fault(unsigned long address, pud_t *pud; pmd_t *pmd; pte_t *pte; + int ret; /* Reserved-bit violation or user access to kernel space? */ if (error_code & (PF_USER | PF_RSVD)) @@ -482,7 +483,17 @@ static int spurious_fault(unsigned long address, if (!pte_present(*pte)) return 0; - return spurious_fault_check(error_code, pte); + ret = spurious_fault_check(error_code, pte); + if (!ret) + return 0; + + /* + * Make sure we have permissions in PMD + * If not, then there's a bug in the page tables. + */ + ret = spurious_fault_check(error_code, (pte_t *) pmd); + WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); + return ret; } /* -- cgit v1.2.3-70-g09d2 From 7a5714e0186030676d79a7b4b9830c8e45c3b0a1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 17:44:21 +0100 Subject: x86, pat: add large-PAT check to split_large_page() Impact: future-proof the split_large_page() function Linus noticed that split_large_page() is not safe wrt. the PAT bit: it is bit 12 on the 1GB and 2MB page table level (_PAGE_BIT_PAT_LARGE), and it is bit 7 on the 4K page table level (_PAGE_BIT_PAT). Currently it is not a problem because we never set _PAGE_BIT_PAT_LARGE on any of the large-page mappings - but should this happen in the future the split_large_page() would silently lift bit 12 into the lowlevel 4K pte and would start corrupting the physical page frame offset. Not fun. So add a debug warning, to make sure if something ever sets the PAT bit then this function gets updated too. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 7be47d1a97e..8253bc97587 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -482,6 +482,13 @@ static int split_large_page(pte_t *kpte, unsigned long address) pbase = (pte_t *)page_address(base); paravirt_alloc_pte(&init_mm, page_to_pfn(base)); ref_prot = pte_pgprot(pte_clrhuge(*kpte)); + /* + * If we ever want to utilize the PAT bit, we need to + * update this function to make sure it's converted from + * bit 12 to bit 7 when we cross from the 2MB level to + * the 4K level: + */ + WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE); #ifdef CONFIG_X86_64 if (level == PG_LEVEL_1G) { -- cgit v1.2.3-70-g09d2 From 2d4a71676f4d89418a0d53e60b89e8b804b390b2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 19:56:40 +0100 Subject: x86, mm: fault.c cleanup Impact: cleanup, no code changed Clean up various small details, which can be correctness checked automatically: - tidy up the include file section - eliminate unnecessary includes - introduce show_signal_msg() to clean up code flow - standardize the code flow - standardize comments and other style details - more cleanups, pointed out by checkpatch No code changed on either 32-bit nor 64-bit: arch/x86/mm/fault.o: text data bss dec hex filename 4632 32 24 4688 1250 fault.o.before 4632 32 24 4688 1250 fault.o.after the md5 changed due to a change in a single instruction: 2e8a8241e7f0d69706776a5a26c90bc0 fault.o.before.asm c5c3d36e725586eb74f0e10692f0193e fault.o.after.asm Because a __LINE__ reference in a WARN_ONCE() has changed. On 32-bit a few stack offsets changed - no code size difference nor any functionality difference. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 546 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 331 insertions(+), 215 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e4b9fc5001c..351d679bf97 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1,56 +1,59 @@ /* * Copyright (C) 1995 Linus Torvalds - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. + * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include /* For unblank_screen() */ +#include +#include #include #include -#include /* for max_low_pfn */ -#include -#include #include #include +#include +#include +#include +#include +#include +#include +#include #include +#include #include +#include +#include +#include +#include +#include +#include +#include + +#include -#include -#include -#include -#include -#include #include +#include +#include +#include #include -#include #include +#include /* - * Page fault error code bits - * bit 0 == 0 means no page found, 1 means protection fault - * bit 1 == 0 means read, 1 means write - * bit 2 == 0 means kernel, 1 means user-mode - * bit 3 == 1 means use of reserved bit detected - * bit 4 == 1 means fault was an instruction fetch + * Page fault error code bits: + * + * bit 0 == 0: no page found 1: protection fault + * bit 1 == 0: read access 1: write access + * bit 2 == 0: kernel-mode access 1: user-mode access + * bit 3 == 1: use of reserved bit detected + * bit 4 == 1: fault was an instruction fetch */ -#define PF_PROT (1<<0) -#define PF_WRITE (1<<1) -#define PF_USER (1<<2) -#define PF_RSVD (1<<3) -#define PF_INSTR (1<<4) +enum x86_pf_error_code { + + PF_PROT = 1 << 0, + PF_WRITE = 1 << 1, + PF_USER = 1 << 2, + PF_RSVD = 1 << 3, + PF_INSTR = 1 << 4, +}; static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { @@ -82,23 +85,27 @@ static inline int notify_page_fault(struct pt_regs *regs) } /* - * X86_32 - * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. - * Check that here and ignore it. + * Prefetch quirks: + * + * 32-bit mode: + * + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. + * Check that here and ignore it. * - * X86_64 - * Sometimes the CPU reports invalid exceptions on prefetch. - * Check that here and ignore it. + * 64-bit mode: * - * Opcode checker based on code by Richard Brunner + * Sometimes the CPU reports invalid exceptions on prefetch. + * Check that here and ignore it. + * + * Opcode checker based on code by Richard Brunner. */ -static int is_prefetch(struct pt_regs *regs, unsigned long error_code, - unsigned long addr) +static int +is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) { + unsigned char *max_instr; unsigned char *instr; int scan_more = 1; int prefetch = 0; - unsigned char *max_instr; /* * If it was a exec (instruction fetch) fault on NX page, then @@ -114,9 +121,9 @@ static int is_prefetch(struct pt_regs *regs, unsigned long error_code, return 0; while (scan_more && instr < max_instr) { - unsigned char opcode; unsigned char instr_hi; unsigned char instr_lo; + unsigned char opcode; if (probe_kernel_address(instr, opcode)) break; @@ -173,15 +180,17 @@ static int is_prefetch(struct pt_regs *regs, unsigned long error_code, return prefetch; } -static void force_sig_info_fault(int si_signo, int si_code, - unsigned long address, struct task_struct *tsk) +static void +force_sig_info_fault(int si_signo, int si_code, unsigned long address, + struct task_struct *tsk) { siginfo_t info; - info.si_signo = si_signo; - info.si_errno = 0; - info.si_code = si_code; - info.si_addr = (void __user *)address; + info.si_signo = si_signo; + info.si_errno = 0; + info.si_code = si_code; + info.si_addr = (void __user *)address; + force_sig_info(si_signo, &info, tsk); } @@ -189,6 +198,7 @@ static void force_sig_info_fault(int si_signo, int si_code, static int bad_address(void *p) { unsigned long dummy; + return probe_kernel_address((unsigned long *)p, dummy); } #endif @@ -200,13 +210,14 @@ static void dump_pagetable(unsigned long address) page = read_cr3(); page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; + #ifdef CONFIG_X86_PAE printk("*pdpt = %016Lx ", page); if ((page >> PAGE_SHIFT) < max_low_pfn && page & _PAGE_PRESENT) { page &= PAGE_MASK; page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) - & (PTRS_PER_PMD - 1)]; + & (PTRS_PER_PMD - 1)]; printk(KERN_CONT "*pde = %016Lx ", page); page &= ~_PAGE_NX; } @@ -218,14 +229,15 @@ static void dump_pagetable(unsigned long address) * We must not directly access the pte in the highpte * case if the page table is located in highmem. * And let's rather not kmap-atomic the pte, just in case - * it's allocated already. + * it's allocated already: */ if ((page >> PAGE_SHIFT) < max_low_pfn && (page & _PAGE_PRESENT) && !(page & _PAGE_PSE)) { + page &= PAGE_MASK; page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) - & (PTRS_PER_PTE - 1)]; + & (PTRS_PER_PTE - 1)]; printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); } @@ -239,26 +251,38 @@ static void dump_pagetable(unsigned long address) pgd = (pgd_t *)read_cr3(); pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); + pgd += pgd_index(address); - if (bad_address(pgd)) goto bad; + if (bad_address(pgd)) + goto bad; + printk("PGD %lx ", pgd_val(*pgd)); - if (!pgd_present(*pgd)) goto ret; + + if (!pgd_present(*pgd)) + goto out; pud = pud_offset(pgd, address); - if (bad_address(pud)) goto bad; + if (bad_address(pud)) + goto bad; + printk("PUD %lx ", pud_val(*pud)); if (!pud_present(*pud) || pud_large(*pud)) - goto ret; + goto out; pmd = pmd_offset(pud, address); - if (bad_address(pmd)) goto bad; + if (bad_address(pmd)) + goto bad; + printk("PMD %lx ", pmd_val(*pmd)); - if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; + if (!pmd_present(*pmd) || pmd_large(*pmd)) + goto out; pte = pte_offset_kernel(pmd, address); - if (bad_address(pte)) goto bad; + if (bad_address(pte)) + goto bad; + printk("PTE %lx", pte_val(*pte)); -ret: +out: printk("\n"); return; bad: @@ -285,7 +309,6 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) * and redundant with the set_pmd() on non-PAE. As would * set_pud. */ - pud = pud_offset(pgd, address); pud_k = pud_offset(pgd_k, address); if (!pud_present(*pud_k)) @@ -295,11 +318,14 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) pmd_k = pmd_offset(pud_k, address); if (!pmd_present(*pmd_k)) return NULL; + if (!pmd_present(*pmd)) { set_pmd(pmd, *pmd_k); arch_flush_lazy_mmu_mode(); - } else + } else { BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + } + return pmd_k; } #endif @@ -312,29 +338,37 @@ KERN_ERR "******* Please consider a BIOS update.\n" KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; #endif -/* Workaround for K8 erratum #93 & buggy BIOS. - BIOS SMM functions are required to use a specific workaround - to avoid corruption of the 64bit RIP register on C stepping K8. - A lot of BIOS that didn't get tested properly miss this. - The OS sees this as a page fault with the upper 32bits of RIP cleared. - Try to work around it here. - Note we only handle faults in kernel here. - Does nothing for X86_32 +/* + * Workaround for K8 erratum #93 & buggy BIOS. + * + * BIOS SMM functions are required to use a specific workaround + * to avoid corruption of the 64bit RIP register on C stepping K8. + * + * A lot of BIOS that didn't get tested properly miss this. + * + * The OS sees this as a page fault with the upper 32bits of RIP cleared. + * Try to work around it here. + * + * Note we only handle faults in kernel here. + * Does nothing on 32-bit. */ static int is_errata93(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 - static int warned; + static int once; + if (address != regs->ip) return 0; + if ((address >> 32) != 0) return 0; + address |= 0xffffffffUL << 32; if ((address >= (u64)_stext && address <= (u64)_etext) || (address >= MODULES_VADDR && address <= MODULES_END)) { - if (!warned) { + if (!once) { printk(errata93_warning); - warned = 1; + once = 1; } regs->ip = address; return 1; @@ -344,16 +378,17 @@ static int is_errata93(struct pt_regs *regs, unsigned long address) } /* - * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal - * addresses >4GB. We catch this in the page fault handler because these - * addresses are not reachable. Just detect this case and return. Any code + * Work around K8 erratum #100 K8 in compat mode occasionally jumps + * to illegal addresses >4GB. + * + * We catch this in the page fault handler because these addresses + * are not reachable. Just detect this case and return. Any code * segment in LDT is compatibility mode. */ static int is_errata100(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 - if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && - (address >> 32)) + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) return 1; #endif return 0; @@ -363,8 +398,9 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_F00F_BUG unsigned long nr; + /* - * Pentium F0 0F C7 C8 bug workaround. + * Pentium F0 0F C7 C8 bug workaround: */ if (boot_cpu_data.f00f_bug) { nr = (address - idt_descr.address) >> 3; @@ -378,8 +414,9 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) return 0; } -static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, - unsigned long address) +static void +show_fault_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { #ifdef CONFIG_X86_32 if (!oops_may_print()) @@ -389,12 +426,14 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, #ifdef CONFIG_X86_PAE if (error_code & PF_INSTR) { unsigned int level; + pte_t *pte = lookup_address(address, &level); - if (pte && pte_present(*pte) && !pte_exec(*pte)) + if (pte && pte_present(*pte) && !pte_exec(*pte)) { printk(KERN_CRIT "kernel tried to execute " "NX-protected page - exploit attempt? " "(uid: %d)\n", current_uid()); + } } #endif @@ -403,34 +442,45 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, printk(KERN_CONT "NULL pointer dereference"); else printk(KERN_CONT "paging request"); + printk(KERN_CONT " at %p\n", (void *) address); printk(KERN_ALERT "IP:"); printk_address(regs->ip, 1); + dump_pagetable(address); } #ifdef CONFIG_X86_64 -static noinline void pgtable_bad(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +pgtable_bad(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { - unsigned long flags = oops_begin(); - int sig = SIGKILL; - struct task_struct *tsk = current; + struct task_struct *tsk; + unsigned long flags; + int sig; + + flags = oops_begin(); + tsk = current; + sig = SIGKILL; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", tsk->comm, address); dump_pagetable(address); - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; + + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + if (__die("Bad pagetable", regs, error_code)) sig = 0; + oops_end(flags, regs, sig); } #endif -static noinline void no_context(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +no_context(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { struct task_struct *tsk = current; unsigned long *stackend; @@ -440,18 +490,20 @@ static noinline void no_context(struct pt_regs *regs, int sig; #endif - /* Are we prepared to handle this kernel fault? */ + /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) return; /* - * X86_32 - * Valid to do another page fault here, because if this fault - * had been triggered by is_prefetch fixup_exception would have - * handled it. + * 32-bit: + * + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. + * + * 64-bit: * - * X86_64 - * Hall of shame of CPU/BIOS bugs. + * Hall of shame of CPU/BIOS bugs. */ if (is_prefetch(regs, error_code, address)) return; @@ -461,7 +513,7 @@ static noinline void no_context(struct pt_regs *regs, /* * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. + * terminate things with extreme prejudice: */ #ifdef CONFIG_X86_32 bust_spinlocks(1); @@ -471,7 +523,7 @@ static noinline void no_context(struct pt_regs *regs, show_fault_oops(regs, error_code, address); - stackend = end_of_stack(tsk); + stackend = end_of_stack(tsk); if (*stackend != STACK_END_MAGIC) printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); @@ -487,28 +539,54 @@ static noinline void no_context(struct pt_regs *regs, sig = SIGKILL; if (__die("Oops", regs, error_code)) sig = 0; + /* Executive summary in case the body of the oops scrolled away */ printk(KERN_EMERG "CR2: %016lx\n", address); + oops_end(flags, regs, sig); #endif } -static void __bad_area_nosemaphore(struct pt_regs *regs, - unsigned long error_code, unsigned long address, - int si_code) +/* + * Print out info about fatal segfaults, if the show_unhandled_signals + * sysctl is set: + */ +static inline void +show_signal_msg(struct pt_regs *regs, unsigned long error_code, + unsigned long address, struct task_struct *tsk) +{ + if (!unhandled_signal(tsk, SIGSEGV)) + return; + + if (!printk_ratelimit()) + return; + + printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), address, + (void *)regs->ip, (void *)regs->sp, error_code); + + print_vma_addr(KERN_CONT " in ", regs->ip); + + printk(KERN_CONT "\n"); +} + +static void +__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int si_code) { struct task_struct *tsk = current; /* User mode accesses just cause a SIGSEGV */ if (error_code & PF_USER) { /* - * It's possible to have interrupts off here. + * It's possible to have interrupts off here: */ local_irq_enable(); /* * Valid to do another page fault here because this one came - * from user space. + * from user space: */ if (is_prefetch(regs, error_code, address)) return; @@ -516,22 +594,16 @@ static void __bad_area_nosemaphore(struct pt_regs *regs, if (is_errata100(regs, address)) return; - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk( - "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", - task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, task_pid_nr(tsk), address, - (void *) regs->ip, (void *) regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } + if (unlikely(show_unhandled_signals)) + show_signal_msg(regs, error_code, address, tsk); + + /* Kernel addresses are always protection faults: */ + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; force_sig_info_fault(SIGSEGV, si_code, address, tsk); + return; } @@ -541,15 +613,16 @@ static void __bad_area_nosemaphore(struct pt_regs *regs, no_context(regs, error_code, address); } -static noinline void bad_area_nosemaphore(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); } -static void __bad_area(struct pt_regs *regs, - unsigned long error_code, unsigned long address, - int si_code) +static void +__bad_area(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int si_code) { struct mm_struct *mm = current->mm; @@ -562,67 +635,77 @@ static void __bad_area(struct pt_regs *regs, __bad_area_nosemaphore(regs, error_code, address, si_code); } -static noinline void bad_area(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) { __bad_area(regs, error_code, address, SEGV_MAPERR); } -static noinline void bad_area_access_error(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +bad_area_access_error(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { __bad_area(regs, error_code, address, SEGV_ACCERR); } /* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ -static void out_of_memory(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static void +out_of_memory(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { /* * We ran out of memory, call the OOM killer, and return the userspace - * (which will retry the fault, or kill us if we got oom-killed). + * (which will retry the fault, or kill us if we got oom-killed): */ up_read(¤t->mm->mmap_sem); + pagefault_out_of_memory(); } -static void do_sigbus(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static void +do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; up_read(&mm->mmap_sem); - /* Kernel mode? Handle exceptions or die */ + /* Kernel mode? Handle exceptions or die: */ if (!(error_code & PF_USER)) no_context(regs, error_code, address); + #ifdef CONFIG_X86_32 - /* User space => ok to do another page fault */ + /* User space => ok to do another page fault: */ if (is_prefetch(regs, error_code, address)) return; #endif - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; + + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); } -static noinline void mm_fault_error(struct pt_regs *regs, - unsigned long error_code, unsigned long address, unsigned int fault) +static noinline void +mm_fault_error(struct pt_regs *regs, unsigned long error_code, + unsigned long address, unsigned int fault) { - if (fault & VM_FAULT_OOM) + if (fault & VM_FAULT_OOM) { out_of_memory(regs, error_code, address); - else if (fault & VM_FAULT_SIGBUS) - do_sigbus(regs, error_code, address); - else - BUG(); + } else { + if (fault & VM_FAULT_SIGBUS) + do_sigbus(regs, error_code, address); + else + BUG(); + } } static int spurious_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & PF_WRITE) && !pte_write(*pte)) return 0; + if ((error_code & PF_INSTR) && !pte_exec(*pte)) return 0; @@ -630,16 +713,19 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) } /* - * Handle a spurious fault caused by a stale TLB entry. This allows - * us to lazily refresh the TLB when increasing the permissions of a - * kernel page (RO -> RW or NX -> X). Doing it eagerly is very - * expensive since that implies doing a full cross-processor TLB - * flush, even if no stale TLB entries exist on other processors. + * Handle a spurious fault caused by a stale TLB entry. + * + * This allows us to lazily refresh the TLB when increasing the + * permissions of a kernel page (RO -> RW or NX -> X). Doing it + * eagerly is very expensive since that implies doing a full + * cross-processor TLB flush, even if no stale TLB entries exist + * on other processors. + * * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. */ -static noinline int spurious_fault(unsigned long error_code, - unsigned long address) +static noinline int +spurious_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; pud_t *pud; @@ -678,20 +764,23 @@ static noinline int spurious_fault(unsigned long error_code, return 0; /* - * Make sure we have permissions in PMD - * If not, then there's a bug in the page tables. + * Make sure we have permissions in PMD. + * If not, then there's a bug in the page tables: */ ret = spurious_fault_check(error_code, (pte_t *) pmd); WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); + return ret; } /* - * X86_32 - * Handle a fault on the vmalloc or module mapping area + * 32-bit: + * + * Handle a fault on the vmalloc or module mapping area * - * X86_64 - * Handle a fault on the vmalloc area + * 64-bit: + * + * Handle a fault on the vmalloc area * * This assumes no large pages in there. */ @@ -702,7 +791,7 @@ static noinline int vmalloc_fault(unsigned long address) pmd_t *pmd_k; pte_t *pte_k; - /* Make sure we are in vmalloc area */ + /* Make sure we are in vmalloc area: */ if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; @@ -717,9 +806,11 @@ static noinline int vmalloc_fault(unsigned long address) pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); if (!pmd_k) return -1; + pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) return -1; + return 0; #else pgd_t *pgd, *pgd_ref; @@ -727,69 +818,84 @@ static noinline int vmalloc_fault(unsigned long address) pmd_t *pmd, *pmd_ref; pte_t *pte, *pte_ref; - /* Make sure we are in vmalloc area */ + /* Make sure we are in vmalloc area: */ if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; - /* Copy kernel mappings over when needed. This can also - happen within a race in page table update. In the later - case just flush. */ - + /* + * Copy kernel mappings over when needed. This can also + * happen within a race in page table update. In the later + * case just flush: + */ pgd = pgd_offset(current->active_mm, address); pgd_ref = pgd_offset_k(address); if (pgd_none(*pgd_ref)) return -1; + if (pgd_none(*pgd)) set_pgd(pgd, *pgd_ref); else BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - /* Below here mismatches are bugs because these lower tables - are shared */ + /* + * Below here mismatches are bugs because these lower tables + * are shared: + */ pud = pud_offset(pgd, address); pud_ref = pud_offset(pgd_ref, address); if (pud_none(*pud_ref)) return -1; + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) BUG(); + pmd = pmd_offset(pud, address); pmd_ref = pmd_offset(pud_ref, address); if (pmd_none(*pmd_ref)) return -1; + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) BUG(); + pte_ref = pte_offset_kernel(pmd_ref, address); if (!pte_present(*pte_ref)) return -1; + pte = pte_offset_kernel(pmd, address); - /* Don't use pte_page here, because the mappings can point - outside mem_map, and the NUMA hash lookup cannot handle - that. */ + + /* + * Don't use pte_page here, because the mappings can point + * outside mem_map, and the NUMA hash lookup cannot handle + * that: + */ if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) BUG(); + return 0; #endif } int show_unhandled_signals = 1; -static inline int access_error(unsigned long error_code, int write, - struct vm_area_struct *vma) +static inline int +access_error(unsigned long error_code, int write, struct vm_area_struct *vma) { if (write) { - /* write, present and write, not present */ + /* write, present and write, not present: */ if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; - } else if (unlikely(error_code & PF_PROT)) { - /* read, present */ - return 1; - } else { - /* read, not present */ - if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) - return 1; + return 0; } + /* read, present: */ + if (unlikely(error_code & PF_PROT)) + return 1; + + /* read, not present: */ + if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + return 1; + return 0; } @@ -797,9 +903,9 @@ static int fault_in_kernel_space(unsigned long address) { #ifdef CONFIG_X86_32 return address >= TASK_SIZE; -#else /* !CONFIG_X86_32 */ +#else return address >= TASK_SIZE64; -#endif /* CONFIG_X86_32 */ +#endif } /* @@ -812,18 +918,19 @@ asmlinkage #endif void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) { - unsigned long address; + struct vm_area_struct *vma; struct task_struct *tsk; + unsigned long address; struct mm_struct *mm; - struct vm_area_struct *vma; int write; int fault; tsk = current; mm = tsk->mm; + prefetchw(&mm->mmap_sem); - /* get the address */ + /* Get the faulting address: */ address = read_cr2(); if (unlikely(kmmio_fault(regs, address))) @@ -847,22 +954,23 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) vmalloc_fault(address) >= 0) return; - /* Can handle a stale RO->RW TLB */ + /* Can handle a stale RO->RW TLB: */ if (spurious_fault(error_code, address)) return; - /* kprobes don't want to hook the spurious faults. */ + /* kprobes don't want to hook the spurious faults: */ if (notify_page_fault(regs)) return; /* * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock. + * fault we could otherwise deadlock: */ bad_area_nosemaphore(regs, error_code, address); + return; } - /* kprobes don't want to hook the spurious faults. */ + /* kprobes don't want to hook the spurious faults: */ if (unlikely(notify_page_fault(regs))) return; /* @@ -870,13 +978,15 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) * vmalloc fault has been handled. * * User-mode registers count as a user access even for any - * potential system fault or CPU buglet. + * potential system fault or CPU buglet: */ if (user_mode_vm(regs)) { local_irq_enable(); error_code |= PF_USER; - } else if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); + } else { + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); + } #ifdef CONFIG_X86_64 if (unlikely(error_code & PF_RSVD)) @@ -884,8 +994,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) #endif /* - * If we're in an interrupt, have no user context or are running in an - * atomic region then we must not take the fault. + * If we're in an interrupt, have no user context or are running + * in an atomic region then we must not take the fault: */ if (unlikely(in_atomic() || !mm)) { bad_area_nosemaphore(regs, error_code, address); @@ -894,19 +1004,19 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) /* * When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in the - * kernel and should generate an OOPS. Unfortunately, in the case of an - * erroneous fault occurring in a code path which already holds mmap_sem - * we will deadlock attempting to validate the fault against the - * address space. Luckily the kernel only validly references user - * space from well defined areas of code, which are listed in the - * exceptions table. + * addresses in user space. All other faults represent errors in + * the kernel and should generate an OOPS. Unfortunately, in the + * case of an erroneous fault occurring in a code path which already + * holds mmap_sem we will deadlock attempting to validate the fault + * against the address space. Luckily the kernel only validly + * references user space from well defined areas of code, which are + * listed in the exceptions table. * * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. + * the source reference check when there is a possibility of a + * deadlock. Attempt to lock the address space, if we cannot we then + * validate the source. If this is invalid we can skip the address + * space check, thus avoiding the deadlock: */ if (unlikely(!down_read_trylock(&mm->mmap_sem))) { if ((error_code & PF_USER) == 0 && @@ -917,8 +1027,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) down_read(&mm->mmap_sem); } else { /* - * The above down_read_trylock() might have succeeded in which - * case we'll have missed the might_sleep() from down_read(). + * The above down_read_trylock() might have succeeded in + * which case we'll have missed the might_sleep() from + * down_read(): */ might_sleep(); } @@ -938,7 +1049,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) /* * Accessing the stack below %sp is always a bug. * The large cushion allows instructions like enter - * and pusha to work. ("enter $65535,$31" pushes + * and pusha to work. ("enter $65535, $31" pushes * 32 pointers and then decrements %sp by 65535.) */ if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { @@ -957,6 +1068,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) */ good_area: write = error_code & PF_WRITE; + if (unlikely(access_error(error_code, write, vma))) { bad_area_access_error(regs, error_code, address); return; @@ -965,13 +1077,15 @@ good_area: /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo - * the fault. + * the fault: */ fault = handle_mm_fault(mm, vma, address, write); + if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); return; } + if (fault & VM_FAULT_MAJOR) tsk->maj_flt++; else @@ -1004,13 +1118,13 @@ void vmalloc_sync_all(void) for (address = VMALLOC_START & PMD_MASK; address >= TASK_SIZE && address < FIXADDR_TOP; address += PMD_SIZE) { + unsigned long flags; struct page *page; spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { - if (!vmalloc_sync_one(page_address(page), - address)) + if (!vmalloc_sync_one(page_address(page), address)) break; } spin_unlock_irqrestore(&pgd_lock, flags); @@ -1018,12 +1132,14 @@ void vmalloc_sync_all(void) #else /* CONFIG_X86_64 */ for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; address += PGDIR_SIZE) { + const pgd_t *pgd_ref = pgd_offset_k(address); unsigned long flags; struct page *page; if (pgd_none(*pgd_ref)) continue; + spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; -- cgit v1.2.3-70-g09d2 From 107a03678cac0dd6cf7095f81442a4fa477e4700 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 20:37:05 +0100 Subject: x86, mm: fault.c, refactor/simplify the is_prefetch() code Impact: no functionality changed Factor out the opcode checker into a helper inline. The code got a tiny bit smaller: text data bss dec hex filename 4632 32 24 4688 1250 fault.o.before 4618 32 24 4674 1242 fault.o.after And it got cleaner / easier to review as well. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 100 ++++++++++++++++++++++++++-------------------------- 1 file changed, 50 insertions(+), 50 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 351d679bf97..72973c7682b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -99,12 +99,58 @@ static inline int notify_page_fault(struct pt_regs *regs) * * Opcode checker based on code by Richard Brunner. */ +static inline int +check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, + unsigned char opcode, int *prefetch) +{ + unsigned char instr_hi = opcode & 0xf0; + unsigned char instr_lo = opcode & 0x0f; + + switch (instr_hi) { + case 0x20: + case 0x30: + /* + * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. + * In X86_64 long mode, the CPU will signal invalid + * opcode if some of these prefixes are present so + * X86_64 will never get here anyway + */ + return ((instr_lo & 7) == 0x6); +#ifdef CONFIG_X86_64 + case 0x40: + /* + * In AMD64 long mode 0x40..0x4F are valid REX prefixes + * Need to figure out under what instruction mode the + * instruction was issued. Could check the LDT for lm, + * but for now it's good enough to assume that long + * mode only uses well known segments or kernel. + */ + return (!user_mode(regs)) || (regs->cs == __USER_CS); +#endif + case 0x60: + /* 0x64 thru 0x67 are valid prefixes in all modes. */ + return (instr_lo & 0xC) == 0x4; + case 0xF0: + /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ + return !instr_lo || (instr_lo>>1) == 1; + case 0x00: + /* Prefetch instruction is 0x0F0D or 0x0F18 */ + if (probe_kernel_address(instr, opcode)) + return 0; + + *prefetch = (instr_lo == 0xF) && + (opcode == 0x0D || opcode == 0x18); + return 0; + default: + return 0; + } +} + static int is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) { unsigned char *max_instr; unsigned char *instr; - int scan_more = 1; int prefetch = 0; /* @@ -114,68 +160,22 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) if (error_code & PF_INSTR) return 0; - instr = (unsigned char *)convert_ip_to_linear(current, regs); + instr = (void *)convert_ip_to_linear(current, regs); max_instr = instr + 15; if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) return 0; - while (scan_more && instr < max_instr) { - unsigned char instr_hi; - unsigned char instr_lo; + while (instr < max_instr) { unsigned char opcode; if (probe_kernel_address(instr, opcode)) break; - instr_hi = opcode & 0xf0; - instr_lo = opcode & 0x0f; instr++; - switch (instr_hi) { - case 0x20: - case 0x30: - /* - * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. - * In X86_64 long mode, the CPU will signal invalid - * opcode if some of these prefixes are present so - * X86_64 will never get here anyway - */ - scan_more = ((instr_lo & 7) == 0x6); - break; -#ifdef CONFIG_X86_64 - case 0x40: - /* - * In AMD64 long mode 0x40..0x4F are valid REX prefixes - * Need to figure out under what instruction mode the - * instruction was issued. Could check the LDT for lm, - * but for now it's good enough to assume that long - * mode only uses well known segments or kernel. - */ - scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); - break; -#endif - case 0x60: - /* 0x64 thru 0x67 are valid prefixes in all modes. */ - scan_more = (instr_lo & 0xC) == 0x4; + if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) break; - case 0xF0: - /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ - scan_more = !instr_lo || (instr_lo>>1) == 1; - break; - case 0x00: - /* Prefetch instruction is 0x0F0D or 0x0F18 */ - scan_more = 0; - - if (probe_kernel_address(instr, opcode)) - break; - prefetch = (instr_lo == 0xF) && - (opcode == 0x0D || opcode == 0x18); - break; - default: - scan_more = 0; - break; - } } return prefetch; } -- cgit v1.2.3-70-g09d2 From 8c938f9fae887f6e180bf802aa1c33cf74712aff Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:12:18 +0100 Subject: x86, mm: fault.c, factor out the vm86 fault check Impact: cleanup Instead of an ugly, open-coded, #ifdef-ed vm86 related legacy check in do_page_fault(), put it into the check_v8086_mode() helper function and merge it with an existing #ifdef. Also, simplify the code flow a tiny bit in the helper. No code changed: arch/x86/mm/fault.o: text data bss dec hex filename 2711 12 12 2735 aaf fault.o.before 2711 12 12 2735 aaf fault.o.after Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 72973c7682b..7dc0615c3cf 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -328,14 +328,41 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) return pmd_k; } -#endif -#ifdef CONFIG_X86_64 +/* + * Did it hit the DOS screen memory VA from vm86 mode? + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ + unsigned long bit; + + if (!v8086_mode(regs)) + return; + + bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + tsk->thread.screen_bitmap |= 1 << bit; +} + +#else /* CONFIG_X86_64: */ + static const char errata93_warning[] = KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" KERN_ERR "******* Please consider a BIOS update.\n" KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; + +/* + * No vm86 mode in 64-bit mode: + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ +} + #endif /* @@ -1091,16 +1118,8 @@ good_area: else tsk->min_flt++; -#ifdef CONFIG_X86_32 - /* - * Did it hit the DOS screen memory VA from vm86 mode? - */ - if (v8086_mode(regs)) { - unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; - if (bit < 32) - tsk->thread.screen_bitmap |= 1 << bit; - } -#endif + check_v8086_mode(regs, address, tsk); + up_read(&mm->mmap_sem); } -- cgit v1.2.3-70-g09d2 From 121d5d0a7e5808fbcfda484efd7ba840ac93450f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:18:08 +0100 Subject: x86, mm: fault.c, enable PF_RSVD checks on 32-bit too Impact: improve page fault handling robustness The 'PF_RSVD' flag (bit 3) of the page-fault error_code is a relatively recent addition to x86 CPUs, so the 32-bit do_fault() implementation never had it. This flag gets set when the CPU detects nonzero values in any reserved bits of the page directory entries. Extend the existing 64-bit check for PF_RSVD in do_page_fault() to 32-bit too. If we detect such a fault then we print a more informative oops and the pagetables. This unifies the code some more, removes an ugly #ifdef and improves the 32-bit page fault code robustness a bit. It slightly increases the 32-bit kernel text size. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7dc0615c3cf..3e366146273 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -477,7 +477,6 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, dump_pagetable(address); } -#ifdef CONFIG_X86_64 static noinline void pgtable_bad(struct pt_regs *regs, unsigned long error_code, unsigned long address) @@ -503,7 +502,6 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code, oops_end(flags, regs, sig); } -#endif static noinline void no_context(struct pt_regs *regs, unsigned long error_code, @@ -1015,10 +1013,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) local_irq_enable(); } -#ifdef CONFIG_X86_64 if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); -#endif /* * If we're in an interrupt, have no user context or are running -- cgit v1.2.3-70-g09d2 From b814d41f0987c7648d7ed07471258101c95c026b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:32:10 +0100 Subject: x86, mm: fault.c, simplify kmmio_fault() Impact: cleanup Remove an #ifdef from kmmio_fault() - we can do this by providing default implementations for is_kmmio_active() and kmmio_handler(). The compiler optimizes it all away in the !CONFIG_MMIOTRACE case. Also, while at it, clean up mmiotrace.h a bit: - standard header guards - standard vertical spaces for structure definitions No code changed (both with mmiotrace on and off in the config): text data bss dec hex filename 2947 12 12 2971 b9b fault.o.before 2947 12 12 2971 b9b fault.o.after Cc: Pekka Paalanen Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 5 +-- include/linux/mmiotrace.h | 78 ++++++++++++++++++++++++++++------------------- 2 files changed, 50 insertions(+), 33 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 3e366146273..fe99af4b86d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -55,13 +55,14 @@ enum x86_pf_error_code { PF_INSTR = 1 << 4, }; +/* + * (returns 0 if mmiotrace is disabled) + */ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { -#ifdef CONFIG_MMIOTRACE if (unlikely(is_kmmio_active())) if (kmmio_handler(regs, addr) == 1) return -1; -#endif return 0; } diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 139d7c88d9c..3d1b7bde128 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -1,5 +1,5 @@ -#ifndef MMIOTRACE_H -#define MMIOTRACE_H +#ifndef _LINUX_MMIOTRACE_H +#define _LINUX_MMIOTRACE_H #include #include @@ -13,28 +13,34 @@ typedef void (*kmmio_post_handler_t)(struct kmmio_probe *, unsigned long condition, struct pt_regs *); struct kmmio_probe { - struct list_head list; /* kmmio internal list */ - unsigned long addr; /* start location of the probe point */ - unsigned long len; /* length of the probe region */ - kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */ - kmmio_post_handler_t post_handler; /* Called after addr is executed */ - void *private; + /* kmmio internal list: */ + struct list_head list; + /* start location of the probe point: */ + unsigned long addr; + /* length of the probe region: */ + unsigned long len; + /* Called before addr is executed: */ + kmmio_pre_handler_t pre_handler; + /* Called after addr is executed: */ + kmmio_post_handler_t post_handler; + void *private; }; +extern unsigned int kmmio_count; + +extern int register_kmmio_probe(struct kmmio_probe *p); +extern void unregister_kmmio_probe(struct kmmio_probe *p); + +#ifdef CONFIG_MMIOTRACE /* kmmio is active by some kmmio_probes? */ static inline int is_kmmio_active(void) { - extern unsigned int kmmio_count; return kmmio_count; } -extern int register_kmmio_probe(struct kmmio_probe *p); -extern void unregister_kmmio_probe(struct kmmio_probe *p); - /* Called from page fault handler. */ extern int kmmio_handler(struct pt_regs *regs, unsigned long addr); -#ifdef CONFIG_MMIOTRACE /* Called from ioremap.c */ extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size, void __iomem *addr); @@ -43,7 +49,17 @@ extern void mmiotrace_iounmap(volatile void __iomem *addr); /* For anyone to insert markers. Remember trailing newline. */ extern int mmiotrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); -#else +#else /* !CONFIG_MMIOTRACE: */ +static inline int is_kmmio_active(void) +{ + return 0; +} + +static inline int kmmio_handler(struct pt_regs *regs, unsigned long addr) +{ + return 0; +} + static inline void mmiotrace_ioremap(resource_size_t offset, unsigned long size, void __iomem *addr) { @@ -63,28 +79,28 @@ static inline int mmiotrace_printk(const char *fmt, ...) #endif /* CONFIG_MMIOTRACE */ enum mm_io_opcode { - MMIO_READ = 0x1, /* struct mmiotrace_rw */ - MMIO_WRITE = 0x2, /* struct mmiotrace_rw */ - MMIO_PROBE = 0x3, /* struct mmiotrace_map */ - MMIO_UNPROBE = 0x4, /* struct mmiotrace_map */ - MMIO_UNKNOWN_OP = 0x5, /* struct mmiotrace_rw */ + MMIO_READ = 0x1, /* struct mmiotrace_rw */ + MMIO_WRITE = 0x2, /* struct mmiotrace_rw */ + MMIO_PROBE = 0x3, /* struct mmiotrace_map */ + MMIO_UNPROBE = 0x4, /* struct mmiotrace_map */ + MMIO_UNKNOWN_OP = 0x5, /* struct mmiotrace_rw */ }; struct mmiotrace_rw { - resource_size_t phys; /* PCI address of register */ - unsigned long value; - unsigned long pc; /* optional program counter */ - int map_id; - unsigned char opcode; /* one of MMIO_{READ,WRITE,UNKNOWN_OP} */ - unsigned char width; /* size of register access in bytes */ + resource_size_t phys; /* PCI address of register */ + unsigned long value; + unsigned long pc; /* optional program counter */ + int map_id; + unsigned char opcode; /* one of MMIO_{READ,WRITE,UNKNOWN_OP} */ + unsigned char width; /* size of register access in bytes */ }; struct mmiotrace_map { - resource_size_t phys; /* base address in PCI space */ - unsigned long virt; /* base virtual address */ - unsigned long len; /* mapping size */ - int map_id; - unsigned char opcode; /* MMIO_PROBE or MMIO_UNPROBE */ + resource_size_t phys; /* base address in PCI space */ + unsigned long virt; /* base virtual address */ + unsigned long len; /* mapping size */ + int map_id; + unsigned char opcode; /* MMIO_PROBE or MMIO_UNPROBE */ }; /* in kernel/trace/trace_mmiotrace.c */ @@ -94,4 +110,4 @@ extern void mmio_trace_rw(struct mmiotrace_rw *rw); extern void mmio_trace_mapping(struct mmiotrace_map *map); extern int mmio_trace_printk(const char *fmt, va_list args); -#endif /* MMIOTRACE_H */ +#endif /* _LINUX_MMIOTRACE_H */ -- cgit v1.2.3-70-g09d2 From b18018126f422f5b706fd750373425e10e84b486 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:42:57 +0100 Subject: x86, mm, kprobes: fault.c, simplify notify_page_fault() Impact: cleanup Remove an #ifdef from notify_page_fault(). The function still compiles to nothing in the !CONFIG_KPROBES case. Introduce kprobes_built_in() and kprobe_fault_handler() helpers to allow this - they returns 0 if !CONFIG_KPROBES. No code changed: text data bss dec hex filename 4618 32 24 4674 1242 fault.o.before 4618 32 24 4674 1242 fault.o.after Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 6 +----- include/linux/kprobes.h | 22 +++++++++++++++++++--- 2 files changed, 20 insertions(+), 8 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fe99af4b86d..379beaec6ca 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -68,11 +68,10 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) static inline int notify_page_fault(struct pt_regs *regs) { -#ifdef CONFIG_KPROBES int ret = 0; /* kprobe_running() needs smp_processor_id() */ - if (!user_mode_vm(regs)) { + if (kprobes_built_in() && !user_mode_vm(regs)) { preempt_disable(); if (kprobe_running() && kprobe_fault_handler(regs, 14)) ret = 1; @@ -80,9 +79,6 @@ static inline int notify_page_fault(struct pt_regs *regs) } return ret; -#else - return 0; -#endif } /* diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 32851eef48f..2ec6cc14a11 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -182,6 +182,14 @@ struct kprobe_blackpoint { DECLARE_PER_CPU(struct kprobe *, current_kprobe); DECLARE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); +/* + * For #ifdef avoidance: + */ +static inline int kprobes_built_in(void) +{ + return 1; +} + #ifdef CONFIG_KRETPROBES extern void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs); @@ -271,8 +279,16 @@ void unregister_kretprobes(struct kretprobe **rps, int num); void kprobe_flush_task(struct task_struct *tk); void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); -#else /* CONFIG_KPROBES */ +#else /* !CONFIG_KPROBES: */ +static inline int kprobes_built_in(void) +{ + return 0; +} +static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + return 0; +} static inline struct kprobe *get_kprobe(void *addr) { return NULL; @@ -329,5 +345,5 @@ static inline void unregister_kretprobes(struct kretprobe **rps, int num) static inline void kprobe_flush_task(struct task_struct *tk) { } -#endif /* CONFIG_KPROBES */ -#endif /* _LINUX_KPROBES_H */ +#endif /* CONFIG_KPROBES */ +#endif /* _LINUX_KPROBES_H */ -- cgit v1.2.3-70-g09d2 From f2f13a8535174dbb813a0607a9d4737cfba98f6c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:50:24 +0100 Subject: x86, mm: fault.c, reorder functions Impact: cleanup Avoid a couple more #ifdefs by moving fundamentally non-unifiable functions into a single #ifdef 32-bit / #else / #endif block in fault.c: vmalloc*(), dump_pagetable(), check_vm8086_mode(). No code changed: text data bss dec hex filename 4618 32 24 4674 1242 fault.o.before 4618 32 24 4674 1242 fault.o.after Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 474 ++++++++++++++++++++++++++-------------------------- 1 file changed, 239 insertions(+), 235 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 379beaec6ca..4ce62fb80da 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -191,18 +191,124 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, force_sig_info(si_signo, &info, tsk); } -#ifdef CONFIG_X86_64 -static int bad_address(void *p) +DEFINE_SPINLOCK(pgd_lock); +LIST_HEAD(pgd_list); + +#ifdef CONFIG_X86_32 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { - unsigned long dummy; + unsigned index = pgd_index(address); + pgd_t *pgd_k; + pud_t *pud, *pud_k; + pmd_t *pmd, *pmd_k; - return probe_kernel_address((unsigned long *)p, dummy); + pgd += index; + pgd_k = init_mm.pgd + index; + + if (!pgd_present(*pgd_k)) + return NULL; + + /* + * set_pgd(pgd, *pgd_k); here would be useless on PAE + * and redundant with the set_pmd() on non-PAE. As would + * set_pud. + */ + pud = pud_offset(pgd, address); + pud_k = pud_offset(pgd_k, address); + if (!pud_present(*pud_k)) + return NULL; + + pmd = pmd_offset(pud, address); + pmd_k = pmd_offset(pud_k, address); + if (!pmd_present(*pmd_k)) + return NULL; + + if (!pmd_present(*pmd)) { + set_pmd(pmd, *pmd_k); + arch_flush_lazy_mmu_mode(); + } else { + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + } + + return pmd_k; +} + +void vmalloc_sync_all(void) +{ + unsigned long address; + + if (SHARED_KERNEL_PMD) + return; + + for (address = VMALLOC_START & PMD_MASK; + address >= TASK_SIZE && address < FIXADDR_TOP; + address += PMD_SIZE) { + + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + if (!vmalloc_sync_one(page_address(page), address)) + break; + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +} + +/* + * 32-bit: + * + * Handle a fault on the vmalloc or module mapping area + */ +static noinline int vmalloc_fault(unsigned long address) +{ + unsigned long pgd_paddr; + pmd_t *pmd_k; + pte_t *pte_k; + + /* Make sure we are in vmalloc area: */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "current" here. We might be inside + * an interrupt in the middle of a task switch.. + */ + pgd_paddr = read_cr3(); + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); + if (!pmd_k) + return -1; + + pte_k = pte_offset_kernel(pmd_k, address); + if (!pte_present(*pte_k)) + return -1; + + return 0; +} + +/* + * Did it hit the DOS screen memory VA from vm86 mode? + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ + unsigned long bit; + + if (!v8086_mode(regs)) + return; + + bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + tsk->thread.screen_bitmap |= 1 << bit; } -#endif static void dump_pagetable(unsigned long address) { -#ifdef CONFIG_X86_32 __typeof__(pte_val(__pte(0))) page; page = read_cr3(); @@ -239,7 +345,132 @@ static void dump_pagetable(unsigned long address) } printk("\n"); -#else /* CONFIG_X86_64 */ +} + +#else /* CONFIG_X86_64: */ + +void vmalloc_sync_all(void) +{ + unsigned long address; + + for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; + address += PGDIR_SIZE) { + + const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +} + +/* + * 64-bit: + * + * Handle a fault on the vmalloc area + * + * This assumes no large pages in there. + */ +static noinline int vmalloc_fault(unsigned long address) +{ + pgd_t *pgd, *pgd_ref; + pud_t *pud, *pud_ref; + pmd_t *pmd, *pmd_ref; + pte_t *pte, *pte_ref; + + /* Make sure we are in vmalloc area: */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + /* + * Copy kernel mappings over when needed. This can also + * happen within a race in page table update. In the later + * case just flush: + */ + pgd = pgd_offset(current->active_mm, address); + pgd_ref = pgd_offset_k(address); + if (pgd_none(*pgd_ref)) + return -1; + + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + + /* + * Below here mismatches are bugs because these lower tables + * are shared: + */ + + pud = pud_offset(pgd, address); + pud_ref = pud_offset(pgd_ref, address); + if (pud_none(*pud_ref)) + return -1; + + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) + BUG(); + + pmd = pmd_offset(pud, address); + pmd_ref = pmd_offset(pud_ref, address); + if (pmd_none(*pmd_ref)) + return -1; + + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) + BUG(); + + pte_ref = pte_offset_kernel(pmd_ref, address); + if (!pte_present(*pte_ref)) + return -1; + + pte = pte_offset_kernel(pmd, address); + + /* + * Don't use pte_page here, because the mappings can point + * outside mem_map, and the NUMA hash lookup cannot handle + * that: + */ + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) + BUG(); + + return 0; +} + +static const char errata93_warning[] = +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" +KERN_ERR "******* Please consider a BIOS update.\n" +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; + +/* + * No vm86 mode in 64-bit mode: + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ +} + +static int bad_address(void *p) +{ + unsigned long dummy; + + return probe_kernel_address((unsigned long *)p, dummy); +} + +static void dump_pagetable(unsigned long address) +{ pgd_t *pgd; pud_t *pud; pmd_t *pmd; @@ -284,83 +515,9 @@ out: return; bad: printk("BAD\n"); -#endif -} - -#ifdef CONFIG_X86_32 -static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) -{ - unsigned index = pgd_index(address); - pgd_t *pgd_k; - pud_t *pud, *pud_k; - pmd_t *pmd, *pmd_k; - - pgd += index; - pgd_k = init_mm.pgd + index; - - if (!pgd_present(*pgd_k)) - return NULL; - - /* - * set_pgd(pgd, *pgd_k); here would be useless on PAE - * and redundant with the set_pmd() on non-PAE. As would - * set_pud. - */ - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); - if (!pud_present(*pud_k)) - return NULL; - - pmd = pmd_offset(pud, address); - pmd_k = pmd_offset(pud_k, address); - if (!pmd_present(*pmd_k)) - return NULL; - - if (!pmd_present(*pmd)) { - set_pmd(pmd, *pmd_k); - arch_flush_lazy_mmu_mode(); - } else { - BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); - } - - return pmd_k; -} - -/* - * Did it hit the DOS screen memory VA from vm86 mode? - */ -static inline void -check_v8086_mode(struct pt_regs *regs, unsigned long address, - struct task_struct *tsk) -{ - unsigned long bit; - - if (!v8086_mode(regs)) - return; - - bit = (address - 0xA0000) >> PAGE_SHIFT; - if (bit < 32) - tsk->thread.screen_bitmap |= 1 << bit; -} - -#else /* CONFIG_X86_64: */ - -static const char errata93_warning[] = -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" -KERN_ERR "******* Please consider a BIOS update.\n" -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; - -/* - * No vm86 mode in 64-bit mode: - */ -static inline void -check_v8086_mode(struct pt_regs *regs, unsigned long address, - struct task_struct *tsk) -{ } -#endif +#endif /* CONFIG_X86_64 */ /* * Workaround for K8 erratum #93 & buggy BIOS. @@ -795,109 +952,6 @@ spurious_fault(unsigned long error_code, unsigned long address) return ret; } -/* - * 32-bit: - * - * Handle a fault on the vmalloc or module mapping area - * - * 64-bit: - * - * Handle a fault on the vmalloc area - * - * This assumes no large pages in there. - */ -static noinline int vmalloc_fault(unsigned long address) -{ -#ifdef CONFIG_X86_32 - unsigned long pgd_paddr; - pmd_t *pmd_k; - pte_t *pte_k; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "current" here. We might be inside - * an interrupt in the middle of a task switch.. - */ - pgd_paddr = read_cr3(); - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); - if (!pmd_k) - return -1; - - pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) - return -1; - - return 0; -#else - pgd_t *pgd, *pgd_ref; - pud_t *pud, *pud_ref; - pmd_t *pmd, *pmd_ref; - pte_t *pte, *pte_ref; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Copy kernel mappings over when needed. This can also - * happen within a race in page table update. In the later - * case just flush: - */ - pgd = pgd_offset(current->active_mm, address); - pgd_ref = pgd_offset_k(address); - if (pgd_none(*pgd_ref)) - return -1; - - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - - /* - * Below here mismatches are bugs because these lower tables - * are shared: - */ - - pud = pud_offset(pgd, address); - pud_ref = pud_offset(pgd_ref, address); - if (pud_none(*pud_ref)) - return -1; - - if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) - BUG(); - - pmd = pmd_offset(pud, address); - pmd_ref = pmd_offset(pud_ref, address); - if (pmd_none(*pmd_ref)) - return -1; - - if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) - BUG(); - - pte_ref = pte_offset_kernel(pmd_ref, address); - if (!pte_present(*pte_ref)) - return -1; - - pte = pte_offset_kernel(pmd, address); - - /* - * Don't use pte_page here, because the mappings can point - * outside mem_map, and the NUMA hash lookup cannot handle - * that: - */ - if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) - BUG(); - - return 0; -#endif -} - int show_unhandled_signals = 1; static inline int @@ -1115,53 +1169,3 @@ good_area: up_read(&mm->mmap_sem); } - -DEFINE_SPINLOCK(pgd_lock); -LIST_HEAD(pgd_list); - -void vmalloc_sync_all(void) -{ - unsigned long address; - -#ifdef CONFIG_X86_32 - if (SHARED_KERNEL_PMD) - return; - - for (address = VMALLOC_START & PMD_MASK; - address >= TASK_SIZE && address < FIXADDR_TOP; - address += PMD_SIZE) { - - unsigned long flags; - struct page *page; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - if (!vmalloc_sync_one(page_address(page), address)) - break; - } - spin_unlock_irqrestore(&pgd_lock, flags); - } -#else /* CONFIG_X86_64 */ - for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; - address += PGDIR_SIZE) { - - const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock_irqrestore(&pgd_lock, flags); - } -#endif -} -- cgit v1.2.3-70-g09d2 From 8f7661496cece8320137d5e26808825498fd2b26 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:00:29 +0100 Subject: x86, mm: fault.c, unify oops printing Impact: refine/extend page fault related oops printing on 64-bit - honor the pause_on_oops logic on 64-bit too - print out NX fault warnings on 64-bit as well - factor out the NX fault message to make it git-greppable and readable Note that this means that we do the PF_INSTR check on 32-bit non-PAE as well where it should not occur ... normally. Cannot hurt. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 4ce62fb80da..ebfaca3bbb1 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -595,28 +595,24 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) return 0; } +static const char nx_warning[] = KERN_CRIT +"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; + static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { -#ifdef CONFIG_X86_32 if (!oops_may_print()) return; -#endif -#ifdef CONFIG_X86_PAE if (error_code & PF_INSTR) { unsigned int level; pte_t *pte = lookup_address(address, &level); - if (pte && pte_present(*pte) && !pte_exec(*pte)) { - printk(KERN_CRIT "kernel tried to execute " - "NX-protected page - exploit attempt? " - "(uid: %d)\n", current_uid()); - } + if (pte && pte_present(*pte) && !pte_exec(*pte)) + printk(nx_warning, current_uid()); } -#endif printk(KERN_ALERT "BUG: unable to handle kernel "); if (address < PAGE_SIZE) -- cgit v1.2.3-70-g09d2 From 1cc99544dde9e48602979f16b9309fade6e93051 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:07:48 +0100 Subject: x86, mm: fault.c, unify oops handling Impact: add oops-recursion check to 32-bit Unify the oops state-machine, to the 64-bit version. It is slightly more careful in that it does a recursion check in oops_begin(), and is thus more likely to show the relevant oops. It also means that 32-bit will print one more line at the end of pagefault triggered oopses: printk(KERN_EMERG "CR2: %016lx\n", address); Which is generally good information to be seen in partial-dump digital-camera jpegs ;-) The downside is the somewhat more complex critical path. Both variants have been tested well meanwhile by kernel developers crashing their boxes so i dont think this is a practical worry. This removes 3 ugly #ifdefs from no_context() and makes the function a lot nicer read. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ebfaca3bbb1..8fe2dd254df 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -659,11 +659,8 @@ no_context(struct pt_regs *regs, unsigned long error_code, { struct task_struct *tsk = current; unsigned long *stackend; - -#ifdef CONFIG_X86_64 unsigned long flags; int sig; -#endif /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) @@ -690,11 +687,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice: */ -#ifdef CONFIG_X86_32 - bust_spinlocks(1); -#else flags = oops_begin(); -#endif show_fault_oops(regs, error_code, address); @@ -702,15 +695,10 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (*stackend != STACK_END_MAGIC) printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; -#ifdef CONFIG_X86_32 - die("Oops", regs, error_code); - bust_spinlocks(0); - do_exit(SIGKILL); -#else sig = SIGKILL; if (__die("Oops", regs, error_code)) sig = 0; @@ -719,7 +707,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, printk(KERN_EMERG "CR2: %016lx\n", address); oops_end(flags, regs, sig); -#endif } /* -- cgit v1.2.3-70-g09d2 From c3731c68668325abddee8665018c74c7156a57be Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:22:34 +0100 Subject: x86, mm: fault.c, remove #ifdef from do_page_fault() Impact: cleanup do_page_fault() has this ugly #ifdef in its prototype: #ifdef CONFIG_X86_64 asmlinkage #endif void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) Replace it with 'dotraplinkage' which maps to exactly the above construct: nothing on 32-bit and asmlinkage on 64-bit. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8fe2dd254df..9c2dc5d7953 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -972,10 +972,8 @@ static int fault_in_kernel_space(unsigned long address) * and the problem, and then passes it off to one of the appropriate * routines. */ -#ifdef CONFIG_X86_64 -asmlinkage -#endif -void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) +dotraplinkage void __kprobes +do_page_fault(struct pt_regs *regs, unsigned long error_code) { struct vm_area_struct *vma; struct task_struct *tsk; -- cgit v1.2.3-70-g09d2 From d951734654f76a370a89b4e531af9b765bd13541 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:32:28 +0100 Subject: x86, mm: rename TASK_SIZE64 => TASK_SIZE_MAX Impact: cleanup Rename TASK_SIZE64 to TASK_SIZE_MAX, and provide the define on 32-bit too. (mapped to TASK_SIZE) This allows 32-bit code to make use of the (former-) TASK_SIZE64 symbol as well, in a clean way. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 9 +++++---- arch/x86/kernel/ptrace.c | 2 +- arch/x86/mm/fault.c | 2 +- arch/x86/vdso/vma.c | 4 ++-- 4 files changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 72914d0315e..c7a98f73821 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -861,6 +861,7 @@ static inline void spin_lock_prefetch(const void *x) * User space process size: 3GB (default). */ #define TASK_SIZE PAGE_OFFSET +#define TASK_SIZE_MAX TASK_SIZE #define STACK_TOP TASK_SIZE #define STACK_TOP_MAX STACK_TOP @@ -920,7 +921,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); /* * User space process size. 47bits minus one guard page. */ -#define TASK_SIZE64 ((1UL << 47) - PAGE_SIZE) +#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) /* This decides where the kernel will search for a free chunk of vm * space during mmap's. @@ -929,12 +930,12 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); 0xc0000000 : 0xFFFFe000) #define TASK_SIZE (test_thread_flag(TIF_IA32) ? \ - IA32_PAGE_OFFSET : TASK_SIZE64) + IA32_PAGE_OFFSET : TASK_SIZE_MAX) #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \ - IA32_PAGE_OFFSET : TASK_SIZE64) + IA32_PAGE_OFFSET : TASK_SIZE_MAX) #define STACK_TOP TASK_SIZE -#define STACK_TOP_MAX TASK_SIZE64 +#define STACK_TOP_MAX TASK_SIZE_MAX #define INIT_THREAD { \ .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index d2f7cd5b2c8..fb2159a5c81 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -268,7 +268,7 @@ static unsigned long debugreg_addr_limit(struct task_struct *task) if (test_tsk_thread_flag(task, TIF_IA32)) return IA32_PAGE_OFFSET - 3; #endif - return TASK_SIZE64 - 7; + return TASK_SIZE_MAX - 7; } #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9c2dc5d7953..6fa9f175cba 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -963,7 +963,7 @@ static int fault_in_kernel_space(unsigned long address) #ifdef CONFIG_X86_32 return address >= TASK_SIZE; #else - return address >= TASK_SIZE64; + return address >= TASK_SIZE_MAX; #endif } diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 9c98cc6ba97..7133cdf9098 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -85,8 +85,8 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) unsigned long addr, end; unsigned offset; end = (start + PMD_SIZE - 1) & PMD_MASK; - if (end >= TASK_SIZE64) - end = TASK_SIZE64; + if (end >= TASK_SIZE_MAX) + end = TASK_SIZE_MAX; end -= len; /* This loses some more bits than a modulo, but is cheaper */ offset = get_random_int() & (PTRS_PER_PTE - 1); -- cgit v1.2.3-70-g09d2 From 7c178a26d3e753d2a4346d3e4b8aa549d387f698 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:27:29 +0100 Subject: x86, mm: fault.c, remove #ifdef from fault_in_kernel_space() Impact: cleanup Removal of an #ifdef in fault_in_kernel_space(), by making use of the new TASK_SIZE_MAX symbol which is now available on 32-bit too. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 6fa9f175cba..f195691ec26 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -960,11 +960,7 @@ access_error(unsigned long error_code, int write, struct vm_area_struct *vma) static int fault_in_kernel_space(unsigned long address) { -#ifdef CONFIG_X86_32 - return address >= TASK_SIZE; -#else return address >= TASK_SIZE_MAX; -#endif } /* -- cgit v1.2.3-70-g09d2 From cd1b68f08f6328737928e5b8ba8ef80394680ff0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:39:02 +0100 Subject: x86, mm: fault.c, give another attempt at prefetch handing before SIGBUS Impact: extend prefetch handling on 64-bit Currently there's an extra is_prefetch() check done in do_sigbus(), which we only do on 32 bits. This is a last-ditch check before we terminate a task, so it's worth giving prefetch instructions another chance - should none of our existing quirks have caught a prefetch instruction related spurious fault. The only risk is if a prefetch causes a real sigbus, in that case we'll not OOM but try another fault. But this code has been on 32-bit for a long time, so it should be fine in practice. So do this on 64-bit too - and thus remove one more #ifdef. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f195691ec26..413e835e4a8 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -836,11 +836,9 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) if (!(error_code & PF_USER)) no_context(regs, error_code, address); -#ifdef CONFIG_X86_32 - /* User space => ok to do another page fault: */ + /* User-space => ok to do another page fault: */ if (is_prefetch(regs, error_code, address)) return; -#endif tsk->thread.cr2 = address; tsk->thread.error_code = error_code; -- cgit v1.2.3-70-g09d2 From f8eeb2e6be367d79be3617f0a12646bced8b2fe6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:13:36 +0100 Subject: x86, mm: fault.c, update copyrights Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 413e835e4a8..9bb07d331c3 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1,6 +1,7 @@ /* * Copyright (C) 1995 Linus Torvalds * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. + * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ #include #include -- cgit v1.2.3-70-g09d2 From 2366c298b5afe52e635afd5604b69ce9fd4471fc Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Sun, 22 Feb 2009 01:01:13 +0100 Subject: x86: numa_32.c: fix sparse warning: Using plain integer as NULL pointer Fix this sparse warning: arch/x86/mm/numa_32.c:197:24: warning: Using plain integer as NULL pointer Signed-off-by: Hannes Eder Cc: trivial@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index d1f7439d173..3957cd6d645 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -194,7 +194,7 @@ void *alloc_remap(int nid, unsigned long size) size = ALIGN(size, L1_CACHE_BYTES); if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) - return 0; + return NULL; node_remap_alloc_vaddr[nid] += size; memset(allocation, 0, size); -- cgit v1.2.3-70-g09d2 From b319eed0aa0a6d710887350a3cb734c572aa64c4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 10:24:18 +0100 Subject: x86, mm: fault.c, simplify kmmio_fault(), cleanup Clarify the kmmio_fault() comment. Acked-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9bb07d331c3..a03b7279efa 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -57,7 +57,8 @@ enum x86_pf_error_code { }; /* - * (returns 0 if mmiotrace is disabled) + * Returns 0 if mmiotrace is disabled, or if the fault is not + * handled by mmiotrace: */ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { -- cgit v1.2.3-70-g09d2 From 458a3e644c3327be529393982e24277eda8f1ac7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: x86: update populate_extra_pte() and add populate_extra_pmd() Impact: minor change to populate_extra_pte() and addition of pmd flavor Update populate_extra_pte() to return pointer to the pte_t for the specified address and add populate_extra_pmd() which only populates till the pmd and returns pointer to the pmd entry for the address. For 64bit, pud/pmd/pte fill functions are separated out from set_pte_vaddr[_pud]() and used for set_pte_vaddr[_pud]() and populate_extra_{pte|pmd}(). Signed-off-by: Tejun Heo --- arch/x86/include/asm/pgtable.h | 3 +- arch/x86/kernel/setup_percpu.c | 7 +++- arch/x86/mm/init_32.c | 13 ++++++-- arch/x86/mm/init_64.c | 75 +++++++++++++++++++++++++----------------- 4 files changed, 63 insertions(+), 35 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index dd91c2515c6..46312eb0d68 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -402,7 +402,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, /* Install a pte for a particular vaddr in kernel space. */ void set_pte_vaddr(unsigned long vaddr, pte_t pte); -void populate_extra_pte(unsigned long vaddr); +pmd_t *populate_extra_pmd(unsigned long vaddr); +pte_t *populate_extra_pte(unsigned long vaddr); #ifdef CONFIG_X86_32 extern void native_pagetable_setup_start(pgd_t *base); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 2dce4355821..671e6528a82 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -41,6 +41,11 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); +static void __init pcpu4k_populate_pte(unsigned long addr) +{ + populate_extra_pte(addr); +} + static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -104,7 +109,7 @@ void __init setup_per_cpu_areas(void) } } - pcpu_unit_size = pcpu_setup_static(populate_extra_pte, pages, size); + pcpu_unit_size = pcpu_setup_static(pcpu4k_populate_pte, pages, size); free_bootmem(__pa(pages), pages_size); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8b1a0ef7f87..84a26883ab4 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -137,14 +137,21 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) return pte_offset_kernel(pmd, 0); } -void __init populate_extra_pte(unsigned long vaddr) +pmd_t * __init populate_extra_pmd(unsigned long vaddr) { int pgd_idx = pgd_index(vaddr); int pmd_idx = pmd_index(vaddr); + + return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx; +} + +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + int pte_idx = pte_index(vaddr); pmd_t *pmd; - pmd = one_md_table_init(swapper_pg_dir + pgd_idx); - one_page_table_init(pmd + pmd_idx); + pmd = populate_extra_pmd(vaddr); + return one_page_table_init(pmd) + pte_idx; } static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7f91e2cdc4c..7d4e76da336 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -168,34 +168,51 @@ static __ref void *spp_getpage(void) return ptr; } -void -set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +static pud_t * __init fill_pud(pgd_t *pgd, unsigned long vaddr) { - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + if (pgd_none(*pgd)) { + pud_t *pud = (pud_t *)spp_getpage(); + pgd_populate(&init_mm, pgd, pud); + if (pud != pud_offset(pgd, 0)) + printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", + pud, pud_offset(pgd, 0)); + } + return pud_offset(pgd, vaddr); +} - pud = pud_page + pud_index(vaddr); +static pmd_t * __init fill_pmd(pud_t *pud, unsigned long vaddr) +{ if (pud_none(*pud)) { - pmd = (pmd_t *) spp_getpage(); + pmd_t *pmd = (pmd_t *) spp_getpage(); pud_populate(&init_mm, pud, pmd); - if (pmd != pmd_offset(pud, 0)) { + if (pmd != pmd_offset(pud, 0)) printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", - pmd, pmd_offset(pud, 0)); - return; - } + pmd, pmd_offset(pud, 0)); } - pmd = pmd_offset(pud, vaddr); + return pmd_offset(pud, vaddr); +} + +static pte_t * __init fill_pte(pmd_t *pmd, unsigned long vaddr) +{ if (pmd_none(*pmd)) { - pte = (pte_t *) spp_getpage(); + pte_t *pte = (pte_t *) spp_getpage(); pmd_populate_kernel(&init_mm, pmd, pte); - if (pte != pte_offset_kernel(pmd, 0)) { + if (pte != pte_offset_kernel(pmd, 0)) printk(KERN_ERR "PAGETABLE BUG #02!\n"); - return; - } } + return pte_offset_kernel(pmd, vaddr); +} + +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pud = pud_page + pud_index(vaddr); + pmd = fill_pmd(pud, vaddr); + pte = fill_pte(pmd, vaddr); - pte = pte_offset_kernel(pmd, vaddr); set_pte(pte, new_pte); /* @@ -205,8 +222,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) __flush_tlb_one(vaddr); } -void -set_pte_vaddr(unsigned long vaddr, pte_t pteval) +void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; pud_t *pud_page; @@ -223,23 +239,22 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval) set_pte_vaddr_pud(pud_page, vaddr, pteval); } -void __init populate_extra_pte(unsigned long vaddr) +pmd_t * __init populate_extra_pmd(unsigned long vaddr) { pgd_t *pgd; pud_t *pud; pgd = pgd_offset_k(vaddr); - if (pgd_none(*pgd)) { - pud = (pud_t *)spp_getpage(); - pgd_populate(&init_mm, pgd, pud); - if (pud != pud_offset(pgd, 0)) { - printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", - pud, pud_offset(pgd, 0)); - return; - } - } + pud = fill_pud(pgd, vaddr); + return fill_pmd(pud, vaddr); +} + +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + pmd_t *pmd; - set_pte_vaddr_pud((pud_t *)pgd_page_vaddr(*pgd), vaddr, __pte(0)); + pmd = populate_extra_pmd(vaddr); + return fill_pte(pmd, vaddr); } /* -- cgit v1.2.3-70-g09d2 From 40823f737e5bd186a1156fb1c28f360260e1e084 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 25 Feb 2009 11:26:18 +0100 Subject: x86: memtest: reuse test patterns when memtest parameter exceeds number of available patterns Impact: fix unexpected behaviour when pattern number is out of range Current implementation provides 4 patterns for memtest. The code doesn't check whether the memtest parameter value exceeds the maximum pattern number. Instead the memtest code pretends to test with non-existing patterns, e.g. when booting with memtest=10 I've observed the following ... early_memtest: pattern num 10 0000001000 - 0000006000 pattern 0 ... 0000001000 - 0000006000 pattern 1 ... 0000001000 - 0000006000 pattern 2 ... 0000001000 - 0000006000 pattern 3 ... 0000001000 - 0000006000 pattern 4 ... 0000001000 - 0000006000 pattern 5 ... 0000001000 - 0000006000 pattern 6 ... 0000001000 - 0000006000 pattern 7 ... 0000001000 - 0000006000 pattern 8 ... 0000001000 - 0000006000 pattern 9 ... But in fact Linux didn't test anything for patterns > 4 as the default case in memtest() is to leave the function. I suggest to use the memtest parameter as the number of tests to be performed and to re-iterate over all existing patterns. Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/mm/memtest.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 9cab18b0b85..00b8bdc64c3 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -9,6 +9,8 @@ #include +#define _MAX_MEM_PATTERNS 4 + static void __init memtest(unsigned long start_phys, unsigned long size, unsigned pattern) { @@ -21,6 +23,8 @@ static void __init memtest(unsigned long start_phys, unsigned long size, unsigned long count; unsigned long incr; + pattern = pattern % _MAX_MEM_PATTERNS; + switch (pattern) { case 0: val = 0UL; @@ -110,8 +114,9 @@ void __init early_memtest(unsigned long start, unsigned long end) t_size = end - t_start; printk(KERN_CONT "\n %010llx - %010llx pattern %d", - (unsigned long long)t_start, - (unsigned long long)t_start + t_size, pattern); + (unsigned long long)t_start, + (unsigned long long)t_start + t_size, + pattern % _MAX_MEM_PATTERNS); memtest(t_start, t_size, pattern); -- cgit v1.2.3-70-g09d2 From 6d74171bf7315257d276aa35400c5a8d6a993f19 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 25 Feb 2009 11:27:27 +0100 Subject: x86: memtest: introduce array to select memtest patterns Impact: code cleanup Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/mm/memtest.c | 65 ++++++++++++++++++--------------------------------- 1 file changed, 23 insertions(+), 42 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 00b8bdc64c3..827f94044cf 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -9,48 +9,25 @@ #include -#define _MAX_MEM_PATTERNS 4 +static u64 patterns[] __initdata = { + 0, + 0xffffffffffffffffULL, + 0x5555555555555555ULL, + 0xaaaaaaaaaaaaaaaaULL, +}; static void __init memtest(unsigned long start_phys, unsigned long size, - unsigned pattern) + u64 pattern) { unsigned long i; - unsigned long *start; + u64 *start; unsigned long start_bad; unsigned long last_bad; - unsigned long val; unsigned long start_phys_aligned; unsigned long count; unsigned long incr; - pattern = pattern % _MAX_MEM_PATTERNS; - - switch (pattern) { - case 0: - val = 0UL; - break; - case 1: - val = -1UL; - break; - case 2: -#ifdef CONFIG_X86_64 - val = 0x5555555555555555UL; -#else - val = 0x55555555UL; -#endif - break; - case 3: -#ifdef CONFIG_X86_64 - val = 0xaaaaaaaaaaaaaaaaUL; -#else - val = 0xaaaaaaaaUL; -#endif - break; - default: - return; - } - - incr = sizeof(unsigned long); + incr = sizeof(pattern); start_phys_aligned = ALIGN(start_phys, incr); count = (size - (start_phys_aligned - start_phys))/incr; start = __va(start_phys_aligned); @@ -58,15 +35,16 @@ static void __init memtest(unsigned long start_phys, unsigned long size, last_bad = 0; for (i = 0; i < count; i++) - start[i] = val; + start[i] = pattern; for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { - if (*start != val) { + if (*start != pattern) { if (start_phys_aligned == last_bad + incr) { last_bad += incr; } else { if (start_bad) { - printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved", - val, start_bad, last_bad + incr); + printk(KERN_CONT "\n %016llx bad mem addr %010lx - %010lx reserved", + (unsigned long long) pattern, + start_bad, last_bad + incr); reserve_early(start_bad, last_bad + incr, "BAD RAM"); } start_bad = last_bad = start_phys_aligned; @@ -74,8 +52,9 @@ static void __init memtest(unsigned long start_phys, unsigned long size, } } if (start_bad) { - printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved", - val, start_bad, last_bad + incr); + printk(KERN_CONT "\n %016llx bad mem addr %010lx - %010lx reserved", + (unsigned long long) pattern, start_bad, + last_bad + incr); reserve_early(start_bad, last_bad + incr, "BAD RAM"); } } @@ -95,13 +74,16 @@ early_param("memtest", parse_memtest); void __init early_memtest(unsigned long start, unsigned long end) { u64 t_start, t_size; - unsigned pattern; + unsigned int i; + u64 pattern; if (!memtest_pattern) return; printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); - for (pattern = 0; pattern < memtest_pattern; pattern++) { + for (i = 0; i < memtest_pattern; i++) { + unsigned int idx = i % ARRAY_SIZE(patterns); + pattern = patterns[idx]; t_start = start; t_size = 0; while (t_start < end) { @@ -115,8 +97,7 @@ void __init early_memtest(unsigned long start, unsigned long end) printk(KERN_CONT "\n %010llx - %010llx pattern %d", (unsigned long long)t_start, - (unsigned long long)t_start + t_size, - pattern % _MAX_MEM_PATTERNS); + (unsigned long long)t_start + t_size, idx); memtest(t_start, t_size, pattern); -- cgit v1.2.3-70-g09d2 From 7dad169e57eda1f0aa6dc5ac43a898b4b0ced2c7 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 25 Feb 2009 11:28:07 +0100 Subject: x86: memtest: cleanup memtest function Impact: code cleanup Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/mm/memtest.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 827f94044cf..01a72d6825b 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -16,6 +16,15 @@ static u64 patterns[] __initdata = { 0xaaaaaaaaaaaaaaaaULL, }; +static void __init reserve_bad_mem(u64 pattern, unsigned long start_bad, + unsigned long end_bad) +{ + printk(KERN_CONT "\n %016llx bad mem addr " + "%010lx - %010lx reserved", + (unsigned long long) pattern, start_bad, end_bad); + reserve_early(start_bad, end_bad, "BAD RAM"); +} + static void __init memtest(unsigned long start_phys, unsigned long size, u64 pattern) { @@ -37,26 +46,18 @@ static void __init memtest(unsigned long start_phys, unsigned long size, for (i = 0; i < count; i++) start[i] = pattern; for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { - if (*start != pattern) { - if (start_phys_aligned == last_bad + incr) { - last_bad += incr; - } else { - if (start_bad) { - printk(KERN_CONT "\n %016llx bad mem addr %010lx - %010lx reserved", - (unsigned long long) pattern, - start_bad, last_bad + incr); - reserve_early(start_bad, last_bad + incr, "BAD RAM"); - } - start_bad = last_bad = start_phys_aligned; - } + if (*start == pattern) + continue; + if (start_phys_aligned == last_bad + incr) { + last_bad += incr; + continue; } + if (start_bad) + reserve_bad_mem(pattern, start_bad, last_bad + incr); + start_bad = last_bad = start_phys_aligned; } - if (start_bad) { - printk(KERN_CONT "\n %016llx bad mem addr %010lx - %010lx reserved", - (unsigned long long) pattern, start_bad, - last_bad + incr); - reserve_early(start_bad, last_bad + incr, "BAD RAM"); - } + if (start_bad) + reserve_bad_mem(pattern, start_bad, last_bad + incr); } /* default is disabled */ -- cgit v1.2.3-70-g09d2 From 570c9e69aaa84689fb8ed2a3a4af39ca54ba7a47 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 25 Feb 2009 11:28:58 +0100 Subject: x86: memtest: adapt log messages - print test pattern instead of pattern number, - show pattern as stored in memory, - use proper priority flags, - consistent use of u64 throughout the code Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/mm/memtest.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 01a72d6825b..3232397783a 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -16,25 +16,22 @@ static u64 patterns[] __initdata = { 0xaaaaaaaaaaaaaaaaULL, }; -static void __init reserve_bad_mem(u64 pattern, unsigned long start_bad, - unsigned long end_bad) +static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) { - printk(KERN_CONT "\n %016llx bad mem addr " - "%010lx - %010lx reserved", - (unsigned long long) pattern, start_bad, end_bad); + printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n", + (unsigned long long) pattern, + (unsigned long long) start_bad, + (unsigned long long) end_bad); reserve_early(start_bad, end_bad, "BAD RAM"); } -static void __init memtest(unsigned long start_phys, unsigned long size, - u64 pattern) +static void __init memtest(u64 pattern, u64 start_phys, u64 size) { - unsigned long i; + u64 i, count; u64 *start; - unsigned long start_bad; - unsigned long last_bad; - unsigned long start_phys_aligned; - unsigned long count; - unsigned long incr; + u64 start_bad, last_bad; + u64 start_phys_aligned; + size_t incr; incr = sizeof(pattern); start_phys_aligned = ALIGN(start_phys, incr); @@ -81,7 +78,7 @@ void __init early_memtest(unsigned long start, unsigned long end) if (!memtest_pattern) return; - printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); + printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern); for (i = 0; i < memtest_pattern; i++) { unsigned int idx = i % ARRAY_SIZE(patterns); pattern = patterns[idx]; @@ -96,14 +93,13 @@ void __init early_memtest(unsigned long start, unsigned long end) if (t_start + t_size > end) t_size = end - t_start; - printk(KERN_CONT "\n %010llx - %010llx pattern %d", - (unsigned long long)t_start, - (unsigned long long)t_start + t_size, idx); - - memtest(t_start, t_size, pattern); + printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", + (unsigned long long) t_start, + (unsigned long long) t_start + t_size, + (unsigned long long) cpu_to_be64(pattern)); + memtest(pattern, t_start, t_size); t_start += t_size; } } - printk(KERN_CONT "\n"); } -- cgit v1.2.3-70-g09d2 From bfb4dc0da45f8fddc76eba7e62919420c7d6dae2 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 25 Feb 2009 11:30:04 +0100 Subject: x86: memtest: wipe out test pattern from memory Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/mm/memtest.c | 56 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 23 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 3232397783a..9c52ef19e6f 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -57,6 +57,29 @@ static void __init memtest(u64 pattern, u64 start_phys, u64 size) reserve_bad_mem(pattern, start_bad, last_bad + incr); } +static void __init do_one_pass(u64 pattern, u64 start, u64 end) +{ + u64 size = 0; + + while (start < end) { + start = find_e820_area_size(start, &size, 1); + + /* done ? */ + if (start >= end) + break; + if (start + size > end) + size = end - start; + + printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", + (unsigned long long) start, + (unsigned long long) start + size, + (unsigned long long) cpu_to_be64(pattern)); + memtest(pattern, start, size); + + start += size; + } +} + /* default is disabled */ static int memtest_pattern __initdata; @@ -71,35 +94,22 @@ early_param("memtest", parse_memtest); void __init early_memtest(unsigned long start, unsigned long end) { - u64 t_start, t_size; unsigned int i; - u64 pattern; + unsigned int idx = 0; if (!memtest_pattern) return; printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern); for (i = 0; i < memtest_pattern; i++) { - unsigned int idx = i % ARRAY_SIZE(patterns); - pattern = patterns[idx]; - t_start = start; - t_size = 0; - while (t_start < end) { - t_start = find_e820_area_size(t_start, &t_size, 1); - - /* done ? */ - if (t_start >= end) - break; - if (t_start + t_size > end) - t_size = end - t_start; - - printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", - (unsigned long long) t_start, - (unsigned long long) t_start + t_size, - (unsigned long long) cpu_to_be64(pattern)); - memtest(pattern, t_start, t_size); - - t_start += t_size; - } + idx = i % ARRAY_SIZE(patterns); + do_one_pass(patterns[idx], start, end); + } + + if (idx > 0) { + printk(KERN_INFO "early_memtest: wipe out " + "test pattern from memory\n"); + /* additional test with pattern 0 will do this */ + do_one_pass(0, start, end); } } -- cgit v1.2.3-70-g09d2 From 63823126c221dd721ce7351b596b3b73aa943613 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 25 Feb 2009 11:31:49 +0100 Subject: x86: memtest: add additional (regular) test patterns Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/mm/memtest.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 9c52ef19e6f..0bcd7883d03 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -14,6 +14,19 @@ static u64 patterns[] __initdata = { 0xffffffffffffffffULL, 0x5555555555555555ULL, 0xaaaaaaaaaaaaaaaaULL, + 0x1111111111111111ULL, + 0x2222222222222222ULL, + 0x4444444444444444ULL, + 0x8888888888888888ULL, + 0x3333333333333333ULL, + 0x6666666666666666ULL, + 0x9999999999999999ULL, + 0xccccccccccccccccULL, + 0x7777777777777777ULL, + 0xbbbbbbbbbbbbbbbbULL, + 0xddddddddddddddddULL, + 0xeeeeeeeeeeeeeeeeULL, + 0x7a6c7258554e494cULL, /* yeah ;-) */ }; static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) -- cgit v1.2.3-70-g09d2 From 7880f7464546842ee14179bef16a6e14381ea638 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 24 Feb 2009 17:35:13 -0800 Subject: gpu/drm, x86, PAT: routine to keep identity map in sync Add a function to check and keep identity maps in sync, when changing any memory type. One of the follow on patches will also use this routine. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Cc: Dave Airlie Cc: Jesse Barnes Cc: Eric Anholt Cc: Keith Packard Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pat.h | 3 +++ arch/x86/mm/pat.c | 46 +++++++++++++++++++++++++++++----------------- 2 files changed, 32 insertions(+), 17 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index b8493b3b989..abb3c29fc9d 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -19,4 +19,7 @@ extern int free_memtype(u64 start, u64 end); extern void pat_disable(char *reason); +extern int kernel_map_sync_memtype(u64 base, unsigned long size, + unsigned long flag); + #endif /* _ASM_X86_PAT_H */ diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index aebbf67a79d..399383c6f61 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -624,6 +624,33 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) free_memtype(addr, addr + size); } +/* + * Change the memory type for the physial address range in kernel identity + * mapping space if that range is a part of identity map. + */ +int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) +{ + unsigned long id_sz; + + if (!pat_enabled || base >= __pa(high_memory)) + return 0; + + id_sz = (__pa(high_memory) < base + size) ? + __pa(high_memory) - base : + size; + + if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) { + printk(KERN_INFO + "%s:%d ioremap_change_attr failed %s " + "for %Lx-%Lx\n", + current->comm, current->pid, + cattr_name(flags), + base, (unsigned long long)(base + size)); + return -EINVAL; + } + return 0; +} + /* * Internal interface to reserve a range of physical memory with prot. * Reserved non RAM regions only and after successful reserve_memtype, @@ -633,7 +660,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, int strict_prot) { int is_ram = 0; - int id_sz, ret; + int ret; unsigned long flags; unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK); @@ -670,23 +697,8 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, flags); } - /* Need to keep identity mapping in sync */ - if (paddr >= __pa(high_memory)) - return 0; - - id_sz = (__pa(high_memory) < paddr + size) ? - __pa(high_memory) - paddr : - size; - - if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) { + if (kernel_map_sync_memtype(paddr, size, flags) < 0) { free_memtype(paddr, paddr + size); - printk(KERN_ERR - "%s:%d reserve_pfn_range ioremap_change_attr failed %s " - "for %Lx-%Lx\n", - current->comm, current->pid, - cattr_name(flags), - (unsigned long long)paddr, - (unsigned long long)(paddr + size)); return -EINVAL; } return 0; -- cgit v1.2.3-70-g09d2 From 17581ad812a9abb0182260374ef2e52d4a808a64 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 24 Feb 2009 17:35:14 -0800 Subject: gpu/drm, x86, PAT: PAT support for io_mapping_* Make io_mapping_create_wc and io_mapping_free go through PAT to make sure that there are no memory type aliases. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Cc: Dave Airlie Cc: Jesse Barnes Cc: Eric Anholt Cc: Keith Packard Signed-off-by: Ingo Molnar --- arch/x86/include/asm/iomap.h | 5 ++++- arch/x86/mm/iomap_32.c | 42 ++++++++++++++++++++++++++++++++++++++++-- include/linux/io-mapping.h | 6 ++++-- 3 files changed, 48 insertions(+), 5 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index 86af26091d6..bd46495ff7d 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -24,7 +24,10 @@ #include int -is_io_mapping_possible(resource_size_t base, unsigned long size); +reserve_io_memtype_wc(u64 base, unsigned long size, pgprot_t *prot); + +void +free_io_memtype(u64 base, unsigned long size); void * iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 6c2b1af1692..94596f794b1 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -21,13 +21,13 @@ #include #ifdef CONFIG_X86_PAE -int +static int is_io_mapping_possible(resource_size_t base, unsigned long size) { return 1; } #else -int +static int is_io_mapping_possible(resource_size_t base, unsigned long size) { /* There is no way to map greater than 1 << 32 address without PAE */ @@ -38,6 +38,44 @@ is_io_mapping_possible(resource_size_t base, unsigned long size) } #endif +int +reserve_io_memtype_wc(u64 base, unsigned long size, pgprot_t *prot) +{ + unsigned long ret_flag; + + if (!is_io_mapping_possible(base, size)) + goto out_err; + + if (!pat_enabled) { + *prot = pgprot_noncached(PAGE_KERNEL); + return 0; + } + + if (reserve_memtype(base, base + size, _PAGE_CACHE_WC, &ret_flag)) + goto out_err; + + if (ret_flag == _PAGE_CACHE_WB) + goto out_free; + + if (kernel_map_sync_memtype(base, size, ret_flag)) + goto out_free; + + *prot = __pgprot(__PAGE_KERNEL | ret_flag); + return 0; + +out_free: + free_memtype(base, base + size); +out_err: + return -EINVAL; +} + +void +free_io_memtype(u64 base, unsigned long size) +{ + if (pat_enabled) + free_memtype(base, base + size); +} + /* Map 'pfn' using fixed map 'type' and protections 'prot' */ void * diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index cbc2f0cd631..f1ed66c4378 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -49,8 +49,9 @@ static inline struct io_mapping * io_mapping_create_wc(resource_size_t base, unsigned long size) { struct io_mapping *iomap; + pgprot_t prot; - if (!is_io_mapping_possible(base, size)) + if (!reserve_io_memtype_wc(base, size, &prot)) return NULL; iomap = kmalloc(sizeof(*iomap), GFP_KERNEL); @@ -59,13 +60,14 @@ io_mapping_create_wc(resource_size_t base, unsigned long size) iomap->base = base; iomap->size = size; - iomap->prot = pgprot_writecombine(__pgprot(__PAGE_KERNEL)); + iomap->prot = prot; return iomap; } static inline void io_mapping_free(struct io_mapping *mapping) { + free_io_memtype(mapping->base, mapping->size); kfree(mapping); } -- cgit v1.2.3-70-g09d2 From 13093cb0e59053bf97910de3a24f07cdff71c62c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 26 Feb 2009 03:16:47 +0100 Subject: gpu/drm, x86, PAT: PAT support for io_mapping_*, export symbols for modules Impact: build fix ERROR: "reserve_io_memtype_wc" [drivers/gpu/drm/i915/i915.ko] undefined! ERROR: "free_io_memtype" [drivers/gpu/drm/i915/i915.ko] undefined! Signed-off-by: Ingo Molnar --- arch/x86/mm/iomap_32.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 94596f794b1..d5e28424622 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -68,6 +68,7 @@ out_free: out_err: return -EINVAL; } +EXPORT_SYMBOL_GPL(reserve_io_memtype_wc); void free_io_memtype(u64 base, unsigned long size) @@ -75,6 +76,7 @@ free_io_memtype(u64 base, unsigned long size) if (pat_enabled) free_memtype(base, base + size); } +EXPORT_SYMBOL_GPL(free_io_memtype); /* Map 'pfn' using fixed map 'type' and protections 'prot' */ -- cgit v1.2.3-70-g09d2 From fd862dde18c3e360f510780e1d1bf615866b11c2 Mon Sep 17 00:00:00 2001 From: "Gustavo F. Padovan" Date: Sun, 15 Feb 2009 21:48:54 -0300 Subject: x86, fixmap: define reserve_top_address for x86_64 Impact: new interface (not yet use) Define reserve_top_address for x86_64; only for later x86 integration. Signed-off-by: Gustavo F. Padovan Acked-by: Glauber Costa Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/fixmap_64.h | 2 ++ arch/x86/mm/pgtable.c | 18 ++++++++++++++++++ arch/x86/mm/pgtable_32.c | 16 ---------------- 3 files changed, 20 insertions(+), 16 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/fixmap_64.h b/arch/x86/include/asm/fixmap_64.h index d905de08675..b6c06dd55f1 100644 --- a/arch/x86/include/asm/fixmap_64.h +++ b/arch/x86/include/asm/fixmap_64.h @@ -68,6 +68,8 @@ enum fixed_addresses { __end_of_fixed_addresses }; +extern void reserve_top_address(unsigned long reserve); + #define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE) #define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 86f2ffc43c3..5b7c7c8464f 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -313,6 +313,24 @@ int ptep_clear_flush_young(struct vm_area_struct *vma, return young; } +/** + * reserve_top_address - reserves a hole in the top of kernel address space + * @reserve - size of hole to reserve + * + * Can be used to relocate the fixmap area and poke a hole in the top + * of kernel address space to make room for a hypervisor. + */ +void __init reserve_top_address(unsigned long reserve) +{ +#ifdef CONFIG_X86_32 + BUG_ON(fixmaps_set > 0); + printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", + (int)-reserve); + __FIXADDR_TOP = -reserve - PAGE_SIZE; + __VMALLOC_RESERVE += reserve; +#endif +} + int fixmaps_set; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 0951db9ee51..c3cf6e11576 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -97,22 +97,6 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) unsigned long __FIXADDR_TOP = 0xfffff000; EXPORT_SYMBOL(__FIXADDR_TOP); -/** - * reserve_top_address - reserves a hole in the top of kernel address space - * @reserve - size of hole to reserve - * - * Can be used to relocate the fixmap area and poke a hole in the top - * of kernel address space to make room for a hypervisor. - */ -void __init reserve_top_address(unsigned long reserve) -{ - BUG_ON(fixmaps_set > 0); - printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", - (int)-reserve); - __FIXADDR_TOP = -reserve - PAGE_SIZE; - __VMALLOC_RESERVE += reserve; -} - /* * vmalloc=size forces the vmalloc area to be exactly 'size' * bytes. This can be used to increase (or decrease) the -- cgit v1.2.3-70-g09d2 From f5c1aa1537be39d8b9bb5279b5881d81898fd3cd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 1 Mar 2009 12:32:08 +0100 Subject: Revert "gpu/drm, x86, PAT: PAT support for io_mapping_*" This reverts commit 17581ad812a9abb0182260374ef2e52d4a808a64. Sitsofe Wheeler reported that /dev/dri/card0 is MIA on his EeePC 900 and bisected it to this commit. Graphics card is an i915 in an EeePC 900: 00:02.0 VGA compatible controller [0300]: Intel Corporation Mobile 915GM/GMS/910GML Express Graphics Controller [8086:2592] (rev 04) ( Most likely the ioremap() of the driver failed and hence the card did not initialize. ) Reported-by: Sitsofe Wheeler Bisected-by: Sitsofe Wheeler Cc: Venkatesh Pallipadi Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/include/asm/iomap.h | 5 +---- arch/x86/mm/iomap_32.c | 44 ++------------------------------------------ include/linux/io-mapping.h | 6 ++---- 3 files changed, 5 insertions(+), 50 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index bd46495ff7d..86af26091d6 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -24,10 +24,7 @@ #include int -reserve_io_memtype_wc(u64 base, unsigned long size, pgprot_t *prot); - -void -free_io_memtype(u64 base, unsigned long size); +is_io_mapping_possible(resource_size_t base, unsigned long size); void * iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index d5e28424622..6c2b1af1692 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -21,13 +21,13 @@ #include #ifdef CONFIG_X86_PAE -static int +int is_io_mapping_possible(resource_size_t base, unsigned long size) { return 1; } #else -static int +int is_io_mapping_possible(resource_size_t base, unsigned long size) { /* There is no way to map greater than 1 << 32 address without PAE */ @@ -38,46 +38,6 @@ is_io_mapping_possible(resource_size_t base, unsigned long size) } #endif -int -reserve_io_memtype_wc(u64 base, unsigned long size, pgprot_t *prot) -{ - unsigned long ret_flag; - - if (!is_io_mapping_possible(base, size)) - goto out_err; - - if (!pat_enabled) { - *prot = pgprot_noncached(PAGE_KERNEL); - return 0; - } - - if (reserve_memtype(base, base + size, _PAGE_CACHE_WC, &ret_flag)) - goto out_err; - - if (ret_flag == _PAGE_CACHE_WB) - goto out_free; - - if (kernel_map_sync_memtype(base, size, ret_flag)) - goto out_free; - - *prot = __pgprot(__PAGE_KERNEL | ret_flag); - return 0; - -out_free: - free_memtype(base, base + size); -out_err: - return -EINVAL; -} -EXPORT_SYMBOL_GPL(reserve_io_memtype_wc); - -void -free_io_memtype(u64 base, unsigned long size) -{ - if (pat_enabled) - free_memtype(base, base + size); -} -EXPORT_SYMBOL_GPL(free_io_memtype); - /* Map 'pfn' using fixed map 'type' and protections 'prot' */ void * diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index f1ed66c4378..cbc2f0cd631 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -49,9 +49,8 @@ static inline struct io_mapping * io_mapping_create_wc(resource_size_t base, unsigned long size) { struct io_mapping *iomap; - pgprot_t prot; - if (!reserve_io_memtype_wc(base, size, &prot)) + if (!is_io_mapping_possible(base, size)) return NULL; iomap = kmalloc(sizeof(*iomap), GFP_KERNEL); @@ -60,14 +59,13 @@ io_mapping_create_wc(resource_size_t base, unsigned long size) iomap->base = base; iomap->size = size; - iomap->prot = prot; + iomap->prot = pgprot_writecombine(__pgprot(__PAGE_KERNEL)); return iomap; } static inline void io_mapping_free(struct io_mapping *mapping) { - free_io_memtype(mapping->base, mapping->size); kfree(mapping); } -- cgit v1.2.3-70-g09d2 From 2b688dfd0a93cf3b17c38feef693361da47b0606 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 3 Mar 2009 12:55:04 +0200 Subject: x86: move __VMALLOC_RESERVE to pgtable_32.c Impact: cleanup The __VMALLOC_RESERVE global variable is not used in init_32.c. Move that to pgtable_32.c to reduce the diff between init_32.c and init_64.c. Signed-off-by: Pekka Enberg LKML-Reference: <1236077704.2675.4.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 2 -- arch/x86/mm/pgtable_32.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 06708ee94aa..5b06a2f5dea 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -50,8 +50,6 @@ #include #include -unsigned int __VMALLOC_RESERVE = 128 << 20; - unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 0951db9ee51..e4032c886c3 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -20,6 +20,8 @@ #include #include +unsigned int __VMALLOC_RESERVE = 128 << 20; + /* * Associate a virtual page frame with a given physical page frame * and protection flags for that frame. -- cgit v1.2.3-70-g09d2 From fd578f9c0a0a7bf3e460e6f21cdc6f4018949e80 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 3 Mar 2009 12:55:05 +0200 Subject: x86: use roundup() instead of PAGE_ALIGN() in find_early_table_space() Impact: cleanup This patch changes find_early_table_space() to use roundup() for rounding up tables to page size to unify the common parts of the 32-bit and 64-bit implementations. Signed-off-by: Pekka Enberg LKML-Reference: <1236077705.2675.6.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 5b06a2f5dea..1dd6b6334dc 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -845,10 +845,10 @@ static void __init find_early_table_space(unsigned long end, int use_pse) unsigned long puds, pmds, ptes, tables, start; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - tables = PAGE_ALIGN(puds * sizeof(pud_t)); + tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); if (use_pse) { unsigned long extra; @@ -859,10 +859,10 @@ static void __init find_early_table_space(unsigned long end, int use_pse) } else ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; - tables += PAGE_ALIGN(ptes * sizeof(pte_t)); + tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); /* for fixmap */ - tables += PAGE_ALIGN(__end_of_fixed_addresses * sizeof(pte_t)); + tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); /* * RED-PEN putting page tables only on node 0 could -- cgit v1.2.3-70-g09d2 From 05f209e7b936a48e341d36831079116a06658ccc Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 3 Mar 2009 13:15:02 +0200 Subject: x86: add sanity checks to init_32.c Impact: unification This patch adds sanity checks that are already in init_64.c to init_32.c. Signed-off-by: Pekka Enberg LKML-Reference: <1236078902.2675.16.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 1dd6b6334dc..1570a822c18 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1214,18 +1214,21 @@ void mark_rodata_ro(void) void free_init_pages(char *what, unsigned long begin, unsigned long end) { -#ifdef CONFIG_DEBUG_PAGEALLOC + unsigned long addr = begin; + + if (addr >= end) + return; + /* * If debugging page accesses then do not free this memory but * mark them not present - any buggy init-section access will * create a kernel page fault: */ +#ifdef CONFIG_DEBUG_PAGEALLOC printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", begin, PAGE_ALIGN(end)); set_memory_np(begin, (end - begin) >> PAGE_SHIFT); #else - unsigned long addr; - /* * We just marked the kernel text read only above, now that * we are going to free part of that, we need to make that @@ -1233,14 +1236,16 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) */ set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); - for (addr = begin; addr < end; addr += PAGE_SIZE) { + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); + + for (; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); - memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); + memset((void *)(addr & ~(PAGE_SIZE-1)), + POISON_FREE_INITMEM, PAGE_SIZE); free_page(addr); totalram_pages++; } - printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); #endif } -- cgit v1.2.3-70-g09d2 From e087edd8c056292191bb989baf49f83ee509e624 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 3 Mar 2009 13:15:04 +0200 Subject: x86: make sure initmem is writable on 64-bit Impact: unification This patch ports commit 3c1df68b848b39270752ff8d4b956cc4a4dce0f6 ("x86: make sure initmem is writable") to the 64-bit version to unify implementations of free_init_pages(). Signed-off-by: Pekka Enberg Cc: Arjan van de Ven LKML-Reference: <1236078904.2675.17.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index e6d36b49025..03da9030d0e 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -962,6 +962,13 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) begin, PAGE_ALIGN(end)); set_memory_np(begin, (end - begin) >> PAGE_SHIFT); #else + /* + * We just marked the kernel text read only above, now that + * we are going to free part of that, we need to make that + * writeable first. + */ + set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); for (; addr < end; addr += PAGE_SIZE) { -- cgit v1.2.3-70-g09d2 From e5b2bb552706ca0e30795ee84caacbb37cec5705 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 3 Mar 2009 13:15:06 +0200 Subject: x86: unify free_init_pages() and free_initmem() Impact: unification This patch introduces a common arch/x86/mm/init.c and moves the identical free_init_pages() and free_initmem() functions to the file. Signed-off-by: Pekka Enberg LKML-Reference: <1236078906.2675.18.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/Makefile | 2 +- arch/x86/mm/init.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/mm/init_32.c | 44 -------------------------------------------- arch/x86/mm/init_64.c | 44 -------------------------------------------- 4 files changed, 50 insertions(+), 89 deletions(-) create mode 100644 arch/x86/mm/init.c (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 2b938a38491..08537747cb5 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,4 +1,4 @@ -obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ +obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ pat.o pgtable.o gup.o obj-$(CONFIG_SMP) += tlb.o diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c new file mode 100644 index 00000000000..ce6a722587d --- /dev/null +++ b/arch/x86/mm/init.c @@ -0,0 +1,49 @@ +#include +#include +#include +#include +#include + +void free_init_pages(char *what, unsigned long begin, unsigned long end) +{ + unsigned long addr = begin; + + if (addr >= end) + return; + + /* + * If debugging page accesses then do not free this memory but + * mark them not present - any buggy init-section access will + * create a kernel page fault: + */ +#ifdef CONFIG_DEBUG_PAGEALLOC + printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", + begin, PAGE_ALIGN(end)); + set_memory_np(begin, (end - begin) >> PAGE_SHIFT); +#else + /* + * We just marked the kernel text read only above, now that + * we are going to free part of that, we need to make that + * writeable first. + */ + set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); + + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); + + for (; addr < end; addr += PAGE_SIZE) { + ClearPageReserved(virt_to_page(addr)); + init_page_count(virt_to_page(addr)); + memset((void *)(addr & ~(PAGE_SIZE-1)), + POISON_FREE_INITMEM, PAGE_SIZE); + free_page(addr); + totalram_pages++; + } +#endif +} + +void free_initmem(void) +{ + free_init_pages("unused kernel memory", + (unsigned long)(&__init_begin), + (unsigned long)(&__init_end)); +} diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 1570a822c18..cd8d6732613 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1212,50 +1212,6 @@ void mark_rodata_ro(void) } #endif -void free_init_pages(char *what, unsigned long begin, unsigned long end) -{ - unsigned long addr = begin; - - if (addr >= end) - return; - - /* - * If debugging page accesses then do not free this memory but - * mark them not present - any buggy init-section access will - * create a kernel page fault: - */ -#ifdef CONFIG_DEBUG_PAGEALLOC - printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", - begin, PAGE_ALIGN(end)); - set_memory_np(begin, (end - begin) >> PAGE_SHIFT); -#else - /* - * We just marked the kernel text read only above, now that - * we are going to free part of that, we need to make that - * writeable first. - */ - set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); - - printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); - - for (; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)(addr & ~(PAGE_SIZE-1)), - POISON_FREE_INITMEM, PAGE_SIZE); - free_page(addr); - totalram_pages++; - } -#endif -} - -void free_initmem(void) -{ - free_init_pages("unused kernel memory", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); -} - #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 03da9030d0e..aae87456d93 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -945,50 +945,6 @@ void __init mem_init(void) initsize >> 10); } -void free_init_pages(char *what, unsigned long begin, unsigned long end) -{ - unsigned long addr = begin; - - if (addr >= end) - return; - - /* - * If debugging page accesses then do not free this memory but - * mark them not present - any buggy init-section access will - * create a kernel page fault: - */ -#ifdef CONFIG_DEBUG_PAGEALLOC - printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", - begin, PAGE_ALIGN(end)); - set_memory_np(begin, (end - begin) >> PAGE_SHIFT); -#else - /* - * We just marked the kernel text read only above, now that - * we are going to free part of that, we need to make that - * writeable first. - */ - set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); - - printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); - - for (; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)(addr & ~(PAGE_SIZE-1)), - POISON_FREE_INITMEM, PAGE_SIZE); - free_page(addr); - totalram_pages++; - } -#endif -} - -void free_initmem(void) -{ - free_init_pages("unused kernel memory", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); -} - #ifdef CONFIG_DEBUG_RODATA const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); -- cgit v1.2.3-70-g09d2 From 867c5b5292583b1e474cbbcb4c77f09bfca3903c Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 3 Mar 2009 14:10:12 +0200 Subject: x86: set_highmem_pages_init() cleanup Impact: cleanup This patch moves set_highmem_pages_init() to arch/x86/mm/highmem_32.c. The declaration of the function is kept in asm/numa_32.h because asm/highmem.h is included only if CONFIG_HIGHMEM is enabled so we can't put the empty static inline function there. Signed-off-by: Pekka Enberg LKML-Reference: <1236082212.2675.24.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/numa_32.h | 6 +++++- arch/x86/mm/highmem_32.c | 34 ++++++++++++++++++++++++++++++++++ arch/x86/mm/init_32.c | 12 ------------ arch/x86/mm/numa_32.c | 26 -------------------------- 4 files changed, 39 insertions(+), 39 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h index e9f5db79624..a37229011b5 100644 --- a/arch/x86/include/asm/numa_32.h +++ b/arch/x86/include/asm/numa_32.h @@ -4,8 +4,12 @@ extern int pxm_to_nid(int pxm); extern void numa_remove_cpu(int cpu); -#ifdef CONFIG_NUMA +#ifdef CONFIG_HIGHMEM extern void set_highmem_pages_init(void); +#else +static inline void set_highmem_pages_init(void) +{ +} #endif #endif /* _ASM_X86_NUMA_32_H */ diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index bcc079c282d..13a823cf564 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -1,5 +1,6 @@ #include #include +#include /* for totalram_pages */ void *kmap(struct page *page) { @@ -156,3 +157,36 @@ EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); EXPORT_SYMBOL(kmap_atomic); EXPORT_SYMBOL(kunmap_atomic); + +#ifdef CONFIG_NUMA +void __init set_highmem_pages_init(void) +{ + struct zone *zone; + int nid; + + for_each_zone(zone) { + unsigned long zone_start_pfn, zone_end_pfn; + + if (!is_highmem(zone)) + continue; + + zone_start_pfn = zone->zone_start_pfn; + zone_end_pfn = zone_start_pfn + zone->spanned_pages; + + nid = zone_to_nid(zone); + printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n", + zone->name, nid, zone_start_pfn, zone_end_pfn); + + add_highpages_with_active_regions(nid, zone_start_pfn, + zone_end_pfn); + } + totalram_pages += totalhigh_pages; +} +#else +static void __init set_highmem_pages_init(void) +{ + add_highpages_with_active_regions(0, highstart_pfn, highend_pfn); + + totalram_pages += totalhigh_pages; +} +#endif /* CONFIG_NUMA */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index cd8d6732613..0b087dcd2c1 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -467,22 +467,10 @@ void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, work_with_active_regions(nid, add_highpages_work_fn, &data); } -#ifndef CONFIG_NUMA -static void __init set_highmem_pages_init(void) -{ - add_highpages_with_active_regions(0, highstart_pfn, highend_pfn); - - totalram_pages += totalhigh_pages; -} -#endif /* !CONFIG_NUMA */ - #else static inline void permanent_kmaps_init(pgd_t *pgd_base) { } -static inline void set_highmem_pages_init(void) -{ -} #endif /* CONFIG_HIGHMEM */ void __init native_pagetable_setup_start(pgd_t *base) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index d1f7439d173..a04092a8acc 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -423,32 +423,6 @@ void __init initmem_init(unsigned long start_pfn, setup_bootmem_allocator(); } -void __init set_highmem_pages_init(void) -{ -#ifdef CONFIG_HIGHMEM - struct zone *zone; - int nid; - - for_each_zone(zone) { - unsigned long zone_start_pfn, zone_end_pfn; - - if (!is_highmem(zone)) - continue; - - zone_start_pfn = zone->zone_start_pfn; - zone_end_pfn = zone_start_pfn + zone->spanned_pages; - - nid = zone_to_nid(zone); - printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n", - zone->name, nid, zone_start_pfn, zone_end_pfn); - - add_highpages_with_active_regions(nid, zone_start_pfn, - zone_end_pfn); - } - totalram_pages += totalhigh_pages; -#endif -} - #ifdef CONFIG_MEMORY_HOTPLUG static int paddr_to_nid(u64 addr) { -- cgit v1.2.3-70-g09d2 From 03787ceed8f7bf06af29f3b213017d89f8e9423d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 3 Mar 2009 15:32:24 +0100 Subject: x86: set_highmem_pages_init() cleanup, fix !CONFIG_NUMA && CONFIG_HIGHMEM=y Impact: build fix arch/x86/mm/highmem_32.c:187: error: static declaration of 'set_highmem_pages_init' follows non-static declaration arch/x86/include/asm/numa_32.h:8: error: previous declaration of 'set_highmem_pages_init' was here Signed-off-by: Pekka Enberg LKML-Reference: <1236082212.2675.24.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/highmem_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 13a823cf564..00f127c80b0 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -183,7 +183,7 @@ void __init set_highmem_pages_init(void) totalram_pages += totalhigh_pages; } #else -static void __init set_highmem_pages_init(void) +void __init set_highmem_pages_init(void) { add_highpages_with_active_regions(0, highstart_pfn, highend_pfn); -- cgit v1.2.3-70-g09d2 From f254f3909efaf59ca2d0f408de2d044dace60706 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 3 Mar 2009 12:02:57 -0800 Subject: x86: un-__init fill_pud/pmd/pte They are used by __set_fixmap->set_pte_vaddr_pud, which can be used by arch_setup_additional_pages(), and so is used after init. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 11981fc8570..07f44d491df 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -168,7 +168,7 @@ static __ref void *spp_getpage(void) return ptr; } -static pud_t * __init fill_pud(pgd_t *pgd, unsigned long vaddr) +static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) { if (pgd_none(*pgd)) { pud_t *pud = (pud_t *)spp_getpage(); @@ -180,7 +180,7 @@ static pud_t * __init fill_pud(pgd_t *pgd, unsigned long vaddr) return pud_offset(pgd, vaddr); } -static pmd_t * __init fill_pmd(pud_t *pud, unsigned long vaddr) +static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) { if (pud_none(*pud)) { pmd_t *pmd = (pmd_t *) spp_getpage(); @@ -192,7 +192,7 @@ static pmd_t * __init fill_pmd(pud_t *pud, unsigned long vaddr) return pmd_offset(pud, vaddr); } -static pte_t * __init fill_pte(pmd_t *pmd, unsigned long vaddr) +static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) { if (pmd_none(*pmd)) { pte_t *pte = (pte_t *) spp_getpage(); -- cgit v1.2.3-70-g09d2