From 7c9f8861e6c9c839f913e49b98c3854daca18f27 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 22 Apr 2008 16:38:23 -0500 Subject: stackprotector: use canary at end of stack to indicate overruns at oops time (Updated with a common max-stack-used checker that knows about the canary, as suggested by Joe Perches) Use a canary at the end of the stack to clearly indicate at oops time whether the stack has ever overflowed. This is a very simple implementation with a couple of drawbacks: 1) a thread may legitimately use exactly up to the last word on the stack -- but the chances of doing this and then oopsing later seem slim 2) it's possible that the stack usage isn't dense enough that the canary location could get skipped over -- but the worst that happens is that we don't flag the overrun -- though this happens fairly often in my testing :( With the code in place, an intentionally-bloated stack oops might do: BUG: unable to handle kernel paging request at ffff8103f84cc680 IP: [] update_curr+0x9a/0xa8 PGD 8063 PUD 0 Thread overran stack or stack corrupted Oops: 0000 [1] SMP CPU 0 ... ... unless the stack overrun is so bad that it corrupts some other thread. Signed-off-by: Eric Sandeen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/fault.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fd7e1798c75..1f524df68b9 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -581,6 +582,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) unsigned long address; int write, si_code; int fault; + unsigned long *stackend; + #ifdef CONFIG_X86_64 unsigned long flags; #endif @@ -850,6 +853,10 @@ no_context: show_fault_oops(regs, error_code, address); + stackend = end_of_stack(tsk); + if (*stackend != STACK_END_MAGIC) + printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); + tsk->thread.cr2 = address; tsk->thread.trap_no = 14; tsk->thread.error_code = error_code; -- cgit v1.2.3-70-g09d2 From 92181f190b649f7ef2b79cbf5c00f26ccc66da2a Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 20 Jan 2009 04:24:26 +0100 Subject: x86: optimise x86's do_page_fault (C entry point for the page fault path) Impact: cleanup, restructure code to improve assembly gcc isn't _all_ that smart about spilling registers to stack or reusing stack slots, even with branch annotations. do_page_fault contained a lot of functionality, so split unlikely paths into their own functions, and mark them as noinline just to be sure. I consider this actually to be somewhat of a cleanup too: the main function now contains about half the number of lines so the normal path is easier to read, while the error cases are also nicely split away. Also, ensure the order of arguments to functions is always the same: regs, addr, error_code. This can reduce code size a tiny bit, and just looks neater too. And add a couple of branch annotations. Before: do_page_fault: subq $360, %rsp #, After: do_page_fault: subq $56, %rsp #, bloat-o-meter: add/remove: 8/0 grow/shrink: 0/1 up/down: 2222/-1680 (542) function old new delta __bad_area_nosemaphore - 506 +506 no_context - 474 +474 vmalloc_fault - 424 +424 spurious_fault - 358 +358 mm_fault_error - 272 +272 bad_area_access_error - 89 +89 bad_area - 89 +89 bad_area_nosemaphore - 10 +10 do_page_fault 2464 784 -1680 Yes, the total size increases by 542 bytes, due to the extra function calls. But these will very rarely be called (except for vmalloc_fault) in a normal workload. Importantly, do_page_fault is less than 1/3rd it's original size, and touches far less stack. Existing gotos and branch hints did move a lot of the infrequently used text out of the fastpath, but that's even further improved after this patch. Signed-off-by: Nick Piggin Acked-by: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 438 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 256 insertions(+), 182 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 90dfae511a4..033292dc9e2 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -91,8 +91,8 @@ static inline int notify_page_fault(struct pt_regs *regs) * * Opcode checker based on code by Richard Brunner */ -static int is_prefetch(struct pt_regs *regs, unsigned long addr, - unsigned long error_code) +static int is_prefetch(struct pt_regs *regs, unsigned long error_code, + unsigned long addr) { unsigned char *instr; int scan_more = 1; @@ -409,15 +409,15 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, } #ifdef CONFIG_X86_64 -static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, - unsigned long error_code) +static noinline void pgtable_bad(struct pt_regs *regs, + unsigned long error_code, unsigned long address) { unsigned long flags = oops_begin(); int sig = SIGKILL; - struct task_struct *tsk; + struct task_struct *tsk = current; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", - current->comm, address); + tsk->comm, address); dump_pagetable(address); tsk = current; tsk->thread.cr2 = address; @@ -429,6 +429,190 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, } #endif +static noinline void no_context(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + struct task_struct *tsk = current; +#ifdef CONFIG_X86_64 + unsigned long flags; + int sig; +#endif + + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs)) + return; + + /* + * X86_32 + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. + * + * X86_64 + * Hall of shame of CPU/BIOS bugs. + */ + if (is_prefetch(regs, error_code, address)) + return; + + if (is_errata93(regs, address)) + return; + + /* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ +#ifdef CONFIG_X86_32 + bust_spinlocks(1); +#else + flags = oops_begin(); +#endif + + show_fault_oops(regs, error_code, address); + + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + +#ifdef CONFIG_X86_32 + die("Oops", regs, error_code); + bust_spinlocks(0); + do_exit(SIGKILL); +#else + sig = SIGKILL; + if (__die("Oops", regs, error_code)) + sig = 0; + /* Executive summary in case the body of the oops scrolled away */ + printk(KERN_EMERG "CR2: %016lx\n", address); + oops_end(flags, regs, sig); +#endif +} + +static void __bad_area_nosemaphore(struct pt_regs *regs, + unsigned long error_code, unsigned long address, + int si_code) +{ + struct task_struct *tsk = current; + + /* User mode accesses just cause a SIGSEGV */ + if (error_code & PF_USER) { + /* + * It's possible to have interrupts off here. + */ + local_irq_enable(); + + /* + * Valid to do another page fault here because this one came + * from user space. + */ + if (is_prefetch(regs, error_code, address)) + return; + + if (is_errata100(regs, address)) + return; + + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + printk( + "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), address, + (void *) regs->ip, (void *) regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } + + tsk->thread.cr2 = address; + /* Kernel addresses are always protection faults */ + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGSEGV, si_code, address, tsk); + return; + } + + if (is_f00f_bug(regs, address)) + return; + + no_context(regs, error_code, address); +} + +static noinline void bad_area_nosemaphore(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); +} + +static void __bad_area(struct pt_regs *regs, + unsigned long error_code, unsigned long address, + int si_code) +{ + struct mm_struct *mm = current->mm; + + /* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ + up_read(&mm->mmap_sem); + + __bad_area_nosemaphore(regs, error_code, address, si_code); +} + +static noinline void bad_area(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + __bad_area(regs, error_code, address, SEGV_MAPERR); +} + +static noinline void bad_area_access_error(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + __bad_area(regs, error_code, address, SEGV_ACCERR); +} + +/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ +static void out_of_memory(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + /* + * We ran out of memory, call the OOM killer, and return the userspace + * (which will retry the fault, or kill us if we got oom-killed). + */ + up_read(¤t->mm->mmap_sem); + pagefault_out_of_memory(); +} + +static void do_sigbus(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + struct task_struct *tsk = current; + struct mm_struct *mm = tsk->mm; + + up_read(&mm->mmap_sem); + + /* Kernel mode? Handle exceptions or die */ + if (!(error_code & PF_USER)) + no_context(regs, error_code, address); +#ifdef CONFIG_X86_32 + /* User space => ok to do another page fault */ + if (is_prefetch(regs, error_code, address)) + return; +#endif + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); +} + +static noinline void mm_fault_error(struct pt_regs *regs, + unsigned long error_code, unsigned long address, unsigned int fault) +{ + if (fault & VM_FAULT_OOM) + out_of_memory(regs, error_code, address); + else if (fault & VM_FAULT_SIGBUS) + do_sigbus(regs, error_code, address); + else + BUG(); +} + static int spurious_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & PF_WRITE) && !pte_write(*pte)) @@ -448,8 +632,8 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. */ -static int spurious_fault(unsigned long address, - unsigned long error_code) +static noinline int spurious_fault(unsigned long error_code, + unsigned long address) { pgd_t *pgd; pud_t *pud; @@ -494,7 +678,7 @@ static int spurious_fault(unsigned long address, * * This assumes no large pages in there. */ -static int vmalloc_fault(unsigned long address) +static noinline int vmalloc_fault(unsigned long address) { #ifdef CONFIG_X86_32 unsigned long pgd_paddr; @@ -573,6 +757,25 @@ static int vmalloc_fault(unsigned long address) int show_unhandled_signals = 1; +static inline int access_error(unsigned long error_code, int write, + struct vm_area_struct *vma) +{ + if (write) { + /* write, present and write, not present */ + if (unlikely(!(vma->vm_flags & VM_WRITE))) + return 1; + } else if (unlikely(error_code & PF_PROT)) { + /* read, present */ + return 1; + } else { + /* read, not present */ + if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + return 1; + } + + return 0; +} + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -583,16 +786,12 @@ asmlinkage #endif void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) { + unsigned long address; struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct *vma; - unsigned long address; - int write, si_code; + int write; int fault; -#ifdef CONFIG_X86_64 - unsigned long flags; - int sig; -#endif tsk = current; mm = tsk->mm; @@ -601,9 +800,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) /* get the address */ address = read_cr2(); - si_code = SEGV_MAPERR; - - if (notify_page_fault(regs)) + if (unlikely(notify_page_fault(regs))) return; if (unlikely(kmmio_fault(regs, address))) return; @@ -631,17 +828,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) return; /* Can handle a stale RO->RW TLB */ - if (spurious_fault(address, error_code)) + if (spurious_fault(error_code, address)) return; /* * Don't take the mm semaphore here. If we fixup a prefetch * fault we could otherwise deadlock. */ - goto bad_area_nosemaphore; + bad_area_nosemaphore(regs, error_code, address); + return; } - /* * It's safe to allow irq's after cr2 has been saved and the * vmalloc fault has been handled. @@ -657,15 +854,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) #ifdef CONFIG_X86_64 if (unlikely(error_code & PF_RSVD)) - pgtable_bad(address, regs, error_code); + pgtable_bad(regs, error_code, address); #endif /* * If we're in an interrupt, have no user context or are running in an * atomic region then we must not take the fault. */ - if (unlikely(in_atomic() || !mm)) - goto bad_area_nosemaphore; + if (unlikely(in_atomic() || !mm)) { + bad_area_nosemaphore(regs, error_code, address); + return; + } /* * When running in the kernel we expect faults to occur only to @@ -683,20 +882,26 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) * source. If this is invalid we can skip the address space check, * thus avoiding the deadlock. */ - if (!down_read_trylock(&mm->mmap_sem)) { + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { if ((error_code & PF_USER) == 0 && - !search_exception_tables(regs->ip)) - goto bad_area_nosemaphore; + !search_exception_tables(regs->ip)) { + bad_area_nosemaphore(regs, error_code, address); + return; + } down_read(&mm->mmap_sem); } vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (vma->vm_start <= address) + if (unlikely(!vma)) { + bad_area(regs, error_code, address); + return; + } + if (likely(vma->vm_start <= address)) goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; + if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { + bad_area(regs, error_code, address); + return; + } if (error_code & PF_USER) { /* * Accessing the stack below %sp is always a bug. @@ -704,31 +909,25 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) * and pusha to work. ("enter $65535,$31" pushes * 32 pointers and then decrements %sp by 65535.) */ - if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) - goto bad_area; + if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { + bad_area(regs, error_code, address); + return; + } } - if (expand_stack(vma, address)) - goto bad_area; -/* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ + if (unlikely(expand_stack(vma, address))) { + bad_area(regs, error_code, address); + return; + } + + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ good_area: - si_code = SEGV_ACCERR; - write = 0; - switch (error_code & (PF_PROT|PF_WRITE)) { - default: /* 3: write, present */ - /* fall through */ - case PF_WRITE: /* write, not present */ - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - write++; - break; - case PF_PROT: /* read, present */ - goto bad_area; - case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto bad_area; + write = error_code & PF_WRITE; + if (unlikely(access_error(error_code, write, vma))) { + bad_area_access_error(regs, error_code, address); + return; } /* @@ -738,11 +937,8 @@ good_area: */ fault = handle_mm_fault(mm, vma, address, write); if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) - goto out_of_memory; - else if (fault & VM_FAULT_SIGBUS) - goto do_sigbus; - BUG(); + mm_fault_error(regs, error_code, address, fault); + return; } if (fault & VM_FAULT_MAJOR) tsk->maj_flt++; @@ -760,128 +956,6 @@ good_area: } #endif up_read(&mm->mmap_sem); - return; - -/* - * Something tried to access memory that isn't in our memory map.. - * Fix it, but check if it's kernel or user first.. - */ -bad_area: - up_read(&mm->mmap_sem); - -bad_area_nosemaphore: - /* User mode accesses just cause a SIGSEGV */ - if (error_code & PF_USER) { - /* - * It's possible to have interrupts off here. - */ - local_irq_enable(); - - /* - * Valid to do another page fault here because this one came - * from user space. - */ - if (is_prefetch(regs, address, error_code)) - return; - - if (is_errata100(regs, address)) - return; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk( - "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", - task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, task_pid_nr(tsk), address, - (void *) regs->ip, (void *) regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } - - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; - force_sig_info_fault(SIGSEGV, si_code, address, tsk); - return; - } - - if (is_f00f_bug(regs, address)) - return; - -no_context: - /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs)) - return; - - /* - * X86_32 - * Valid to do another page fault here, because if this fault - * had been triggered by is_prefetch fixup_exception would have - * handled it. - * - * X86_64 - * Hall of shame of CPU/BIOS bugs. - */ - if (is_prefetch(regs, address, error_code)) - return; - - if (is_errata93(regs, address)) - return; - -/* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ -#ifdef CONFIG_X86_32 - bust_spinlocks(1); -#else - flags = oops_begin(); -#endif - - show_fault_oops(regs, error_code, address); - - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - -#ifdef CONFIG_X86_32 - die("Oops", regs, error_code); - bust_spinlocks(0); - do_exit(SIGKILL); -#else - sig = SIGKILL; - if (__die("Oops", regs, error_code)) - sig = 0; - /* Executive summary in case the body of the oops scrolled away */ - printk(KERN_EMERG "CR2: %016lx\n", address); - oops_end(flags, regs, sig); -#endif - -out_of_memory: - /* - * We ran out of memory, call the OOM killer, and return the userspace - * (which will retry the fault, or kill us if we got oom-killed). - */ - up_read(&mm->mmap_sem); - pagefault_out_of_memory(); - return; - -do_sigbus: - up_read(&mm->mmap_sem); - - /* Kernel mode? Handle exceptions or die */ - if (!(error_code & PF_USER)) - goto no_context; -#ifdef CONFIG_X86_32 - /* User space => ok to do another page fault */ - if (is_prefetch(regs, address, error_code)) - return; -#endif - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; - force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); } DEFINE_SPINLOCK(pgd_lock); -- cgit v1.2.3-70-g09d2 From fb746d0e1365b7472ccc4c3d5b0672b34a092d0b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 21 Jan 2009 20:45:41 +0100 Subject: x86: optimise page fault entry, cleanup tsk is already assigned to current, drop the redundant second assignment. Signed-off-by: Johannes Weiner Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 033292dc9e2..8f4b859a04b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -419,7 +419,6 @@ static noinline void pgtable_bad(struct pt_regs *regs, printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", tsk->comm, address); dump_pagetable(address); - tsk = current; tsk->thread.cr2 = address; tsk->thread.trap_no = 14; tsk->thread.error_code = error_code; -- cgit v1.2.3-70-g09d2 From 010060741ad35eacb504414bc6fb9bb575b15f62 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 29 Jan 2009 16:02:12 +0100 Subject: x86: add might_sleep() to do_page_fault() Impact: widen debug checks VirtualBox calls do_page_fault() from an atomic context but runs into a might_sleep() way pas this point, cure that. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8f4b859a04b..eb4d7fe0593 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -888,6 +888,12 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) return; } down_read(&mm->mmap_sem); + } else { + /* + * The above down_read_trylock() might have succeeded in which + * case we'll have missed the might_sleep() from down_read(). + */ + might_sleep(); } vma = find_vma(mm, address); -- cgit v1.2.3-70-g09d2 From 0973a06cde8cc1522fbcf2baacb926f1ee3f4c79 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 4 Feb 2009 15:24:09 -0800 Subject: x86: mm: introduce helper function in fault.c Impact: cleanup Introduce helper function fault_in_kernel_address() to make editors happy. Signed-off-by: Hiroshi Shimamoto Signed-off-by: H. Peter Anvin --- arch/x86/mm/fault.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index eb4d7fe0593..8e9b0f1fd87 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -775,6 +775,15 @@ static inline int access_error(unsigned long error_code, int write, return 0; } +static int fault_in_kernel_space(unsigned long address) +{ +#ifdef CONFIG_X86_32 + return address >= TASK_SIZE; +#else /* !CONFIG_X86_32 */ + return address >= TASK_SIZE64; +#endif /* CONFIG_X86_32 */ +} + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -817,11 +826,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) * (error_code & 4) == 0, and that the fault was not a * protection error (error_code & 9) == 0. */ -#ifdef CONFIG_X86_32 - if (unlikely(address >= TASK_SIZE)) { -#else - if (unlikely(address >= TASK_SIZE64)) { -#endif + if (unlikely(fault_in_kernel_space(address))) { if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && vmalloc_fault(address) >= 0) return; -- cgit v1.2.3-70-g09d2 From 3c3e5694add02e665bbbd0fecfbbdcc0b903097a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Feb 2009 11:46:36 -0500 Subject: x86: check PMD in spurious_fault handler Impact: fix to prevent hard lockup on bad PMD permissions If the PMD does not have the correct permissions for a page access, but the PTE does, the spurious fault handler will mistake the fault as a lazy TLB transaction. This will result in an infinite loop of: fault -> spurious_fault check (pass) -> return to code -> fault This patch adds a check and a warn on if the PTE passes the permissions but the PMD does not. [ Updated: Ingo Molnar suggested using WARN_ONCE with some text ] Signed-off-by: Steven Rostedt --- arch/x86/mm/fault.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c76ef1d701c..278d645d108 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -455,6 +455,7 @@ static int spurious_fault(unsigned long address, pud_t *pud; pmd_t *pmd; pte_t *pte; + int ret; /* Reserved-bit violation or user access to kernel space? */ if (error_code & (PF_USER | PF_RSVD)) @@ -482,7 +483,17 @@ static int spurious_fault(unsigned long address, if (!pte_present(*pte)) return 0; - return spurious_fault_check(error_code, pte); + ret = spurious_fault_check(error_code, pte); + if (!ret) + return 0; + + /* + * Make sure we have permissions in PMD + * If not, then there's a bug in the page tables. + */ + ret = spurious_fault_check(error_code, (pte_t *) pmd); + WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); + return ret; } /* -- cgit v1.2.3-70-g09d2 From 2d4a71676f4d89418a0d53e60b89e8b804b390b2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 19:56:40 +0100 Subject: x86, mm: fault.c cleanup Impact: cleanup, no code changed Clean up various small details, which can be correctness checked automatically: - tidy up the include file section - eliminate unnecessary includes - introduce show_signal_msg() to clean up code flow - standardize the code flow - standardize comments and other style details - more cleanups, pointed out by checkpatch No code changed on either 32-bit nor 64-bit: arch/x86/mm/fault.o: text data bss dec hex filename 4632 32 24 4688 1250 fault.o.before 4632 32 24 4688 1250 fault.o.after the md5 changed due to a change in a single instruction: 2e8a8241e7f0d69706776a5a26c90bc0 fault.o.before.asm c5c3d36e725586eb74f0e10692f0193e fault.o.after.asm Because a __LINE__ reference in a WARN_ONCE() has changed. On 32-bit a few stack offsets changed - no code size difference nor any functionality difference. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 546 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 331 insertions(+), 215 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e4b9fc5001c..351d679bf97 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1,56 +1,59 @@ /* * Copyright (C) 1995 Linus Torvalds - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. + * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include /* For unblank_screen() */ +#include +#include #include #include -#include /* for max_low_pfn */ -#include -#include #include #include +#include +#include +#include +#include +#include +#include +#include #include +#include #include +#include +#include +#include +#include +#include +#include +#include + +#include -#include -#include -#include -#include -#include #include +#include +#include +#include #include -#include #include +#include /* - * Page fault error code bits - * bit 0 == 0 means no page found, 1 means protection fault - * bit 1 == 0 means read, 1 means write - * bit 2 == 0 means kernel, 1 means user-mode - * bit 3 == 1 means use of reserved bit detected - * bit 4 == 1 means fault was an instruction fetch + * Page fault error code bits: + * + * bit 0 == 0: no page found 1: protection fault + * bit 1 == 0: read access 1: write access + * bit 2 == 0: kernel-mode access 1: user-mode access + * bit 3 == 1: use of reserved bit detected + * bit 4 == 1: fault was an instruction fetch */ -#define PF_PROT (1<<0) -#define PF_WRITE (1<<1) -#define PF_USER (1<<2) -#define PF_RSVD (1<<3) -#define PF_INSTR (1<<4) +enum x86_pf_error_code { + + PF_PROT = 1 << 0, + PF_WRITE = 1 << 1, + PF_USER = 1 << 2, + PF_RSVD = 1 << 3, + PF_INSTR = 1 << 4, +}; static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { @@ -82,23 +85,27 @@ static inline int notify_page_fault(struct pt_regs *regs) } /* - * X86_32 - * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. - * Check that here and ignore it. + * Prefetch quirks: + * + * 32-bit mode: + * + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. + * Check that here and ignore it. * - * X86_64 - * Sometimes the CPU reports invalid exceptions on prefetch. - * Check that here and ignore it. + * 64-bit mode: * - * Opcode checker based on code by Richard Brunner + * Sometimes the CPU reports invalid exceptions on prefetch. + * Check that here and ignore it. + * + * Opcode checker based on code by Richard Brunner. */ -static int is_prefetch(struct pt_regs *regs, unsigned long error_code, - unsigned long addr) +static int +is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) { + unsigned char *max_instr; unsigned char *instr; int scan_more = 1; int prefetch = 0; - unsigned char *max_instr; /* * If it was a exec (instruction fetch) fault on NX page, then @@ -114,9 +121,9 @@ static int is_prefetch(struct pt_regs *regs, unsigned long error_code, return 0; while (scan_more && instr < max_instr) { - unsigned char opcode; unsigned char instr_hi; unsigned char instr_lo; + unsigned char opcode; if (probe_kernel_address(instr, opcode)) break; @@ -173,15 +180,17 @@ static int is_prefetch(struct pt_regs *regs, unsigned long error_code, return prefetch; } -static void force_sig_info_fault(int si_signo, int si_code, - unsigned long address, struct task_struct *tsk) +static void +force_sig_info_fault(int si_signo, int si_code, unsigned long address, + struct task_struct *tsk) { siginfo_t info; - info.si_signo = si_signo; - info.si_errno = 0; - info.si_code = si_code; - info.si_addr = (void __user *)address; + info.si_signo = si_signo; + info.si_errno = 0; + info.si_code = si_code; + info.si_addr = (void __user *)address; + force_sig_info(si_signo, &info, tsk); } @@ -189,6 +198,7 @@ static void force_sig_info_fault(int si_signo, int si_code, static int bad_address(void *p) { unsigned long dummy; + return probe_kernel_address((unsigned long *)p, dummy); } #endif @@ -200,13 +210,14 @@ static void dump_pagetable(unsigned long address) page = read_cr3(); page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; + #ifdef CONFIG_X86_PAE printk("*pdpt = %016Lx ", page); if ((page >> PAGE_SHIFT) < max_low_pfn && page & _PAGE_PRESENT) { page &= PAGE_MASK; page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) - & (PTRS_PER_PMD - 1)]; + & (PTRS_PER_PMD - 1)]; printk(KERN_CONT "*pde = %016Lx ", page); page &= ~_PAGE_NX; } @@ -218,14 +229,15 @@ static void dump_pagetable(unsigned long address) * We must not directly access the pte in the highpte * case if the page table is located in highmem. * And let's rather not kmap-atomic the pte, just in case - * it's allocated already. + * it's allocated already: */ if ((page >> PAGE_SHIFT) < max_low_pfn && (page & _PAGE_PRESENT) && !(page & _PAGE_PSE)) { + page &= PAGE_MASK; page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) - & (PTRS_PER_PTE - 1)]; + & (PTRS_PER_PTE - 1)]; printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); } @@ -239,26 +251,38 @@ static void dump_pagetable(unsigned long address) pgd = (pgd_t *)read_cr3(); pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); + pgd += pgd_index(address); - if (bad_address(pgd)) goto bad; + if (bad_address(pgd)) + goto bad; + printk("PGD %lx ", pgd_val(*pgd)); - if (!pgd_present(*pgd)) goto ret; + + if (!pgd_present(*pgd)) + goto out; pud = pud_offset(pgd, address); - if (bad_address(pud)) goto bad; + if (bad_address(pud)) + goto bad; + printk("PUD %lx ", pud_val(*pud)); if (!pud_present(*pud) || pud_large(*pud)) - goto ret; + goto out; pmd = pmd_offset(pud, address); - if (bad_address(pmd)) goto bad; + if (bad_address(pmd)) + goto bad; + printk("PMD %lx ", pmd_val(*pmd)); - if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; + if (!pmd_present(*pmd) || pmd_large(*pmd)) + goto out; pte = pte_offset_kernel(pmd, address); - if (bad_address(pte)) goto bad; + if (bad_address(pte)) + goto bad; + printk("PTE %lx", pte_val(*pte)); -ret: +out: printk("\n"); return; bad: @@ -285,7 +309,6 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) * and redundant with the set_pmd() on non-PAE. As would * set_pud. */ - pud = pud_offset(pgd, address); pud_k = pud_offset(pgd_k, address); if (!pud_present(*pud_k)) @@ -295,11 +318,14 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) pmd_k = pmd_offset(pud_k, address); if (!pmd_present(*pmd_k)) return NULL; + if (!pmd_present(*pmd)) { set_pmd(pmd, *pmd_k); arch_flush_lazy_mmu_mode(); - } else + } else { BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + } + return pmd_k; } #endif @@ -312,29 +338,37 @@ KERN_ERR "******* Please consider a BIOS update.\n" KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; #endif -/* Workaround for K8 erratum #93 & buggy BIOS. - BIOS SMM functions are required to use a specific workaround - to avoid corruption of the 64bit RIP register on C stepping K8. - A lot of BIOS that didn't get tested properly miss this. - The OS sees this as a page fault with the upper 32bits of RIP cleared. - Try to work around it here. - Note we only handle faults in kernel here. - Does nothing for X86_32 +/* + * Workaround for K8 erratum #93 & buggy BIOS. + * + * BIOS SMM functions are required to use a specific workaround + * to avoid corruption of the 64bit RIP register on C stepping K8. + * + * A lot of BIOS that didn't get tested properly miss this. + * + * The OS sees this as a page fault with the upper 32bits of RIP cleared. + * Try to work around it here. + * + * Note we only handle faults in kernel here. + * Does nothing on 32-bit. */ static int is_errata93(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 - static int warned; + static int once; + if (address != regs->ip) return 0; + if ((address >> 32) != 0) return 0; + address |= 0xffffffffUL << 32; if ((address >= (u64)_stext && address <= (u64)_etext) || (address >= MODULES_VADDR && address <= MODULES_END)) { - if (!warned) { + if (!once) { printk(errata93_warning); - warned = 1; + once = 1; } regs->ip = address; return 1; @@ -344,16 +378,17 @@ static int is_errata93(struct pt_regs *regs, unsigned long address) } /* - * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal - * addresses >4GB. We catch this in the page fault handler because these - * addresses are not reachable. Just detect this case and return. Any code + * Work around K8 erratum #100 K8 in compat mode occasionally jumps + * to illegal addresses >4GB. + * + * We catch this in the page fault handler because these addresses + * are not reachable. Just detect this case and return. Any code * segment in LDT is compatibility mode. */ static int is_errata100(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 - if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && - (address >> 32)) + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) return 1; #endif return 0; @@ -363,8 +398,9 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_F00F_BUG unsigned long nr; + /* - * Pentium F0 0F C7 C8 bug workaround. + * Pentium F0 0F C7 C8 bug workaround: */ if (boot_cpu_data.f00f_bug) { nr = (address - idt_descr.address) >> 3; @@ -378,8 +414,9 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) return 0; } -static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, - unsigned long address) +static void +show_fault_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { #ifdef CONFIG_X86_32 if (!oops_may_print()) @@ -389,12 +426,14 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, #ifdef CONFIG_X86_PAE if (error_code & PF_INSTR) { unsigned int level; + pte_t *pte = lookup_address(address, &level); - if (pte && pte_present(*pte) && !pte_exec(*pte)) + if (pte && pte_present(*pte) && !pte_exec(*pte)) { printk(KERN_CRIT "kernel tried to execute " "NX-protected page - exploit attempt? " "(uid: %d)\n", current_uid()); + } } #endif @@ -403,34 +442,45 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, printk(KERN_CONT "NULL pointer dereference"); else printk(KERN_CONT "paging request"); + printk(KERN_CONT " at %p\n", (void *) address); printk(KERN_ALERT "IP:"); printk_address(regs->ip, 1); + dump_pagetable(address); } #ifdef CONFIG_X86_64 -static noinline void pgtable_bad(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +pgtable_bad(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { - unsigned long flags = oops_begin(); - int sig = SIGKILL; - struct task_struct *tsk = current; + struct task_struct *tsk; + unsigned long flags; + int sig; + + flags = oops_begin(); + tsk = current; + sig = SIGKILL; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", tsk->comm, address); dump_pagetable(address); - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; + + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + if (__die("Bad pagetable", regs, error_code)) sig = 0; + oops_end(flags, regs, sig); } #endif -static noinline void no_context(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +no_context(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { struct task_struct *tsk = current; unsigned long *stackend; @@ -440,18 +490,20 @@ static noinline void no_context(struct pt_regs *regs, int sig; #endif - /* Are we prepared to handle this kernel fault? */ + /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) return; /* - * X86_32 - * Valid to do another page fault here, because if this fault - * had been triggered by is_prefetch fixup_exception would have - * handled it. + * 32-bit: + * + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. + * + * 64-bit: * - * X86_64 - * Hall of shame of CPU/BIOS bugs. + * Hall of shame of CPU/BIOS bugs. */ if (is_prefetch(regs, error_code, address)) return; @@ -461,7 +513,7 @@ static noinline void no_context(struct pt_regs *regs, /* * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. + * terminate things with extreme prejudice: */ #ifdef CONFIG_X86_32 bust_spinlocks(1); @@ -471,7 +523,7 @@ static noinline void no_context(struct pt_regs *regs, show_fault_oops(regs, error_code, address); - stackend = end_of_stack(tsk); + stackend = end_of_stack(tsk); if (*stackend != STACK_END_MAGIC) printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); @@ -487,28 +539,54 @@ static noinline void no_context(struct pt_regs *regs, sig = SIGKILL; if (__die("Oops", regs, error_code)) sig = 0; + /* Executive summary in case the body of the oops scrolled away */ printk(KERN_EMERG "CR2: %016lx\n", address); + oops_end(flags, regs, sig); #endif } -static void __bad_area_nosemaphore(struct pt_regs *regs, - unsigned long error_code, unsigned long address, - int si_code) +/* + * Print out info about fatal segfaults, if the show_unhandled_signals + * sysctl is set: + */ +static inline void +show_signal_msg(struct pt_regs *regs, unsigned long error_code, + unsigned long address, struct task_struct *tsk) +{ + if (!unhandled_signal(tsk, SIGSEGV)) + return; + + if (!printk_ratelimit()) + return; + + printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), address, + (void *)regs->ip, (void *)regs->sp, error_code); + + print_vma_addr(KERN_CONT " in ", regs->ip); + + printk(KERN_CONT "\n"); +} + +static void +__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int si_code) { struct task_struct *tsk = current; /* User mode accesses just cause a SIGSEGV */ if (error_code & PF_USER) { /* - * It's possible to have interrupts off here. + * It's possible to have interrupts off here: */ local_irq_enable(); /* * Valid to do another page fault here because this one came - * from user space. + * from user space: */ if (is_prefetch(regs, error_code, address)) return; @@ -516,22 +594,16 @@ static void __bad_area_nosemaphore(struct pt_regs *regs, if (is_errata100(regs, address)) return; - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk( - "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", - task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, task_pid_nr(tsk), address, - (void *) regs->ip, (void *) regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } + if (unlikely(show_unhandled_signals)) + show_signal_msg(regs, error_code, address, tsk); + + /* Kernel addresses are always protection faults: */ + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; force_sig_info_fault(SIGSEGV, si_code, address, tsk); + return; } @@ -541,15 +613,16 @@ static void __bad_area_nosemaphore(struct pt_regs *regs, no_context(regs, error_code, address); } -static noinline void bad_area_nosemaphore(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); } -static void __bad_area(struct pt_regs *regs, - unsigned long error_code, unsigned long address, - int si_code) +static void +__bad_area(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int si_code) { struct mm_struct *mm = current->mm; @@ -562,67 +635,77 @@ static void __bad_area(struct pt_regs *regs, __bad_area_nosemaphore(regs, error_code, address, si_code); } -static noinline void bad_area(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) { __bad_area(regs, error_code, address, SEGV_MAPERR); } -static noinline void bad_area_access_error(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +bad_area_access_error(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { __bad_area(regs, error_code, address, SEGV_ACCERR); } /* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ -static void out_of_memory(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static void +out_of_memory(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { /* * We ran out of memory, call the OOM killer, and return the userspace - * (which will retry the fault, or kill us if we got oom-killed). + * (which will retry the fault, or kill us if we got oom-killed): */ up_read(¤t->mm->mmap_sem); + pagefault_out_of_memory(); } -static void do_sigbus(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static void +do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; up_read(&mm->mmap_sem); - /* Kernel mode? Handle exceptions or die */ + /* Kernel mode? Handle exceptions or die: */ if (!(error_code & PF_USER)) no_context(regs, error_code, address); + #ifdef CONFIG_X86_32 - /* User space => ok to do another page fault */ + /* User space => ok to do another page fault: */ if (is_prefetch(regs, error_code, address)) return; #endif - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; + + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); } -static noinline void mm_fault_error(struct pt_regs *regs, - unsigned long error_code, unsigned long address, unsigned int fault) +static noinline void +mm_fault_error(struct pt_regs *regs, unsigned long error_code, + unsigned long address, unsigned int fault) { - if (fault & VM_FAULT_OOM) + if (fault & VM_FAULT_OOM) { out_of_memory(regs, error_code, address); - else if (fault & VM_FAULT_SIGBUS) - do_sigbus(regs, error_code, address); - else - BUG(); + } else { + if (fault & VM_FAULT_SIGBUS) + do_sigbus(regs, error_code, address); + else + BUG(); + } } static int spurious_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & PF_WRITE) && !pte_write(*pte)) return 0; + if ((error_code & PF_INSTR) && !pte_exec(*pte)) return 0; @@ -630,16 +713,19 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) } /* - * Handle a spurious fault caused by a stale TLB entry. This allows - * us to lazily refresh the TLB when increasing the permissions of a - * kernel page (RO -> RW or NX -> X). Doing it eagerly is very - * expensive since that implies doing a full cross-processor TLB - * flush, even if no stale TLB entries exist on other processors. + * Handle a spurious fault caused by a stale TLB entry. + * + * This allows us to lazily refresh the TLB when increasing the + * permissions of a kernel page (RO -> RW or NX -> X). Doing it + * eagerly is very expensive since that implies doing a full + * cross-processor TLB flush, even if no stale TLB entries exist + * on other processors. + * * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. */ -static noinline int spurious_fault(unsigned long error_code, - unsigned long address) +static noinline int +spurious_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; pud_t *pud; @@ -678,20 +764,23 @@ static noinline int spurious_fault(unsigned long error_code, return 0; /* - * Make sure we have permissions in PMD - * If not, then there's a bug in the page tables. + * Make sure we have permissions in PMD. + * If not, then there's a bug in the page tables: */ ret = spurious_fault_check(error_code, (pte_t *) pmd); WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); + return ret; } /* - * X86_32 - * Handle a fault on the vmalloc or module mapping area + * 32-bit: + * + * Handle a fault on the vmalloc or module mapping area * - * X86_64 - * Handle a fault on the vmalloc area + * 64-bit: + * + * Handle a fault on the vmalloc area * * This assumes no large pages in there. */ @@ -702,7 +791,7 @@ static noinline int vmalloc_fault(unsigned long address) pmd_t *pmd_k; pte_t *pte_k; - /* Make sure we are in vmalloc area */ + /* Make sure we are in vmalloc area: */ if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; @@ -717,9 +806,11 @@ static noinline int vmalloc_fault(unsigned long address) pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); if (!pmd_k) return -1; + pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) return -1; + return 0; #else pgd_t *pgd, *pgd_ref; @@ -727,69 +818,84 @@ static noinline int vmalloc_fault(unsigned long address) pmd_t *pmd, *pmd_ref; pte_t *pte, *pte_ref; - /* Make sure we are in vmalloc area */ + /* Make sure we are in vmalloc area: */ if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; - /* Copy kernel mappings over when needed. This can also - happen within a race in page table update. In the later - case just flush. */ - + /* + * Copy kernel mappings over when needed. This can also + * happen within a race in page table update. In the later + * case just flush: + */ pgd = pgd_offset(current->active_mm, address); pgd_ref = pgd_offset_k(address); if (pgd_none(*pgd_ref)) return -1; + if (pgd_none(*pgd)) set_pgd(pgd, *pgd_ref); else BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - /* Below here mismatches are bugs because these lower tables - are shared */ + /* + * Below here mismatches are bugs because these lower tables + * are shared: + */ pud = pud_offset(pgd, address); pud_ref = pud_offset(pgd_ref, address); if (pud_none(*pud_ref)) return -1; + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) BUG(); + pmd = pmd_offset(pud, address); pmd_ref = pmd_offset(pud_ref, address); if (pmd_none(*pmd_ref)) return -1; + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) BUG(); + pte_ref = pte_offset_kernel(pmd_ref, address); if (!pte_present(*pte_ref)) return -1; + pte = pte_offset_kernel(pmd, address); - /* Don't use pte_page here, because the mappings can point - outside mem_map, and the NUMA hash lookup cannot handle - that. */ + + /* + * Don't use pte_page here, because the mappings can point + * outside mem_map, and the NUMA hash lookup cannot handle + * that: + */ if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) BUG(); + return 0; #endif } int show_unhandled_signals = 1; -static inline int access_error(unsigned long error_code, int write, - struct vm_area_struct *vma) +static inline int +access_error(unsigned long error_code, int write, struct vm_area_struct *vma) { if (write) { - /* write, present and write, not present */ + /* write, present and write, not present: */ if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; - } else if (unlikely(error_code & PF_PROT)) { - /* read, present */ - return 1; - } else { - /* read, not present */ - if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) - return 1; + return 0; } + /* read, present: */ + if (unlikely(error_code & PF_PROT)) + return 1; + + /* read, not present: */ + if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + return 1; + return 0; } @@ -797,9 +903,9 @@ static int fault_in_kernel_space(unsigned long address) { #ifdef CONFIG_X86_32 return address >= TASK_SIZE; -#else /* !CONFIG_X86_32 */ +#else return address >= TASK_SIZE64; -#endif /* CONFIG_X86_32 */ +#endif } /* @@ -812,18 +918,19 @@ asmlinkage #endif void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) { - unsigned long address; + struct vm_area_struct *vma; struct task_struct *tsk; + unsigned long address; struct mm_struct *mm; - struct vm_area_struct *vma; int write; int fault; tsk = current; mm = tsk->mm; + prefetchw(&mm->mmap_sem); - /* get the address */ + /* Get the faulting address: */ address = read_cr2(); if (unlikely(kmmio_fault(regs, address))) @@ -847,22 +954,23 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) vmalloc_fault(address) >= 0) return; - /* Can handle a stale RO->RW TLB */ + /* Can handle a stale RO->RW TLB: */ if (spurious_fault(error_code, address)) return; - /* kprobes don't want to hook the spurious faults. */ + /* kprobes don't want to hook the spurious faults: */ if (notify_page_fault(regs)) return; /* * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock. + * fault we could otherwise deadlock: */ bad_area_nosemaphore(regs, error_code, address); + return; } - /* kprobes don't want to hook the spurious faults. */ + /* kprobes don't want to hook the spurious faults: */ if (unlikely(notify_page_fault(regs))) return; /* @@ -870,13 +978,15 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) * vmalloc fault has been handled. * * User-mode registers count as a user access even for any - * potential system fault or CPU buglet. + * potential system fault or CPU buglet: */ if (user_mode_vm(regs)) { local_irq_enable(); error_code |= PF_USER; - } else if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); + } else { + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); + } #ifdef CONFIG_X86_64 if (unlikely(error_code & PF_RSVD)) @@ -884,8 +994,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) #endif /* - * If we're in an interrupt, have no user context or are running in an - * atomic region then we must not take the fault. + * If we're in an interrupt, have no user context or are running + * in an atomic region then we must not take the fault: */ if (unlikely(in_atomic() || !mm)) { bad_area_nosemaphore(regs, error_code, address); @@ -894,19 +1004,19 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) /* * When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in the - * kernel and should generate an OOPS. Unfortunately, in the case of an - * erroneous fault occurring in a code path which already holds mmap_sem - * we will deadlock attempting to validate the fault against the - * address space. Luckily the kernel only validly references user - * space from well defined areas of code, which are listed in the - * exceptions table. + * addresses in user space. All other faults represent errors in + * the kernel and should generate an OOPS. Unfortunately, in the + * case of an erroneous fault occurring in a code path which already + * holds mmap_sem we will deadlock attempting to validate the fault + * against the address space. Luckily the kernel only validly + * references user space from well defined areas of code, which are + * listed in the exceptions table. * * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. + * the source reference check when there is a possibility of a + * deadlock. Attempt to lock the address space, if we cannot we then + * validate the source. If this is invalid we can skip the address + * space check, thus avoiding the deadlock: */ if (unlikely(!down_read_trylock(&mm->mmap_sem))) { if ((error_code & PF_USER) == 0 && @@ -917,8 +1027,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) down_read(&mm->mmap_sem); } else { /* - * The above down_read_trylock() might have succeeded in which - * case we'll have missed the might_sleep() from down_read(). + * The above down_read_trylock() might have succeeded in + * which case we'll have missed the might_sleep() from + * down_read(): */ might_sleep(); } @@ -938,7 +1049,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) /* * Accessing the stack below %sp is always a bug. * The large cushion allows instructions like enter - * and pusha to work. ("enter $65535,$31" pushes + * and pusha to work. ("enter $65535, $31" pushes * 32 pointers and then decrements %sp by 65535.) */ if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { @@ -957,6 +1068,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) */ good_area: write = error_code & PF_WRITE; + if (unlikely(access_error(error_code, write, vma))) { bad_area_access_error(regs, error_code, address); return; @@ -965,13 +1077,15 @@ good_area: /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo - * the fault. + * the fault: */ fault = handle_mm_fault(mm, vma, address, write); + if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); return; } + if (fault & VM_FAULT_MAJOR) tsk->maj_flt++; else @@ -1004,13 +1118,13 @@ void vmalloc_sync_all(void) for (address = VMALLOC_START & PMD_MASK; address >= TASK_SIZE && address < FIXADDR_TOP; address += PMD_SIZE) { + unsigned long flags; struct page *page; spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { - if (!vmalloc_sync_one(page_address(page), - address)) + if (!vmalloc_sync_one(page_address(page), address)) break; } spin_unlock_irqrestore(&pgd_lock, flags); @@ -1018,12 +1132,14 @@ void vmalloc_sync_all(void) #else /* CONFIG_X86_64 */ for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; address += PGDIR_SIZE) { + const pgd_t *pgd_ref = pgd_offset_k(address); unsigned long flags; struct page *page; if (pgd_none(*pgd_ref)) continue; + spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; -- cgit v1.2.3-70-g09d2 From 107a03678cac0dd6cf7095f81442a4fa477e4700 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 20:37:05 +0100 Subject: x86, mm: fault.c, refactor/simplify the is_prefetch() code Impact: no functionality changed Factor out the opcode checker into a helper inline. The code got a tiny bit smaller: text data bss dec hex filename 4632 32 24 4688 1250 fault.o.before 4618 32 24 4674 1242 fault.o.after And it got cleaner / easier to review as well. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 100 ++++++++++++++++++++++++++-------------------------- 1 file changed, 50 insertions(+), 50 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 351d679bf97..72973c7682b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -99,12 +99,58 @@ static inline int notify_page_fault(struct pt_regs *regs) * * Opcode checker based on code by Richard Brunner. */ +static inline int +check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, + unsigned char opcode, int *prefetch) +{ + unsigned char instr_hi = opcode & 0xf0; + unsigned char instr_lo = opcode & 0x0f; + + switch (instr_hi) { + case 0x20: + case 0x30: + /* + * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. + * In X86_64 long mode, the CPU will signal invalid + * opcode if some of these prefixes are present so + * X86_64 will never get here anyway + */ + return ((instr_lo & 7) == 0x6); +#ifdef CONFIG_X86_64 + case 0x40: + /* + * In AMD64 long mode 0x40..0x4F are valid REX prefixes + * Need to figure out under what instruction mode the + * instruction was issued. Could check the LDT for lm, + * but for now it's good enough to assume that long + * mode only uses well known segments or kernel. + */ + return (!user_mode(regs)) || (regs->cs == __USER_CS); +#endif + case 0x60: + /* 0x64 thru 0x67 are valid prefixes in all modes. */ + return (instr_lo & 0xC) == 0x4; + case 0xF0: + /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ + return !instr_lo || (instr_lo>>1) == 1; + case 0x00: + /* Prefetch instruction is 0x0F0D or 0x0F18 */ + if (probe_kernel_address(instr, opcode)) + return 0; + + *prefetch = (instr_lo == 0xF) && + (opcode == 0x0D || opcode == 0x18); + return 0; + default: + return 0; + } +} + static int is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) { unsigned char *max_instr; unsigned char *instr; - int scan_more = 1; int prefetch = 0; /* @@ -114,68 +160,22 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) if (error_code & PF_INSTR) return 0; - instr = (unsigned char *)convert_ip_to_linear(current, regs); + instr = (void *)convert_ip_to_linear(current, regs); max_instr = instr + 15; if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) return 0; - while (scan_more && instr < max_instr) { - unsigned char instr_hi; - unsigned char instr_lo; + while (instr < max_instr) { unsigned char opcode; if (probe_kernel_address(instr, opcode)) break; - instr_hi = opcode & 0xf0; - instr_lo = opcode & 0x0f; instr++; - switch (instr_hi) { - case 0x20: - case 0x30: - /* - * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. - * In X86_64 long mode, the CPU will signal invalid - * opcode if some of these prefixes are present so - * X86_64 will never get here anyway - */ - scan_more = ((instr_lo & 7) == 0x6); - break; -#ifdef CONFIG_X86_64 - case 0x40: - /* - * In AMD64 long mode 0x40..0x4F are valid REX prefixes - * Need to figure out under what instruction mode the - * instruction was issued. Could check the LDT for lm, - * but for now it's good enough to assume that long - * mode only uses well known segments or kernel. - */ - scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); - break; -#endif - case 0x60: - /* 0x64 thru 0x67 are valid prefixes in all modes. */ - scan_more = (instr_lo & 0xC) == 0x4; + if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) break; - case 0xF0: - /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ - scan_more = !instr_lo || (instr_lo>>1) == 1; - break; - case 0x00: - /* Prefetch instruction is 0x0F0D or 0x0F18 */ - scan_more = 0; - - if (probe_kernel_address(instr, opcode)) - break; - prefetch = (instr_lo == 0xF) && - (opcode == 0x0D || opcode == 0x18); - break; - default: - scan_more = 0; - break; - } } return prefetch; } -- cgit v1.2.3-70-g09d2 From 8c938f9fae887f6e180bf802aa1c33cf74712aff Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:12:18 +0100 Subject: x86, mm: fault.c, factor out the vm86 fault check Impact: cleanup Instead of an ugly, open-coded, #ifdef-ed vm86 related legacy check in do_page_fault(), put it into the check_v8086_mode() helper function and merge it with an existing #ifdef. Also, simplify the code flow a tiny bit in the helper. No code changed: arch/x86/mm/fault.o: text data bss dec hex filename 2711 12 12 2735 aaf fault.o.before 2711 12 12 2735 aaf fault.o.after Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 72973c7682b..7dc0615c3cf 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -328,14 +328,41 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) return pmd_k; } -#endif -#ifdef CONFIG_X86_64 +/* + * Did it hit the DOS screen memory VA from vm86 mode? + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ + unsigned long bit; + + if (!v8086_mode(regs)) + return; + + bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + tsk->thread.screen_bitmap |= 1 << bit; +} + +#else /* CONFIG_X86_64: */ + static const char errata93_warning[] = KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" KERN_ERR "******* Please consider a BIOS update.\n" KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; + +/* + * No vm86 mode in 64-bit mode: + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ +} + #endif /* @@ -1091,16 +1118,8 @@ good_area: else tsk->min_flt++; -#ifdef CONFIG_X86_32 - /* - * Did it hit the DOS screen memory VA from vm86 mode? - */ - if (v8086_mode(regs)) { - unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; - if (bit < 32) - tsk->thread.screen_bitmap |= 1 << bit; - } -#endif + check_v8086_mode(regs, address, tsk); + up_read(&mm->mmap_sem); } -- cgit v1.2.3-70-g09d2 From 121d5d0a7e5808fbcfda484efd7ba840ac93450f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:18:08 +0100 Subject: x86, mm: fault.c, enable PF_RSVD checks on 32-bit too Impact: improve page fault handling robustness The 'PF_RSVD' flag (bit 3) of the page-fault error_code is a relatively recent addition to x86 CPUs, so the 32-bit do_fault() implementation never had it. This flag gets set when the CPU detects nonzero values in any reserved bits of the page directory entries. Extend the existing 64-bit check for PF_RSVD in do_page_fault() to 32-bit too. If we detect such a fault then we print a more informative oops and the pagetables. This unifies the code some more, removes an ugly #ifdef and improves the 32-bit page fault code robustness a bit. It slightly increases the 32-bit kernel text size. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7dc0615c3cf..3e366146273 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -477,7 +477,6 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, dump_pagetable(address); } -#ifdef CONFIG_X86_64 static noinline void pgtable_bad(struct pt_regs *regs, unsigned long error_code, unsigned long address) @@ -503,7 +502,6 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code, oops_end(flags, regs, sig); } -#endif static noinline void no_context(struct pt_regs *regs, unsigned long error_code, @@ -1015,10 +1013,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) local_irq_enable(); } -#ifdef CONFIG_X86_64 if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); -#endif /* * If we're in an interrupt, have no user context or are running -- cgit v1.2.3-70-g09d2 From b814d41f0987c7648d7ed07471258101c95c026b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:32:10 +0100 Subject: x86, mm: fault.c, simplify kmmio_fault() Impact: cleanup Remove an #ifdef from kmmio_fault() - we can do this by providing default implementations for is_kmmio_active() and kmmio_handler(). The compiler optimizes it all away in the !CONFIG_MMIOTRACE case. Also, while at it, clean up mmiotrace.h a bit: - standard header guards - standard vertical spaces for structure definitions No code changed (both with mmiotrace on and off in the config): text data bss dec hex filename 2947 12 12 2971 b9b fault.o.before 2947 12 12 2971 b9b fault.o.after Cc: Pekka Paalanen Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 5 +-- include/linux/mmiotrace.h | 78 ++++++++++++++++++++++++++++------------------- 2 files changed, 50 insertions(+), 33 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 3e366146273..fe99af4b86d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -55,13 +55,14 @@ enum x86_pf_error_code { PF_INSTR = 1 << 4, }; +/* + * (returns 0 if mmiotrace is disabled) + */ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { -#ifdef CONFIG_MMIOTRACE if (unlikely(is_kmmio_active())) if (kmmio_handler(regs, addr) == 1) return -1; -#endif return 0; } diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 139d7c88d9c..3d1b7bde128 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -1,5 +1,5 @@ -#ifndef MMIOTRACE_H -#define MMIOTRACE_H +#ifndef _LINUX_MMIOTRACE_H +#define _LINUX_MMIOTRACE_H #include #include @@ -13,28 +13,34 @@ typedef void (*kmmio_post_handler_t)(struct kmmio_probe *, unsigned long condition, struct pt_regs *); struct kmmio_probe { - struct list_head list; /* kmmio internal list */ - unsigned long addr; /* start location of the probe point */ - unsigned long len; /* length of the probe region */ - kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */ - kmmio_post_handler_t post_handler; /* Called after addr is executed */ - void *private; + /* kmmio internal list: */ + struct list_head list; + /* start location of the probe point: */ + unsigned long addr; + /* length of the probe region: */ + unsigned long len; + /* Called before addr is executed: */ + kmmio_pre_handler_t pre_handler; + /* Called after addr is executed: */ + kmmio_post_handler_t post_handler; + void *private; }; +extern unsigned int kmmio_count; + +extern int register_kmmio_probe(struct kmmio_probe *p); +extern void unregister_kmmio_probe(struct kmmio_probe *p); + +#ifdef CONFIG_MMIOTRACE /* kmmio is active by some kmmio_probes? */ static inline int is_kmmio_active(void) { - extern unsigned int kmmio_count; return kmmio_count; } -extern int register_kmmio_probe(struct kmmio_probe *p); -extern void unregister_kmmio_probe(struct kmmio_probe *p); - /* Called from page fault handler. */ extern int kmmio_handler(struct pt_regs *regs, unsigned long addr); -#ifdef CONFIG_MMIOTRACE /* Called from ioremap.c */ extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size, void __iomem *addr); @@ -43,7 +49,17 @@ extern void mmiotrace_iounmap(volatile void __iomem *addr); /* For anyone to insert markers. Remember trailing newline. */ extern int mmiotrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); -#else +#else /* !CONFIG_MMIOTRACE: */ +static inline int is_kmmio_active(void) +{ + return 0; +} + +static inline int kmmio_handler(struct pt_regs *regs, unsigned long addr) +{ + return 0; +} + static inline void mmiotrace_ioremap(resource_size_t offset, unsigned long size, void __iomem *addr) { @@ -63,28 +79,28 @@ static inline int mmiotrace_printk(const char *fmt, ...) #endif /* CONFIG_MMIOTRACE */ enum mm_io_opcode { - MMIO_READ = 0x1, /* struct mmiotrace_rw */ - MMIO_WRITE = 0x2, /* struct mmiotrace_rw */ - MMIO_PROBE = 0x3, /* struct mmiotrace_map */ - MMIO_UNPROBE = 0x4, /* struct mmiotrace_map */ - MMIO_UNKNOWN_OP = 0x5, /* struct mmiotrace_rw */ + MMIO_READ = 0x1, /* struct mmiotrace_rw */ + MMIO_WRITE = 0x2, /* struct mmiotrace_rw */ + MMIO_PROBE = 0x3, /* struct mmiotrace_map */ + MMIO_UNPROBE = 0x4, /* struct mmiotrace_map */ + MMIO_UNKNOWN_OP = 0x5, /* struct mmiotrace_rw */ }; struct mmiotrace_rw { - resource_size_t phys; /* PCI address of register */ - unsigned long value; - unsigned long pc; /* optional program counter */ - int map_id; - unsigned char opcode; /* one of MMIO_{READ,WRITE,UNKNOWN_OP} */ - unsigned char width; /* size of register access in bytes */ + resource_size_t phys; /* PCI address of register */ + unsigned long value; + unsigned long pc; /* optional program counter */ + int map_id; + unsigned char opcode; /* one of MMIO_{READ,WRITE,UNKNOWN_OP} */ + unsigned char width; /* size of register access in bytes */ }; struct mmiotrace_map { - resource_size_t phys; /* base address in PCI space */ - unsigned long virt; /* base virtual address */ - unsigned long len; /* mapping size */ - int map_id; - unsigned char opcode; /* MMIO_PROBE or MMIO_UNPROBE */ + resource_size_t phys; /* base address in PCI space */ + unsigned long virt; /* base virtual address */ + unsigned long len; /* mapping size */ + int map_id; + unsigned char opcode; /* MMIO_PROBE or MMIO_UNPROBE */ }; /* in kernel/trace/trace_mmiotrace.c */ @@ -94,4 +110,4 @@ extern void mmio_trace_rw(struct mmiotrace_rw *rw); extern void mmio_trace_mapping(struct mmiotrace_map *map); extern int mmio_trace_printk(const char *fmt, va_list args); -#endif /* MMIOTRACE_H */ +#endif /* _LINUX_MMIOTRACE_H */ -- cgit v1.2.3-70-g09d2 From b18018126f422f5b706fd750373425e10e84b486 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:42:57 +0100 Subject: x86, mm, kprobes: fault.c, simplify notify_page_fault() Impact: cleanup Remove an #ifdef from notify_page_fault(). The function still compiles to nothing in the !CONFIG_KPROBES case. Introduce kprobes_built_in() and kprobe_fault_handler() helpers to allow this - they returns 0 if !CONFIG_KPROBES. No code changed: text data bss dec hex filename 4618 32 24 4674 1242 fault.o.before 4618 32 24 4674 1242 fault.o.after Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 6 +----- include/linux/kprobes.h | 22 +++++++++++++++++++--- 2 files changed, 20 insertions(+), 8 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fe99af4b86d..379beaec6ca 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -68,11 +68,10 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) static inline int notify_page_fault(struct pt_regs *regs) { -#ifdef CONFIG_KPROBES int ret = 0; /* kprobe_running() needs smp_processor_id() */ - if (!user_mode_vm(regs)) { + if (kprobes_built_in() && !user_mode_vm(regs)) { preempt_disable(); if (kprobe_running() && kprobe_fault_handler(regs, 14)) ret = 1; @@ -80,9 +79,6 @@ static inline int notify_page_fault(struct pt_regs *regs) } return ret; -#else - return 0; -#endif } /* diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 32851eef48f..2ec6cc14a11 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -182,6 +182,14 @@ struct kprobe_blackpoint { DECLARE_PER_CPU(struct kprobe *, current_kprobe); DECLARE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); +/* + * For #ifdef avoidance: + */ +static inline int kprobes_built_in(void) +{ + return 1; +} + #ifdef CONFIG_KRETPROBES extern void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs); @@ -271,8 +279,16 @@ void unregister_kretprobes(struct kretprobe **rps, int num); void kprobe_flush_task(struct task_struct *tk); void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); -#else /* CONFIG_KPROBES */ +#else /* !CONFIG_KPROBES: */ +static inline int kprobes_built_in(void) +{ + return 0; +} +static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + return 0; +} static inline struct kprobe *get_kprobe(void *addr) { return NULL; @@ -329,5 +345,5 @@ static inline void unregister_kretprobes(struct kretprobe **rps, int num) static inline void kprobe_flush_task(struct task_struct *tk) { } -#endif /* CONFIG_KPROBES */ -#endif /* _LINUX_KPROBES_H */ +#endif /* CONFIG_KPROBES */ +#endif /* _LINUX_KPROBES_H */ -- cgit v1.2.3-70-g09d2 From f2f13a8535174dbb813a0607a9d4737cfba98f6c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:50:24 +0100 Subject: x86, mm: fault.c, reorder functions Impact: cleanup Avoid a couple more #ifdefs by moving fundamentally non-unifiable functions into a single #ifdef 32-bit / #else / #endif block in fault.c: vmalloc*(), dump_pagetable(), check_vm8086_mode(). No code changed: text data bss dec hex filename 4618 32 24 4674 1242 fault.o.before 4618 32 24 4674 1242 fault.o.after Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 474 ++++++++++++++++++++++++++-------------------------- 1 file changed, 239 insertions(+), 235 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 379beaec6ca..4ce62fb80da 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -191,18 +191,124 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, force_sig_info(si_signo, &info, tsk); } -#ifdef CONFIG_X86_64 -static int bad_address(void *p) +DEFINE_SPINLOCK(pgd_lock); +LIST_HEAD(pgd_list); + +#ifdef CONFIG_X86_32 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { - unsigned long dummy; + unsigned index = pgd_index(address); + pgd_t *pgd_k; + pud_t *pud, *pud_k; + pmd_t *pmd, *pmd_k; - return probe_kernel_address((unsigned long *)p, dummy); + pgd += index; + pgd_k = init_mm.pgd + index; + + if (!pgd_present(*pgd_k)) + return NULL; + + /* + * set_pgd(pgd, *pgd_k); here would be useless on PAE + * and redundant with the set_pmd() on non-PAE. As would + * set_pud. + */ + pud = pud_offset(pgd, address); + pud_k = pud_offset(pgd_k, address); + if (!pud_present(*pud_k)) + return NULL; + + pmd = pmd_offset(pud, address); + pmd_k = pmd_offset(pud_k, address); + if (!pmd_present(*pmd_k)) + return NULL; + + if (!pmd_present(*pmd)) { + set_pmd(pmd, *pmd_k); + arch_flush_lazy_mmu_mode(); + } else { + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + } + + return pmd_k; +} + +void vmalloc_sync_all(void) +{ + unsigned long address; + + if (SHARED_KERNEL_PMD) + return; + + for (address = VMALLOC_START & PMD_MASK; + address >= TASK_SIZE && address < FIXADDR_TOP; + address += PMD_SIZE) { + + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + if (!vmalloc_sync_one(page_address(page), address)) + break; + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +} + +/* + * 32-bit: + * + * Handle a fault on the vmalloc or module mapping area + */ +static noinline int vmalloc_fault(unsigned long address) +{ + unsigned long pgd_paddr; + pmd_t *pmd_k; + pte_t *pte_k; + + /* Make sure we are in vmalloc area: */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "current" here. We might be inside + * an interrupt in the middle of a task switch.. + */ + pgd_paddr = read_cr3(); + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); + if (!pmd_k) + return -1; + + pte_k = pte_offset_kernel(pmd_k, address); + if (!pte_present(*pte_k)) + return -1; + + return 0; +} + +/* + * Did it hit the DOS screen memory VA from vm86 mode? + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ + unsigned long bit; + + if (!v8086_mode(regs)) + return; + + bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + tsk->thread.screen_bitmap |= 1 << bit; } -#endif static void dump_pagetable(unsigned long address) { -#ifdef CONFIG_X86_32 __typeof__(pte_val(__pte(0))) page; page = read_cr3(); @@ -239,7 +345,132 @@ static void dump_pagetable(unsigned long address) } printk("\n"); -#else /* CONFIG_X86_64 */ +} + +#else /* CONFIG_X86_64: */ + +void vmalloc_sync_all(void) +{ + unsigned long address; + + for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; + address += PGDIR_SIZE) { + + const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +} + +/* + * 64-bit: + * + * Handle a fault on the vmalloc area + * + * This assumes no large pages in there. + */ +static noinline int vmalloc_fault(unsigned long address) +{ + pgd_t *pgd, *pgd_ref; + pud_t *pud, *pud_ref; + pmd_t *pmd, *pmd_ref; + pte_t *pte, *pte_ref; + + /* Make sure we are in vmalloc area: */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + /* + * Copy kernel mappings over when needed. This can also + * happen within a race in page table update. In the later + * case just flush: + */ + pgd = pgd_offset(current->active_mm, address); + pgd_ref = pgd_offset_k(address); + if (pgd_none(*pgd_ref)) + return -1; + + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + + /* + * Below here mismatches are bugs because these lower tables + * are shared: + */ + + pud = pud_offset(pgd, address); + pud_ref = pud_offset(pgd_ref, address); + if (pud_none(*pud_ref)) + return -1; + + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) + BUG(); + + pmd = pmd_offset(pud, address); + pmd_ref = pmd_offset(pud_ref, address); + if (pmd_none(*pmd_ref)) + return -1; + + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) + BUG(); + + pte_ref = pte_offset_kernel(pmd_ref, address); + if (!pte_present(*pte_ref)) + return -1; + + pte = pte_offset_kernel(pmd, address); + + /* + * Don't use pte_page here, because the mappings can point + * outside mem_map, and the NUMA hash lookup cannot handle + * that: + */ + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) + BUG(); + + return 0; +} + +static const char errata93_warning[] = +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" +KERN_ERR "******* Please consider a BIOS update.\n" +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; + +/* + * No vm86 mode in 64-bit mode: + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ +} + +static int bad_address(void *p) +{ + unsigned long dummy; + + return probe_kernel_address((unsigned long *)p, dummy); +} + +static void dump_pagetable(unsigned long address) +{ pgd_t *pgd; pud_t *pud; pmd_t *pmd; @@ -284,83 +515,9 @@ out: return; bad: printk("BAD\n"); -#endif -} - -#ifdef CONFIG_X86_32 -static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) -{ - unsigned index = pgd_index(address); - pgd_t *pgd_k; - pud_t *pud, *pud_k; - pmd_t *pmd, *pmd_k; - - pgd += index; - pgd_k = init_mm.pgd + index; - - if (!pgd_present(*pgd_k)) - return NULL; - - /* - * set_pgd(pgd, *pgd_k); here would be useless on PAE - * and redundant with the set_pmd() on non-PAE. As would - * set_pud. - */ - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); - if (!pud_present(*pud_k)) - return NULL; - - pmd = pmd_offset(pud, address); - pmd_k = pmd_offset(pud_k, address); - if (!pmd_present(*pmd_k)) - return NULL; - - if (!pmd_present(*pmd)) { - set_pmd(pmd, *pmd_k); - arch_flush_lazy_mmu_mode(); - } else { - BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); - } - - return pmd_k; -} - -/* - * Did it hit the DOS screen memory VA from vm86 mode? - */ -static inline void -check_v8086_mode(struct pt_regs *regs, unsigned long address, - struct task_struct *tsk) -{ - unsigned long bit; - - if (!v8086_mode(regs)) - return; - - bit = (address - 0xA0000) >> PAGE_SHIFT; - if (bit < 32) - tsk->thread.screen_bitmap |= 1 << bit; -} - -#else /* CONFIG_X86_64: */ - -static const char errata93_warning[] = -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" -KERN_ERR "******* Please consider a BIOS update.\n" -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; - -/* - * No vm86 mode in 64-bit mode: - */ -static inline void -check_v8086_mode(struct pt_regs *regs, unsigned long address, - struct task_struct *tsk) -{ } -#endif +#endif /* CONFIG_X86_64 */ /* * Workaround for K8 erratum #93 & buggy BIOS. @@ -795,109 +952,6 @@ spurious_fault(unsigned long error_code, unsigned long address) return ret; } -/* - * 32-bit: - * - * Handle a fault on the vmalloc or module mapping area - * - * 64-bit: - * - * Handle a fault on the vmalloc area - * - * This assumes no large pages in there. - */ -static noinline int vmalloc_fault(unsigned long address) -{ -#ifdef CONFIG_X86_32 - unsigned long pgd_paddr; - pmd_t *pmd_k; - pte_t *pte_k; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "current" here. We might be inside - * an interrupt in the middle of a task switch.. - */ - pgd_paddr = read_cr3(); - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); - if (!pmd_k) - return -1; - - pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) - return -1; - - return 0; -#else - pgd_t *pgd, *pgd_ref; - pud_t *pud, *pud_ref; - pmd_t *pmd, *pmd_ref; - pte_t *pte, *pte_ref; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Copy kernel mappings over when needed. This can also - * happen within a race in page table update. In the later - * case just flush: - */ - pgd = pgd_offset(current->active_mm, address); - pgd_ref = pgd_offset_k(address); - if (pgd_none(*pgd_ref)) - return -1; - - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - - /* - * Below here mismatches are bugs because these lower tables - * are shared: - */ - - pud = pud_offset(pgd, address); - pud_ref = pud_offset(pgd_ref, address); - if (pud_none(*pud_ref)) - return -1; - - if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) - BUG(); - - pmd = pmd_offset(pud, address); - pmd_ref = pmd_offset(pud_ref, address); - if (pmd_none(*pmd_ref)) - return -1; - - if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) - BUG(); - - pte_ref = pte_offset_kernel(pmd_ref, address); - if (!pte_present(*pte_ref)) - return -1; - - pte = pte_offset_kernel(pmd, address); - - /* - * Don't use pte_page here, because the mappings can point - * outside mem_map, and the NUMA hash lookup cannot handle - * that: - */ - if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) - BUG(); - - return 0; -#endif -} - int show_unhandled_signals = 1; static inline int @@ -1115,53 +1169,3 @@ good_area: up_read(&mm->mmap_sem); } - -DEFINE_SPINLOCK(pgd_lock); -LIST_HEAD(pgd_list); - -void vmalloc_sync_all(void) -{ - unsigned long address; - -#ifdef CONFIG_X86_32 - if (SHARED_KERNEL_PMD) - return; - - for (address = VMALLOC_START & PMD_MASK; - address >= TASK_SIZE && address < FIXADDR_TOP; - address += PMD_SIZE) { - - unsigned long flags; - struct page *page; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - if (!vmalloc_sync_one(page_address(page), address)) - break; - } - spin_unlock_irqrestore(&pgd_lock, flags); - } -#else /* CONFIG_X86_64 */ - for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; - address += PGDIR_SIZE) { - - const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock_irqrestore(&pgd_lock, flags); - } -#endif -} -- cgit v1.2.3-70-g09d2 From 8f7661496cece8320137d5e26808825498fd2b26 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:00:29 +0100 Subject: x86, mm: fault.c, unify oops printing Impact: refine/extend page fault related oops printing on 64-bit - honor the pause_on_oops logic on 64-bit too - print out NX fault warnings on 64-bit as well - factor out the NX fault message to make it git-greppable and readable Note that this means that we do the PF_INSTR check on 32-bit non-PAE as well where it should not occur ... normally. Cannot hurt. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 4ce62fb80da..ebfaca3bbb1 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -595,28 +595,24 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) return 0; } +static const char nx_warning[] = KERN_CRIT +"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; + static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { -#ifdef CONFIG_X86_32 if (!oops_may_print()) return; -#endif -#ifdef CONFIG_X86_PAE if (error_code & PF_INSTR) { unsigned int level; pte_t *pte = lookup_address(address, &level); - if (pte && pte_present(*pte) && !pte_exec(*pte)) { - printk(KERN_CRIT "kernel tried to execute " - "NX-protected page - exploit attempt? " - "(uid: %d)\n", current_uid()); - } + if (pte && pte_present(*pte) && !pte_exec(*pte)) + printk(nx_warning, current_uid()); } -#endif printk(KERN_ALERT "BUG: unable to handle kernel "); if (address < PAGE_SIZE) -- cgit v1.2.3-70-g09d2 From 1cc99544dde9e48602979f16b9309fade6e93051 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:07:48 +0100 Subject: x86, mm: fault.c, unify oops handling Impact: add oops-recursion check to 32-bit Unify the oops state-machine, to the 64-bit version. It is slightly more careful in that it does a recursion check in oops_begin(), and is thus more likely to show the relevant oops. It also means that 32-bit will print one more line at the end of pagefault triggered oopses: printk(KERN_EMERG "CR2: %016lx\n", address); Which is generally good information to be seen in partial-dump digital-camera jpegs ;-) The downside is the somewhat more complex critical path. Both variants have been tested well meanwhile by kernel developers crashing their boxes so i dont think this is a practical worry. This removes 3 ugly #ifdefs from no_context() and makes the function a lot nicer read. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ebfaca3bbb1..8fe2dd254df 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -659,11 +659,8 @@ no_context(struct pt_regs *regs, unsigned long error_code, { struct task_struct *tsk = current; unsigned long *stackend; - -#ifdef CONFIG_X86_64 unsigned long flags; int sig; -#endif /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) @@ -690,11 +687,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice: */ -#ifdef CONFIG_X86_32 - bust_spinlocks(1); -#else flags = oops_begin(); -#endif show_fault_oops(regs, error_code, address); @@ -702,15 +695,10 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (*stackend != STACK_END_MAGIC) printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; -#ifdef CONFIG_X86_32 - die("Oops", regs, error_code); - bust_spinlocks(0); - do_exit(SIGKILL); -#else sig = SIGKILL; if (__die("Oops", regs, error_code)) sig = 0; @@ -719,7 +707,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, printk(KERN_EMERG "CR2: %016lx\n", address); oops_end(flags, regs, sig); -#endif } /* -- cgit v1.2.3-70-g09d2 From c3731c68668325abddee8665018c74c7156a57be Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:22:34 +0100 Subject: x86, mm: fault.c, remove #ifdef from do_page_fault() Impact: cleanup do_page_fault() has this ugly #ifdef in its prototype: #ifdef CONFIG_X86_64 asmlinkage #endif void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) Replace it with 'dotraplinkage' which maps to exactly the above construct: nothing on 32-bit and asmlinkage on 64-bit. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8fe2dd254df..9c2dc5d7953 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -972,10 +972,8 @@ static int fault_in_kernel_space(unsigned long address) * and the problem, and then passes it off to one of the appropriate * routines. */ -#ifdef CONFIG_X86_64 -asmlinkage -#endif -void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) +dotraplinkage void __kprobes +do_page_fault(struct pt_regs *regs, unsigned long error_code) { struct vm_area_struct *vma; struct task_struct *tsk; -- cgit v1.2.3-70-g09d2 From d951734654f76a370a89b4e531af9b765bd13541 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:32:28 +0100 Subject: x86, mm: rename TASK_SIZE64 => TASK_SIZE_MAX Impact: cleanup Rename TASK_SIZE64 to TASK_SIZE_MAX, and provide the define on 32-bit too. (mapped to TASK_SIZE) This allows 32-bit code to make use of the (former-) TASK_SIZE64 symbol as well, in a clean way. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 9 +++++---- arch/x86/kernel/ptrace.c | 2 +- arch/x86/mm/fault.c | 2 +- arch/x86/vdso/vma.c | 4 ++-- 4 files changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 72914d0315e..c7a98f73821 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -861,6 +861,7 @@ static inline void spin_lock_prefetch(const void *x) * User space process size: 3GB (default). */ #define TASK_SIZE PAGE_OFFSET +#define TASK_SIZE_MAX TASK_SIZE #define STACK_TOP TASK_SIZE #define STACK_TOP_MAX STACK_TOP @@ -920,7 +921,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); /* * User space process size. 47bits minus one guard page. */ -#define TASK_SIZE64 ((1UL << 47) - PAGE_SIZE) +#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) /* This decides where the kernel will search for a free chunk of vm * space during mmap's. @@ -929,12 +930,12 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); 0xc0000000 : 0xFFFFe000) #define TASK_SIZE (test_thread_flag(TIF_IA32) ? \ - IA32_PAGE_OFFSET : TASK_SIZE64) + IA32_PAGE_OFFSET : TASK_SIZE_MAX) #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \ - IA32_PAGE_OFFSET : TASK_SIZE64) + IA32_PAGE_OFFSET : TASK_SIZE_MAX) #define STACK_TOP TASK_SIZE -#define STACK_TOP_MAX TASK_SIZE64 +#define STACK_TOP_MAX TASK_SIZE_MAX #define INIT_THREAD { \ .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index d2f7cd5b2c8..fb2159a5c81 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -268,7 +268,7 @@ static unsigned long debugreg_addr_limit(struct task_struct *task) if (test_tsk_thread_flag(task, TIF_IA32)) return IA32_PAGE_OFFSET - 3; #endif - return TASK_SIZE64 - 7; + return TASK_SIZE_MAX - 7; } #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9c2dc5d7953..6fa9f175cba 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -963,7 +963,7 @@ static int fault_in_kernel_space(unsigned long address) #ifdef CONFIG_X86_32 return address >= TASK_SIZE; #else - return address >= TASK_SIZE64; + return address >= TASK_SIZE_MAX; #endif } diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 9c98cc6ba97..7133cdf9098 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -85,8 +85,8 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) unsigned long addr, end; unsigned offset; end = (start + PMD_SIZE - 1) & PMD_MASK; - if (end >= TASK_SIZE64) - end = TASK_SIZE64; + if (end >= TASK_SIZE_MAX) + end = TASK_SIZE_MAX; end -= len; /* This loses some more bits than a modulo, but is cheaper */ offset = get_random_int() & (PTRS_PER_PTE - 1); -- cgit v1.2.3-70-g09d2 From 7c178a26d3e753d2a4346d3e4b8aa549d387f698 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:27:29 +0100 Subject: x86, mm: fault.c, remove #ifdef from fault_in_kernel_space() Impact: cleanup Removal of an #ifdef in fault_in_kernel_space(), by making use of the new TASK_SIZE_MAX symbol which is now available on 32-bit too. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 6fa9f175cba..f195691ec26 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -960,11 +960,7 @@ access_error(unsigned long error_code, int write, struct vm_area_struct *vma) static int fault_in_kernel_space(unsigned long address) { -#ifdef CONFIG_X86_32 - return address >= TASK_SIZE; -#else return address >= TASK_SIZE_MAX; -#endif } /* -- cgit v1.2.3-70-g09d2 From cd1b68f08f6328737928e5b8ba8ef80394680ff0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:39:02 +0100 Subject: x86, mm: fault.c, give another attempt at prefetch handing before SIGBUS Impact: extend prefetch handling on 64-bit Currently there's an extra is_prefetch() check done in do_sigbus(), which we only do on 32 bits. This is a last-ditch check before we terminate a task, so it's worth giving prefetch instructions another chance - should none of our existing quirks have caught a prefetch instruction related spurious fault. The only risk is if a prefetch causes a real sigbus, in that case we'll not OOM but try another fault. But this code has been on 32-bit for a long time, so it should be fine in practice. So do this on 64-bit too - and thus remove one more #ifdef. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f195691ec26..413e835e4a8 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -836,11 +836,9 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) if (!(error_code & PF_USER)) no_context(regs, error_code, address); -#ifdef CONFIG_X86_32 - /* User space => ok to do another page fault: */ + /* User-space => ok to do another page fault: */ if (is_prefetch(regs, error_code, address)) return; -#endif tsk->thread.cr2 = address; tsk->thread.error_code = error_code; -- cgit v1.2.3-70-g09d2 From f8eeb2e6be367d79be3617f0a12646bced8b2fe6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:13:36 +0100 Subject: x86, mm: fault.c, update copyrights Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 413e835e4a8..9bb07d331c3 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1,6 +1,7 @@ /* * Copyright (C) 1995 Linus Torvalds * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. + * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ #include #include -- cgit v1.2.3-70-g09d2 From b319eed0aa0a6d710887350a3cb734c572aa64c4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 10:24:18 +0100 Subject: x86, mm: fault.c, simplify kmmio_fault(), cleanup Clarify the kmmio_fault() comment. Acked-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9bb07d331c3..a03b7279efa 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -57,7 +57,8 @@ enum x86_pf_error_code { }; /* - * (returns 0 if mmiotrace is disabled) + * Returns 0 if mmiotrace is disabled, or if the fault is not + * handled by mmiotrace: */ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { -- cgit v1.2.3-70-g09d2