From 4d7d8c82c181711d28c8336108330a9121f5ef07 Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <76306.1226@compuserve.com>
Date: Thu, 23 Mar 2006 02:59:30 -0800
Subject: [PATCH] i386: multi-column stack backtraces

Print stack backtraces in multiple columns, saving screen space.  Number of
columns is configurable and defaults to one so behavior is
backwards-compatible.

Also removes the brackets around addresses when printing more
that one entry per line so they print as:
    <address>
instead of:
    [<address>]
This helps multiple entries fit better on one line.

Original idea by Dave Jones, taken from x86_64.

Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/traps.c | 32 +++++++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

(limited to 'arch/i386/kernel/traps.c')
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index b814dbdcc91..ee61988f61b 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -112,12 +112,30 @@ static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
 		p < (void *)tinfo + THREAD_SIZE - 3;
 }
 
-static void print_addr_and_symbol(unsigned long addr, char *log_lvl)
+/*
+ * Print CONFIG_STACK_BACKTRACE_COLS address/symbol entries per line.
+ */
+static inline int print_addr_and_symbol(unsigned long addr, char *log_lvl,
+					int printed)
 {
-	printk(log_lvl);
+	if (!printed)
+		printk(log_lvl);
+
+#if CONFIG_STACK_BACKTRACE_COLS == 1
 	printk(" [<%08lx>] ", addr);
+#else
+	printk(" <%08lx> ", addr);
+#endif
 	print_symbol("%s", addr);
-	printk("\n");
+
+	printed = (printed + 1) % CONFIG_STACK_BACKTRACE_COLS;
+
+	if (printed)
+		printk("  ");
+	else
+		printk("\n");
+
+	return printed;
 }
 
 static inline unsigned long print_context_stack(struct thread_info *tinfo,
@@ -125,20 +143,24 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo,
 				char *log_lvl)
 {
 	unsigned long addr;
+	int printed = 0; /* nr of entries already printed on current line */
 
 #ifdef	CONFIG_FRAME_POINTER
 	while (valid_stack_ptr(tinfo, (void *)ebp)) {
 		addr = *(unsigned long *)(ebp + 4);
-		print_addr_and_symbol(addr, log_lvl);
+		printed = print_addr_and_symbol(addr, log_lvl, printed);
 		ebp = *(unsigned long *)ebp;
 	}
 #else
 	while (valid_stack_ptr(tinfo, stack)) {
 		addr = *stack++;
 		if (__kernel_text_address(addr))
-			print_addr_and_symbol(addr, log_lvl);
+			printed = print_addr_and_symbol(addr, log_lvl, printed);
 	}
 #endif
+	if (printed)
+		printk("\n");
+
 	return ebp;
 }
 
-- 
cgit v1.2.3-70-g09d2


From cc04ee9cc527e314e6906849ee931c33764f861e Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Thu, 23 Mar 2006 02:59:38 -0800
Subject: [PATCH] i386 traps: merge printk calls

Merge a few printk calls in i386 traps.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/traps.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'arch/i386/kernel/traps.c')

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index ee61988f61b..f20797b8da1 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -188,8 +188,7 @@ static void show_trace_log_lvl(struct task_struct *task,
 		stack = (unsigned long*)context->previous_esp;
 		if (!stack)
 			break;
-		printk(log_lvl);
-		printk(" =======================\n");
+		printk("%s =======================\n", log_lvl);
 	}
 }
 
@@ -218,14 +217,12 @@ static void show_stack_log_lvl(struct task_struct *task, unsigned long *esp,
 			break;
 		if (i && ((i % 8) == 0)) {
 			printk("\n");
-			printk(log_lvl);
-			printk("       ");
+			printk("%s       ", log_lvl);
 		}
 		printk("%08lx ", *stack++);
 	}
 	printk("\n");
-	printk(log_lvl);
-	printk("Call Trace:\n");
+	printk("%sCall Trace:\n", log_lvl);
 	show_trace_log_lvl(task, esp, log_lvl);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 101f12af16fb12f8da8100899a13ee1b1b576a0a Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 23 Mar 2006 02:59:45 -0800
Subject: [PATCH] i386: actively synchronize vmalloc area when registering
 certain callbacks

Registering a callback handler through register_die_notifier() is obviously
primarily intended for use by modules.  However, the way these currently
get called it is basically impossible for them to actually be used by
modules, as there is, on non-PAE configurationes, a good chance (the larger
the module, the better) for the system to crash as a result.

This is because the callback gets invoked

(a) in the page fault path before the top level page table propagation
    gets carried out (hence a fault to propagate the top level page table
    entry/entries mapping to module's code/data would nest infinitly) and

(b) in the NMI path, where nested faults must absolutely not happen,
    since otherwise the IRET from the nested fault re-enables NMIs,
    potentially resulting in nested NMI occurences.

Besides the modular aspect, similar problems would even arise for in-
kernel consumers of the API if they touched ioremap()ed or vmalloc()ed
memory inside their handlers.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/traps.c          |   3 +
 arch/i386/mm/fault.c              | 173 +++++++++++++++++++++++++-------------
 include/asm-i386/pgtable-2level.h |   2 +
 include/asm-i386/pgtable-3level.h |   2 +
 4 files changed, 123 insertions(+), 57 deletions(-)

(limited to 'arch/i386/kernel/traps.c')

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index f20797b8da1..d510de7e4f2 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -99,6 +99,8 @@ int register_die_notifier(struct notifier_block *nb)
 {
 	int err = 0;
 	unsigned long flags;
+
+	vmalloc_sync_all();
 	spin_lock_irqsave(&die_notifier_lock, flags);
 	err = notifier_chain_register(&i386die_chain, nb);
 	spin_unlock_irqrestore(&die_notifier_lock, flags);
@@ -713,6 +715,7 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code)
 
 void set_nmi_callback(nmi_callback_t callback)
 {
+	vmalloc_sync_all();
 	rcu_assign_pointer(nmi_callback, callback);
 }
 EXPORT_SYMBOL_GPL(set_nmi_callback);
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index cf572d9a3b6..bbb24af5d86 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -214,6 +214,68 @@ static noinline void force_sig_info_fault(int si_signo, int si_code,
 
 fastcall void do_invalid_op(struct pt_regs *, unsigned long);
 
+static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
+{
+	unsigned index = pgd_index(address);
+	pgd_t *pgd_k;
+	pud_t *pud, *pud_k;
+	pmd_t *pmd, *pmd_k;
+
+	pgd += index;
+	pgd_k = init_mm.pgd + index;
+
+	if (!pgd_present(*pgd_k))
+		return NULL;
+
+	/*
+	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
+	 * and redundant with the set_pmd() on non-PAE. As would
+	 * set_pud.
+	 */
+
+	pud = pud_offset(pgd, address);
+	pud_k = pud_offset(pgd_k, address);
+	if (!pud_present(*pud_k))
+		return NULL;
+
+	pmd = pmd_offset(pud, address);
+	pmd_k = pmd_offset(pud_k, address);
+	if (!pmd_present(*pmd_k))
+		return NULL;
+	if (!pmd_present(*pmd))
+		set_pmd(pmd, *pmd_k);
+	else
+		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
+	return pmd_k;
+}
+
+/*
+ * Handle a fault on the vmalloc or module mapping area
+ *
+ * This assumes no large pages in there.
+ */
+static inline int vmalloc_fault(unsigned long address)
+{
+	unsigned long pgd_paddr;
+	pmd_t *pmd_k;
+	pte_t *pte_k;
+	/*
+	 * Synchronize this task's top level page-table
+	 * with the 'reference' page table.
+	 *
+	 * Do _not_ use "current" here. We might be inside
+	 * an interrupt in the middle of a task switch..
+	 */
+	pgd_paddr = read_cr3();
+	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
+	if (!pmd_k)
+		return -1;
+	pte_k = pte_offset_kernel(pmd_k, address);
+	if (!pte_present(*pte_k))
+		return -1;
+	return 0;
+}
+
 /*
  * This routine handles page faults.  It determines the address,
  * and the problem, and then passes it off to one of the appropriate
@@ -223,6 +285,8 @@ fastcall void do_invalid_op(struct pt_regs *, unsigned long);
  *	bit 0 == 0 means no page found, 1 means protection fault
  *	bit 1 == 0 means read, 1 means write
  *	bit 2 == 0 means kernel, 1 means user-mode
+ *	bit 3 == 1 means use of reserved bit detected
+ *	bit 4 == 1 means fault was an instruction fetch
  */
 fastcall void __kprobes do_page_fault(struct pt_regs *regs,
 				      unsigned long error_code)
@@ -237,13 +301,6 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs,
 	/* get the address */
         address = read_cr2();
 
-	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
-					SIGSEGV) == NOTIFY_STOP)
-		return;
-	/* It's safe to allow irq's after cr2 has been saved */
-	if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
-		local_irq_enable();
-
 	tsk = current;
 
 	si_code = SEGV_MAPERR;
@@ -259,17 +316,29 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs,
 	 *
 	 * This verifies that the fault happens in kernel space
 	 * (error_code & 4) == 0, and that the fault was not a
-	 * protection error (error_code & 1) == 0.
+	 * protection error (error_code & 9) == 0.
 	 */
-	if (unlikely(address >= TASK_SIZE)) { 
-		if (!(error_code & 5))
-			goto vmalloc_fault;
-		/* 
+	if (unlikely(address >= TASK_SIZE)) {
+		if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
+			return;
+		if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
+						SIGSEGV) == NOTIFY_STOP)
+			return;
+		/*
 		 * Don't take the mm semaphore here. If we fixup a prefetch
 		 * fault we could otherwise deadlock.
 		 */
 		goto bad_area_nosemaphore;
-	} 
+	}
+
+	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
+					SIGSEGV) == NOTIFY_STOP)
+		return;
+
+	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
+	   fault has been handled. */
+	if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
+		local_irq_enable();
 
 	mm = tsk->mm;
 
@@ -510,51 +579,41 @@ do_sigbus:
 	tsk->thread.error_code = error_code;
 	tsk->thread.trap_no = 14;
 	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
-	return;
-
-vmalloc_fault:
-	{
-		/*
-		 * Synchronize this task's top level page-table
-		 * with the 'reference' page table.
-		 *
-		 * Do _not_ use "tsk" here. We might be inside
-		 * an interrupt in the middle of a task switch..
-		 */
-		int index = pgd_index(address);
-		unsigned long pgd_paddr;
-		pgd_t *pgd, *pgd_k;
-		pud_t *pud, *pud_k;
-		pmd_t *pmd, *pmd_k;
-		pte_t *pte_k;
-
-		pgd_paddr = read_cr3();
-		pgd = index + (pgd_t *)__va(pgd_paddr);
-		pgd_k = init_mm.pgd + index;
-
-		if (!pgd_present(*pgd_k))
-			goto no_context;
-
-		/*
-		 * set_pgd(pgd, *pgd_k); here would be useless on PAE
-		 * and redundant with the set_pmd() on non-PAE. As would
-		 * set_pud.
-		 */
+}
 
-		pud = pud_offset(pgd, address);
-		pud_k = pud_offset(pgd_k, address);
-		if (!pud_present(*pud_k))
-			goto no_context;
-		
-		pmd = pmd_offset(pud, address);
-		pmd_k = pmd_offset(pud_k, address);
-		if (!pmd_present(*pmd_k))
-			goto no_context;
-		set_pmd(pmd, *pmd_k);
+#ifndef CONFIG_X86_PAE
+void vmalloc_sync_all(void)
+{
+	/*
+	 * Note that races in the updates of insync and start aren't
+	 * problematic: insync can only get set bits added, and updates to
+	 * start are only improving performance (without affecting correctness
+	 * if undone).
+	 */
+	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
+	static unsigned long start = TASK_SIZE;
+	unsigned long address;
 
-		pte_k = pte_offset_kernel(pmd_k, address);
-		if (!pte_present(*pte_k))
-			goto no_context;
-		return;
+	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
+	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
+		if (!test_bit(pgd_index(address), insync)) {
+			unsigned long flags;
+			struct page *page;
+
+			spin_lock_irqsave(&pgd_lock, flags);
+			for (page = pgd_list; page; page =
+					(struct page *)page->index)
+				if (!vmalloc_sync_one(page_address(page),
+								address)) {
+					BUG_ON(page != pgd_list);
+					break;
+				}
+			spin_unlock_irqrestore(&pgd_lock, flags);
+			if (!page)
+				set_bit(pgd_index(address), insync);
+		}
+		if (address == start && test_bit(pgd_index(address), insync))
+			start = address + PGDIR_SIZE;
 	}
 }
+#endif
diff --git a/include/asm-i386/pgtable-2level.h b/include/asm-i386/pgtable-2level.h
index 74ef721b534..27bde973abc 100644
--- a/include/asm-i386/pgtable-2level.h
+++ b/include/asm-i386/pgtable-2level.h
@@ -61,4 +61,6 @@ static inline int pte_exec_kernel(pte_t pte)
 #define __pte_to_swp_entry(pte)		((swp_entry_t) { (pte).pte_low })
 #define __swp_entry_to_pte(x)		((pte_t) { (x).val })
 
+void vmalloc_sync_all(void);
+
 #endif /* _I386_PGTABLE_2LEVEL_H */
diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h
index f1a8b454920..36a5aa63cbb 100644
--- a/include/asm-i386/pgtable-3level.h
+++ b/include/asm-i386/pgtable-3level.h
@@ -152,4 +152,6 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
 
 #define __pmd_free_tlb(tlb, x)		do { } while (0)
 
+#define vmalloc_sync_all() ((void)0)
+
 #endif /* _I386_PGTABLE_3LEVEL_H */
-- 
cgit v1.2.3-70-g09d2


From db753bdfc24c31228996799d508ce3bf7cbe3b99 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 23 Mar 2006 02:59:46 -0800
Subject: [PATCH] i386: fix uses of user_mode() vs. user_mode_vm()

>commit 76381fee7e8feb4c22be636aa5d4765dbe4fbf9e
>Author: Vincent Hanquez <vincent.hanquez@cl.cam.ac.uk>
>Date:   Thu Jun 23 00:08:46 2005 -0700
>
>    [PATCH] xen: x86_64: use more usermode macro
>
>    Make use of the user_mode macro where it's possible.  This is useful for Xen
>    because it will need only to redefine only the macro to a hypervisor call.

I am of the opinion that the above changeset is incomplete, i.e.  it missed
converting some previous uses of user_mode to user_mode_vm.  While most of
them could be considered just cosmetical, at least the one in die_nmi
doesn't appear to be.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: Vincent Hanquez <vincent.hanquez@cl.cam.ac.uk>
Cc: Zachary Amsden <zach@vmware.com>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/crash.c                 | 2 +-
 arch/i386/kernel/process.c               | 2 +-
 arch/i386/kernel/traps.c                 | 4 ++--
 include/asm-i386/mach-default/do_timer.h | 2 +-
 include/asm-i386/mach-visws/do_timer.h   | 2 +-
 include/asm-i386/mach-voyager/do_timer.h | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/i386/kernel/traps.c')

diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
index d49dbe8dc96..e3c5fca0aa8 100644
--- a/arch/i386/kernel/crash.c
+++ b/arch/i386/kernel/crash.c
@@ -105,7 +105,7 @@ static int crash_nmi_callback(struct pt_regs *regs, int cpu)
 		return 1;
 	local_irq_disable();
 
-	if (!user_mode(regs)) {
+	if (!user_mode_vm(regs)) {
 		crash_fixup_ss_esp(&fixed_regs, regs);
 		regs = &fixed_regs;
 	}
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 0480454ebff..299e6167408 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -295,7 +295,7 @@ void show_regs(struct pt_regs * regs)
 	printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id());
 	print_symbol("EIP is at %s\n", regs->eip);
 
-	if (user_mode(regs))
+	if (user_mode_vm(regs))
 		printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
 	printk(" EFLAGS: %08lx    %s  (%s %.*s)\n",
 	       regs->eflags, print_tainted(), system_utsname.release,
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index d510de7e4f2..a807a2da581 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -254,7 +254,7 @@ void show_registers(struct pt_regs *regs)
 
 	esp = (unsigned long) (&regs->esp);
 	savesegment(ss, ss);
-	if (user_mode(regs)) {
+	if (user_mode_vm(regs)) {
 		in_kernel = 0;
 		esp = regs->esp;
 		ss = regs->xss & 0xffff;
@@ -644,7 +644,7 @@ void die_nmi (struct pt_regs *regs, const char *msg)
 	/* If we are in kernel we are probably nested up pretty bad
 	 * and might aswell get out now while we still can.
 	*/
-	if (!user_mode(regs)) {
+	if (!user_mode_vm(regs)) {
 		current->thread.trap_no = 2;
 		crash_kexec(regs);
 	}
diff --git a/include/asm-i386/mach-default/do_timer.h b/include/asm-i386/mach-default/do_timer.h
index 56211414fc9..6312c3e7981 100644
--- a/include/asm-i386/mach-default/do_timer.h
+++ b/include/asm-i386/mach-default/do_timer.h
@@ -18,7 +18,7 @@ static inline void do_timer_interrupt_hook(struct pt_regs *regs)
 {
 	do_timer(regs);
 #ifndef CONFIG_SMP
-	update_process_times(user_mode(regs));
+	update_process_times(user_mode_vm(regs));
 #endif
 /*
  * In the SMP case we use the local APIC timer interrupt to do the
diff --git a/include/asm-i386/mach-visws/do_timer.h b/include/asm-i386/mach-visws/do_timer.h
index 92d638fc8b1..95568e6ca91 100644
--- a/include/asm-i386/mach-visws/do_timer.h
+++ b/include/asm-i386/mach-visws/do_timer.h
@@ -11,7 +11,7 @@ static inline void do_timer_interrupt_hook(struct pt_regs *regs)
 
 	do_timer(regs);
 #ifndef CONFIG_SMP
-	update_process_times(user_mode(regs));
+	update_process_times(user_mode_vm(regs));
 #endif
 /*
  * In the SMP case we use the local APIC timer interrupt to do the
diff --git a/include/asm-i386/mach-voyager/do_timer.h b/include/asm-i386/mach-voyager/do_timer.h
index ae510e5d0d7..eaf51809898 100644
--- a/include/asm-i386/mach-voyager/do_timer.h
+++ b/include/asm-i386/mach-voyager/do_timer.h
@@ -5,7 +5,7 @@ static inline void do_timer_interrupt_hook(struct pt_regs *regs)
 {
 	do_timer(regs);
 #ifndef CONFIG_SMP
-	update_process_times(user_mode(regs));
+	update_process_times(user_mode_vm(regs));
 #endif
 
 	voyager_timer_interrupt(regs);
-- 
cgit v1.2.3-70-g09d2


From 75874d5cc8efef22457072e14103dc55f164e64e Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <76306.1226@compuserve.com>
Date: Thu, 23 Mar 2006 02:59:52 -0800
Subject: [PATCH] i386: fix dump_stack()

i386 has a small bug in the stack dump code where it prints an extra log
level code.  Remove that and fix the alignment of normal stack dump
printout.  Also remove some unnecessary printk() calls.

Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/traps.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'arch/i386/kernel/traps.c')

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index a807a2da581..1b7ad4115d8 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -217,19 +217,17 @@ static void show_stack_log_lvl(struct task_struct *task, unsigned long *esp,
 	for(i = 0; i < kstack_depth_to_print; i++) {
 		if (kstack_end(stack))
 			break;
-		if (i && ((i % 8) == 0)) {
-			printk("\n");
-			printk("%s       ", log_lvl);
-		}
+		if (i && ((i % 8) == 0))
+			printk("\n%s       ", log_lvl);
 		printk("%08lx ", *stack++);
 	}
-	printk("\n");
-	printk("%sCall Trace:\n", log_lvl);
+	printk("\n%sCall Trace:\n", log_lvl);
 	show_trace_log_lvl(task, esp, log_lvl);
 }
 
 void show_stack(struct task_struct *task, unsigned long *esp)
 {
+	printk("       ");
 	show_stack_log_lvl(task, esp, "");
 }
 
-- 
cgit v1.2.3-70-g09d2


From dd287796d608fcdc3fe5e8fdb5bf762a8f1bc32a Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 23 Mar 2006 03:00:57 -0800
Subject: [PATCH] pause_on_oops command line option

Attempt to fix the problem wherein people's oops reports scroll off the screen
due to repeated oopsing or to oopses on other CPUs.

If this happens the user can reboot with the `pause_on_oops=<seconds>' option.
It will allow the first oopsing CPU to print an oops record just a single
time.  Second oopsing attempts, or oopses on other CPUs will cause those CPUs
to enter a tight loop until the specified number of seconds have elapsed.

The patch implements the infrastructure generically in the expectation that
architectures other than x86 will find it useful.

Cc: Dave Jones <davej@codemonkey.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/kernel-parameters.txt |  5 ++
 arch/i386/kernel/traps.c            |  3 ++
 arch/i386/mm/fault.c                | 39 +++++++++------
 include/linux/kernel.h              |  3 ++
 kernel/panic.c                      | 97 ++++++++++++++++++++++++++++++++++++-
 5 files changed, 130 insertions(+), 17 deletions(-)

(limited to 'arch/i386/kernel/traps.c')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 880be3a30d8..7b7382d0f75 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1126,6 +1126,11 @@ running once the system is up.
 	pas16=		[HW,SCSI]
 			See header of drivers/scsi/pas16.c.
 
+	pause_on_oops=
+			Halt all CPUs after the first oops has been printed for
+			the specified number of seconds.  This is to be used if
+			your oopses keep scrolling off the screen.
+
 	pcbit=		[HW,ISDN]
 
 	pcd.		[PARIDE]
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 1b7ad4115d8..de5386b01d3 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -352,6 +352,8 @@ void die(const char * str, struct pt_regs * regs, long err)
 	static int die_counter;
 	unsigned long flags;
 
+	oops_enter();
+
 	if (die.lock_owner != raw_smp_processor_id()) {
 		console_verbose();
 		spin_lock_irqsave(&die.lock, flags);
@@ -404,6 +406,7 @@ void die(const char * str, struct pt_regs * regs, long err)
 		ssleep(5);
 		panic("Fatal exception");
 	}
+	oops_exit();
 	do_exit(SIGSEGV);
 }
 
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index 47a3b72ec7b..7f0fcf219a2 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -509,24 +509,31 @@ no_context:
 
 	bust_spinlocks(1);
 
-#ifdef CONFIG_X86_PAE
-	if (error_code & 16) {
-		pte_t *pte = lookup_address(address);
-
-		if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
-			printk(KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", current->uid);
+	if (oops_may_print()) {
+	#ifdef CONFIG_X86_PAE
+		if (error_code & 16) {
+			pte_t *pte = lookup_address(address);
+
+			if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
+				printk(KERN_CRIT "kernel tried to execute "
+					"NX-protected page - exploit attempt? "
+					"(uid: %d)\n", current->uid);
+		}
+	#endif
+		if (address < PAGE_SIZE)
+			printk(KERN_ALERT "BUG: unable to handle kernel NULL "
+					"pointer dereference");
+		else
+			printk(KERN_ALERT "BUG: unable to handle kernel paging"
+					" request");
+		printk(" at virtual address %08lx\n",address);
+		printk(KERN_ALERT " printing eip:\n");
+		printk("%08lx\n", regs->eip);
 	}
-#endif
-	if (address < PAGE_SIZE)
-		printk(KERN_ALERT "BUG: unable to handle kernel NULL pointer dereference");
-	else
-		printk(KERN_ALERT "BUG: unable to handle kernel paging request");
-	printk(" at virtual address %08lx\n",address);
-	printk(KERN_ALERT " printing eip:\n");
-	printk("%08lx\n", regs->eip);
 	page = read_cr3();
 	page = ((unsigned long *) __va(page))[address >> 22];
-	printk(KERN_ALERT "*pde = %08lx\n", page);
+	if (oops_may_print())
+		printk(KERN_ALERT "*pde = %08lx\n", page);
 	/*
 	 * We must not directly access the pte in the highpte
 	 * case, the page table might be allocated in highmem.
@@ -534,7 +541,7 @@ no_context:
 	 * it's allocated already.
 	 */
 #ifndef CONFIG_HIGHPTE
-	if (page & 1) {
+	if ((page & 1) && oops_may_print()) {
 		page &= PAGE_MASK;
 		address &= 0x003ff000;
 		page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 3b507bf05d0..bb6e7ddee2f 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -91,6 +91,9 @@ extern struct notifier_block *panic_notifier_list;
 extern long (*panic_blink)(long time);
 NORET_TYPE void panic(const char * fmt, ...)
 	__attribute__ ((NORET_AND format (printf, 1, 2)));
+extern void oops_enter(void);
+extern void oops_exit(void);
+extern int oops_may_print(void);
 fastcall NORET_TYPE void do_exit(long error_code)
 	ATTRIB_NORET;
 NORET_TYPE void complete_and_exit(struct completion *, long)
diff --git a/kernel/panic.c b/kernel/panic.c
index 126dc43f1c7..acd95adddb9 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -20,10 +20,13 @@
 #include <linux/nmi.h>
 #include <linux/kexec.h>
 
-int panic_timeout;
 int panic_on_oops;
 int tainted;
+static int pause_on_oops;
+static int pause_on_oops_flag;
+static DEFINE_SPINLOCK(pause_on_oops_lock);
 
+int panic_timeout;
 EXPORT_SYMBOL(panic_timeout);
 
 struct notifier_block *panic_notifier_list;
@@ -174,3 +177,95 @@ void add_taint(unsigned flag)
 	tainted |= flag;
 }
 EXPORT_SYMBOL(add_taint);
+
+static int __init pause_on_oops_setup(char *str)
+{
+	pause_on_oops = simple_strtoul(str, NULL, 0);
+	return 1;
+}
+__setup("pause_on_oops=", pause_on_oops_setup);
+
+static void spin_msec(int msecs)
+{
+	int i;
+
+	for (i = 0; i < msecs; i++) {
+		touch_nmi_watchdog();
+		mdelay(1);
+	}
+}
+
+/*
+ * It just happens that oops_enter() and oops_exit() are identically
+ * implemented...
+ */
+static void do_oops_enter_exit(void)
+{
+	unsigned long flags;
+	static int spin_counter;
+
+	if (!pause_on_oops)
+		return;
+
+	spin_lock_irqsave(&pause_on_oops_lock, flags);
+	if (pause_on_oops_flag == 0) {
+		/* This CPU may now print the oops message */
+		pause_on_oops_flag = 1;
+	} else {
+		/* We need to stall this CPU */
+		if (!spin_counter) {
+			/* This CPU gets to do the counting */
+			spin_counter = pause_on_oops;
+			do {
+				spin_unlock(&pause_on_oops_lock);
+				spin_msec(MSEC_PER_SEC);
+				spin_lock(&pause_on_oops_lock);
+			} while (--spin_counter);
+			pause_on_oops_flag = 0;
+		} else {
+			/* This CPU waits for a different one */
+			while (spin_counter) {
+				spin_unlock(&pause_on_oops_lock);
+				spin_msec(1);
+				spin_lock(&pause_on_oops_lock);
+			}
+		}
+	}
+	spin_unlock_irqrestore(&pause_on_oops_lock, flags);
+}
+
+/*
+ * Return true if the calling CPU is allowed to print oops-related info.  This
+ * is a bit racy..
+ */
+int oops_may_print(void)
+{
+	return pause_on_oops_flag == 0;
+}
+
+/*
+ * Called when the architecture enters its oops handler, before it prints
+ * anything.  If this is the first CPU to oops, and it's oopsing the first time
+ * then let it proceed.
+ *
+ * This is all enabled by the pause_on_oops kernel boot option.  We do all this
+ * to ensure that oopses don't scroll off the screen.  It has the side-effect
+ * of preventing later-oopsing CPUs from mucking up the display, too.
+ *
+ * It turns out that the CPU which is allowed to print ends up pausing for the
+ * right duration, whereas all the other CPUs pause for twice as long: once in
+ * oops_enter(), once in oops_exit().
+ */
+void oops_enter(void)
+{
+	do_oops_enter_exit();
+}
+
+/*
+ * Called when the architecture exits its oops handler, after printing
+ * everything.
+ */
+void oops_exit(void)
+{
+	do_oops_enter_exit();
+}
-- 
cgit v1.2.3-70-g09d2