Diffstat (limited to 'arch/x86_64/kernel')
25 files changed, 642 insertions, 225 deletions
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 819e84ec5b6..b5aaeafc1cd 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -10,6 +10,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \
 		setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \
 		pci-dma.o pci-nommu.o alternative.o
 
+obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-$(CONFIG_X86_MCE)		+= mce.o
 obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel.o
 obj-$(CONFIG_X86_MCE_AMD)	+= mce_amd.o
diff --git a/arch/x86_64/kernel/audit.c b/arch/x86_64/kernel/audit.c
index a067aa468a8..21f33387bef 100644
--- a/arch/x86_64/kernel/audit.c
+++ b/arch/x86_64/kernel/audit.c
@@ -8,19 +8,54 @@ static unsigned dir_class[] = {
 ~0U
 };
 
+static unsigned read_class[] = {
+#include <asm-generic/audit_read.h>
+~0U
+};
+
+static unsigned write_class[] = {
+#include <asm-generic/audit_write.h>
+~0U
+};
+
 static unsigned chattr_class[] = {
 #include <asm-generic/audit_change_attr.h>
 ~0U
 };
 
+int audit_classify_syscall(int abi, unsigned syscall)
+{
+#ifdef CONFIG_IA32_EMULATION
+	extern int ia32_classify_syscall(unsigned);
+	if (abi == AUDIT_ARCH_I386)
+		return ia32_classify_syscall(syscall);
+#endif
+	switch(syscall) {
+	case __NR_open:
+		return 2;
+	case __NR_openat:
+		return 3;
+	case __NR_execve:
+		return 5;
+	default:
+		return 0;
+	}
+}
+
 static int __init audit_classes_init(void)
 {
 #ifdef CONFIG_IA32_EMULATION
 	extern __u32 ia32_dir_class[];
+	extern __u32 ia32_write_class[];
+	extern __u32 ia32_read_class[];
 	extern __u32 ia32_chattr_class[];
+	audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class);
+	audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class);
 	audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class);
 	audit_register_class(AUDIT_CLASS_CHATTR_32, ia32_chattr_class);
 #endif
+	audit_register_class(AUDIT_CLASS_WRITE, write_class);
+	audit_register_class(AUDIT_CLASS_READ, read_class);
 	audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class);
 	audit_register_class(AUDIT_CLASS_CHATTR, chattr_class);
 	return 0;
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index b8eee4c7888..764bf23c710 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -17,6 +17,7 @@
 #include <linux/kexec.h>
 #include <linux/module.h>
 
+#include <asm/pgtable.h>
 #include <asm/page.h>
 #include <asm/e820.h>
 #include <asm/proto.h>
@@ -70,7 +71,11 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
 #endif
 	/* kernel code + 640k memory hole (later should not be needed, but be paranoid for now) */
-	if (last >= 640*1024 && addr < __pa_symbol(&_end)) {
+	if (last >= 640*1024 && addr < 1024*1024) {
+		*addrp = 1024*1024;
+		return 1;
+	}
+	if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) {
 		*addrp = __pa_symbol(&_end);
 		return 1;
 	}
@@ -103,35 +108,6 @@ e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
 	return 0;
 }
 
-/*
- * This function checks if the entire range <start,end> is mapped with type.
- *
- * Note: this function only works correct if the e820 table is sorted and
- * not-overlapping, which is the case
- */
-int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
-{
-	int i;
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		if (type && ei->type != type)
-			continue;
-		/* is the region (part) in overlap with the current region ?*/
-		if (ei->addr >= end || ei->addr + ei->size <= start)
-			continue;
-
-		/* if the region is at the beginning of <start,end> we move
-		 * start to the end of the region since it's ok until there
-		 */
-		if (ei->addr <= start)
-			start = ei->addr + ei->size;
-		/* if start is now at or beyond end, we're done, full coverage */
-		if (start >= end)
-			return 1; /* we're done */
-	}
-	return 0;
-}
-
 /*
  * Find a free area in a specific range.
  */
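
The reworked bad_addr() above splits the old single check into two explicit exclusions: the 640k-1M hole and the kernel image between _text and _end. The underlying pattern is a plain interval-overlap test; a minimal C sketch, not from the patch (avoid_range() is a hypothetical helper):

	/* Does the candidate [addr, addr+size) touch the reserved range
	 * [res_start, res_end)?  If so, bump the candidate past it and
	 * report the collision, in the spirit of the two new if-blocks. */
	static int avoid_range(unsigned long *addrp, unsigned long size,
			       unsigned long res_start, unsigned long res_end)
	{
		unsigned long addr = *addrp, last = addr + size;

		if (last >= res_start && addr < res_end) {
			*addrp = res_end;
			return 1;
		}
		return 0;
	}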
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
index b93ef5b5198..140051e07fa 100644
--- a/arch/x86_64/kernel/early_printk.c
+++ b/arch/x86_64/kernel/early_printk.c
@@ -2,7 +2,7 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/string.h>
-#include <linux/tty.h>
+#include <linux/screen_info.h>
 #include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/fcntl.h>
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index ed92c298360..aa8d8939abc 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -41,13 +41,24 @@
 #include <asm/thread_info.h>
 #include <asm/hw_irq.h>
 #include <asm/page.h>
+#include <asm/irqflags.h>
 
 	.code64
 
 #ifndef CONFIG_PREEMPT
 #define retint_kernel retint_restore_args
 #endif
-
+
+
+.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
+#ifdef CONFIG_TRACE_IRQFLAGS
+	bt	$9,EFLAGS-\offset(%rsp)	/* interrupts off? */
+	jnc	1f
+	TRACE_IRQS_ON
+1:
+#endif
+.endm
+
 /*
  * C code is not supposed to know about undefined top of stack. Every time
  * a C function with an pt_regs argument is called from the SYSCALL based
@@ -194,6 +205,10 @@ ENTRY(system_call)
 	swapgs
 	movq	%rsp,%gs:pda_oldrsp
 	movq	%gs:pda_kernelstack,%rsp
+	/*
+	 * No need to follow this irqs off/on section - it's straight
+	 * and short:
+	 */
 	sti
 	SAVE_ARGS 8,1
 	movq	%rax,ORIG_RAX-ARGOFFSET(%rsp)
@@ -219,10 +234,15 @@ ret_from_sys_call:
 sysret_check:
 	GET_THREAD_INFO(%rcx)
 	cli
+	TRACE_IRQS_OFF
 	movl	threadinfo_flags(%rcx),%edx
 	andl	%edi,%edx
 	CFI_REMEMBER_STATE
 	jnz	sysret_careful
+	/*
+	 * sysretq will re-enable interrupts:
+	 */
+	TRACE_IRQS_ON
 	movq	RIP-ARGOFFSET(%rsp),%rcx
 	CFI_REGISTER	rip,rcx
 	RESTORE_ARGS 0,-ARG_SKIP,1
@@ -237,6 +257,7 @@ sysret_careful:
 	CFI_RESTORE_STATE
 	bt	$TIF_NEED_RESCHED,%edx
 	jnc	sysret_signal
+	TRACE_IRQS_ON
 	sti
 	pushq	%rdi
 	CFI_ADJUST_CFA_OFFSET 8
@@ -247,6 +268,7 @@ sysret_careful:
 
 	/* Handle a signal */
 sysret_signal:
+	TRACE_IRQS_ON
 	sti
 	testl	$(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
 	jz	1f
@@ -261,6 +283,7 @@ sysret_signal:
 	/* Use IRET because user could have changed frame. This
 	   works because ptregscall_common has called FIXUP_TOP_OF_STACK.
 	*/
 	cli
+	TRACE_IRQS_OFF
 	jmp int_with_check
 
 badsys:
@@ -309,6 +332,7 @@ ENTRY(int_ret_from_sys_call)
 	CFI_REL_OFFSET	r10,R10-ARGOFFSET
 	CFI_REL_OFFSET	r11,R11-ARGOFFSET
 	cli
+	TRACE_IRQS_OFF
 	testl	$3,CS-ARGOFFSET(%rsp)
 	je	retint_restore_args
 	movl	$_TIF_ALLWORK_MASK,%edi
@@ -327,6 +351,7 @@ int_with_check:
 int_careful:
 	bt	$TIF_NEED_RESCHED,%edx
 	jnc	int_very_careful
+	TRACE_IRQS_ON
 	sti
 	pushq	%rdi
 	CFI_ADJUST_CFA_OFFSET 8
@@ -334,10 +359,12 @@ int_careful:
 	popq	%rdi
 	CFI_ADJUST_CFA_OFFSET -8
 	cli
+	TRACE_IRQS_OFF
 	jmp	int_with_check
 
 	/* handle signals and tracing -- both require a full stack frame */
 int_very_careful:
+	TRACE_IRQS_ON
 	sti
 	SAVE_REST
 	/* Check for syscall exit trace */
@@ -351,6 +378,7 @@ int_very_careful:
 	CFI_ADJUST_CFA_OFFSET -8
 	andl	$~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
 	cli
+	TRACE_IRQS_OFF
 	jmp	int_restore_rest
 
 int_signal:
@@ -363,6 +391,7 @@ int_signal:
 int_restore_rest:
 	RESTORE_REST
 	cli
+	TRACE_IRQS_OFF
 	jmp	int_with_check
 	CFI_ENDPROC
 END(int_ret_from_sys_call)
@@ -484,6 +513,11 @@ END(stub_rt_sigreturn)
 	swapgs
 1:	incl	%gs:pda_irqcount	# RED-PEN should check preempt count
 	cmoveq	%gs:pda_irqstackptr,%rsp
+	push	%rbp			# backlink for old unwinder
+	/*
+	 * We entered an interrupt context - irqs are off:
+	 */
+	TRACE_IRQS_OFF
 	call	\func
 	.endm
@@ -493,6 +527,7 @@ ENTRY(common_interrupt)
 	/* 0(%rsp): oldrsp-ARGOFFSET */
 ret_from_intr:
 	cli
+	TRACE_IRQS_OFF
 	decl	%gs:pda_irqcount
 	leaveq
 	CFI_DEF_CFA_REGISTER	rsp
@@ -515,9 +550,21 @@ retint_check:
 	CFI_REMEMBER_STATE
 	jnz	retint_careful
 retint_swapgs:
+	/*
+	 * The iretq could re-enable interrupts:
+	 */
+	cli
+	TRACE_IRQS_IRETQ
 	swapgs
+	jmp	restore_args
+
 retint_restore_args:
 	cli
+	/*
+	 * The iretq could re-enable interrupts:
+	 */
+	TRACE_IRQS_IRETQ
+restore_args:
 	RESTORE_ARGS 0,8,0
 iret_label:
 	iretq
@@ -530,6 +577,7 @@ iret_label:
 	/* running with kernel gs */
 bad_iret:
 	movq	$11,%rdi	/* SIGSEGV */
+	TRACE_IRQS_ON
 	sti
 	jmp	do_exit
 	.previous
@@ -539,6 +587,7 @@ retint_careful:
 	CFI_RESTORE_STATE
 	bt	$TIF_NEED_RESCHED,%edx
 	jnc	retint_signal
+	TRACE_IRQS_ON
 	sti
 	pushq	%rdi
 	CFI_ADJUST_CFA_OFFSET	8
@@ -547,11 +596,13 @@ retint_careful:
 	CFI_ADJUST_CFA_OFFSET	-8
 	GET_THREAD_INFO(%rcx)
 	cli
+	TRACE_IRQS_OFF
 	jmp	retint_check
 
 retint_signal:
 	testl	$(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
 	jz	retint_swapgs
+	TRACE_IRQS_ON
 	sti
 	SAVE_REST
 	movq	$-1,ORIG_RAX(%rsp)
@@ -560,6 +611,7 @@ retint_signal:
 	call	do_notify_resume
 	RESTORE_REST
 	cli
+	TRACE_IRQS_OFF
 	movl	$_TIF_NEED_RESCHED,%edi
 	GET_THREAD_INFO(%rcx)
 	jmp	retint_check
@@ -666,7 +718,7 @@ END(spurious_interrupt)
 	/* error code is on the stack already */
 	/* handle NMI like exceptions that can happen everywhere */
-	.macro paranoidentry sym, ist=0
+	.macro paranoidentry sym, ist=0, irqtrace=1
 	SAVE_ALL
 	cld
 	movl	$1,%ebx
@@ -691,8 +743,73 @@ END(spurious_interrupt)
 	addq	$EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
 	.endif
 	cli
+	.if \irqtrace
+	TRACE_IRQS_OFF
+	.endif
 	.endm
-
+
+	/*
+	 * "Paranoid" exit path from exception stack.
+	 * Paranoid because this is used by NMIs and cannot take
+	 * any kernel state for granted.
+	 * We don't do kernel preemption checks here, because only
+	 * NMI should be common and it does not enable IRQs and
+	 * cannot get reschedule ticks.
+	 *
+	 * "trace" is 0 for the NMI handler only, because irq-tracing
+	 * is fundamentally NMI-unsafe. (we cannot change the soft and
+	 * hard flags at once, atomically)
+	 */
+	.macro paranoidexit trace=1
+	/* ebx: no swapgs flag */
+paranoid_exit\trace:
+	testl	%ebx,%ebx			/* swapgs needed? */
+	jnz	paranoid_restore\trace
+	testl	$3,CS(%rsp)
+	jnz	paranoid_userspace\trace
+paranoid_swapgs\trace:
+	TRACE_IRQS_IRETQ 0
+	swapgs
+paranoid_restore\trace:
+	RESTORE_ALL 8
+	iretq
+paranoid_userspace\trace:
+	GET_THREAD_INFO(%rcx)
+	movl	threadinfo_flags(%rcx),%ebx
+	andl	$_TIF_WORK_MASK,%ebx
+	jz	paranoid_swapgs\trace
+	movq	%rsp,%rdi			/* &pt_regs */
+	call	sync_regs
+	movq	%rax,%rsp			/* switch stack for scheduling */
+	testl	$_TIF_NEED_RESCHED,%ebx
+	jnz	paranoid_schedule\trace
+	movl	%ebx,%edx			/* arg3: thread flags */
+	.if \trace
+	TRACE_IRQS_ON
+	.endif
+	sti
+	xorl	%esi,%esi			/* arg2: oldset */
+	movq	%rsp,%rdi			/* arg1: &pt_regs */
+	call	do_notify_resume
+	cli
+	.if \trace
+	TRACE_IRQS_OFF
+	.endif
+	jmp	paranoid_userspace\trace
+paranoid_schedule\trace:
+	.if \trace
+	TRACE_IRQS_ON
+	.endif
+	sti
+	call	schedule
+	cli
+	.if \trace
+	TRACE_IRQS_OFF
+	.endif
+	jmp	paranoid_userspace\trace
+	CFI_ENDPROC
+	.endm
+
 /*
  * Exception entry point. This expects an error code/orig_rax on the stack
  * and the exception handler in %rax.
@@ -748,6 +865,7 @@ error_exit:
 	movl	%ebx,%eax
 	RESTORE_REST
 	cli
+	TRACE_IRQS_OFF
 	GET_THREAD_INFO(%rcx)
 	testl	%eax,%eax
 	jne	retint_kernel
@@ -755,6 +873,10 @@ error_exit:
 	movl	$_TIF_WORK_MASK,%edi
 	andl	%edi,%edx
 	jnz	retint_careful
+	/*
+	 * The iret might restore flags:
+	 */
+	TRACE_IRQS_IRETQ
 	swapgs
 	RESTORE_ARGS 0,8,0
 	jmp	iret_label
@@ -851,6 +973,8 @@ ENTRY(kernel_thread)
 ENDPROC(kernel_thread)
 
 child_rip:
+	pushq	$0		# fake return address
+	CFI_STARTPROC
 	/*
 	 * Here we are in the child and the registers are set as they were
 	 * at kernel_thread() invocation in the parent.
@@ -861,6 +985,7 @@ child_rip:
 	# exit
 	xorl	%edi, %edi
 	call	do_exit
+	CFI_ENDPROC
 ENDPROC(child_rip)
 
 /*
@@ -916,8 +1041,7 @@ KPROBE_ENTRY(debug)
 	pushq	$0
 	CFI_ADJUST_CFA_OFFSET 8
 	paranoidentry do_debug, DEBUG_STACK
-	jmp	paranoid_exit
-	CFI_ENDPROC
+	paranoidexit
 END(debug)
 	.previous .text
@@ -926,49 +1050,13 @@ KPROBE_ENTRY(nmi)
 	INTR_FRAME
 	pushq	$-1
 	CFI_ADJUST_CFA_OFFSET 8
-	paranoidentry do_nmi
-	/*
-	 * "Paranoid" exit path from exception stack.
-	 * Paranoid because this is used by NMIs and cannot take
-	 * any kernel state for granted.
-	 * We don't do kernel preemption checks here, because only
-	 * NMI should be common and it does not enable IRQs and
-	 * cannot get reschedule ticks.
-	 */
-	/* ebx: no swapgs flag */
-paranoid_exit:
-	testl	%ebx,%ebx			/* swapgs needed? */
-	jnz	paranoid_restore
-	testl	$3,CS(%rsp)
-	jnz	paranoid_userspace
-paranoid_swapgs:
-	swapgs
-paranoid_restore:
-	RESTORE_ALL 8
-	iretq
-paranoid_userspace:
-	GET_THREAD_INFO(%rcx)
-	movl	threadinfo_flags(%rcx),%ebx
-	andl	$_TIF_WORK_MASK,%ebx
-	jz	paranoid_swapgs
-	movq	%rsp,%rdi			/* &pt_regs */
-	call	sync_regs
-	movq	%rax,%rsp			/* switch stack for scheduling */
-	testl	$_TIF_NEED_RESCHED,%ebx
-	jnz	paranoid_schedule
-	movl	%ebx,%edx			/* arg3: thread flags */
-	sti
-	xorl	%esi,%esi			/* arg2: oldset */
-	movq	%rsp,%rdi			/* arg1: &pt_regs */
-	call	do_notify_resume
-	cli
-	jmp	paranoid_userspace
-paranoid_schedule:
-	sti
-	call	schedule
-	cli
-	jmp	paranoid_userspace
-	CFI_ENDPROC
+	paranoidentry do_nmi, 0, 0
+#ifdef CONFIG_TRACE_IRQFLAGS
+	paranoidexit 0
+#else
+	jmp	paranoid_exit1
+	CFI_ENDPROC
+#endif
 END(nmi)
 	.previous .text
@@ -977,7 +1065,7 @@ KPROBE_ENTRY(int3)
 	pushq	$0
 	CFI_ADJUST_CFA_OFFSET 8
 	paranoidentry do_int3, DEBUG_STACK
-	jmp	paranoid_exit
+	jmp	paranoid_exit1
 	CFI_ENDPROC
 END(int3)
 	.previous .text
@@ -1006,7 +1094,7 @@ END(reserved)
 ENTRY(double_fault)
 	XCPT_FRAME
 	paranoidentry do_double_fault
-	jmp	paranoid_exit
+	jmp	paranoid_exit1
 	CFI_ENDPROC
 END(double_fault)
@@ -1022,7 +1110,7 @@ END(segment_not_present)
 ENTRY(stack_segment)
 	XCPT_FRAME
 	paranoidentry do_stack_segment
-	jmp	paranoid_exit
+	jmp	paranoid_exit1
 	CFI_ENDPROC
 END(stack_segment)
@@ -1050,23 +1138,26 @@ ENTRY(machine_check)
 	pushq	$0
 	CFI_ADJUST_CFA_OFFSET 8
 	paranoidentry do_machine_check
-	jmp	paranoid_exit
+	jmp	paranoid_exit1
 	CFI_ENDPROC
 END(machine_check)
 #endif
 
+/* Call softirq on interrupt stack. Interrupts are off. */
 ENTRY(call_softirq)
 	CFI_STARTPROC
-	movq	%gs:pda_irqstackptr,%rax
-	movq	%rsp,%rdx
-	CFI_DEF_CFA_REGISTER	rdx
+	push	%rbp
+	CFI_ADJUST_CFA_OFFSET	8
+	CFI_REL_OFFSET rbp,0
+	mov	%rsp,%rbp
+	CFI_DEF_CFA_REGISTER rbp
 	incl	%gs:pda_irqcount
-	cmove	%rax,%rsp
-	pushq	%rdx
-	/*todo CFI_DEF_CFA_EXPRESSION ...*/
+	cmove	%gs:pda_irqstackptr,%rsp
+	push	%rbp			# backlink for old unwinder
 	call	__do_softirq
-	popq	%rsp
+	leaveq
 	CFI_DEF_CFA_REGISTER	rsp
+	CFI_ADJUST_CFA_OFFSET	-8
 	decl	%gs:pda_irqcount
 	ret
 	CFI_ENDPROC
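
Most of the entry.S changes hand-annotate interrupt-state transitions for the new irq-flags tracer; TRACE_IRQS_IRETQ is the only conditional one. In C terms it does roughly the following -- a sketch, assuming trace_hardirqs_on() as the CONFIG_TRACE_IRQFLAGS entry point that TRACE_IRQS_ON ultimately reaches:

	#include <linux/irqflags.h>

	#define X86_EFLAGS_IF	(1UL << 9)	/* the IF bit tested by "bt $9" */

	/* Before an iretq restores the saved EFLAGS image, tell the tracer
	 * that interrupts are about to be re-enabled - but only when the
	 * saved image actually has IF set: */
	static inline void trace_irqs_iretq(unsigned long saved_eflags)
	{
		if (saved_eflags & X86_EFLAGS_IF)
			trace_hardirqs_on();
	}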
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 6df05e6034f..c9739ca81d0 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -191,6 +191,7 @@ startup_64:
 	 * jump
 	 */
 	movq	initial_code(%rip),%rax
+	pushq	$0		# fake return address
 	jmp	*%rax
 
 	/* SMP bootup changes these two */
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
index e6a71c9556d..36647ce6aec 100644
--- a/arch/x86_64/kernel/head64.c
+++ b/arch/x86_64/kernel/head64.c
@@ -85,6 +85,11 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	clear_bss();
 
 	/*
+	 * This must be called really, really early:
+	 */
+	lockdep_init();
+
+	/*
 	 * switch to init_level4_pgt from boot_level4_pgt
 	 */
 	memcpy(init_level4_pgt, boot_level4_pgt, PTRS_PER_PGD*sizeof(pgd_t));
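
The `pushq $0` additions (here and at child_rip in entry.S) seed every new kernel stack with a NULL return address. A frame-pointer walker such as the new stacktrace.c stops on that NULL, so backtraces get a well-defined terminator. Sketched in C, illustrative only, assuming the standard x86-64 frame layout:

	/* Walk saved-%rbp frames until the fake 0 return address: */
	static int walk_frames(const unsigned long *rbp)
	{
		int depth = 0;

		while (rbp) {
			unsigned long ret = rbp[1];	/* saved return %rip */

			if (!ret)			/* the 0 pushed at boot */
				break;
			depth++;
			rbp = (const unsigned long *)rbp[0];	/* caller's %rbp */
		}
		return depth;
	}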
diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c
index ce31d904d60..3dc5854ba21 100644
--- a/arch/x86_64/kernel/init_task.c
+++ b/arch/x86_64/kernel/init_task.c
@@ -46,4 +46,9 @@ EXPORT_SYMBOL(init_task);
  */
 DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
 
+/* Copies of the original ist values from the tss are only accessed during
+ * debugging, no special alignment required.
+ */
+DEFINE_PER_CPU(struct orig_ist, orig_ist);
+
 #define ALIGN_TO_4K __attribute__((section(".data.init_task")))
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index a1f1df5f7bf..5221a53e90c 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -177,8 +177,10 @@ asmlinkage void do_softirq(void)
 	local_irq_save(flags);
 	pending = local_softirq_pending();
 	/* Switch to interrupt stack */
-	if (pending)
+	if (pending) {
 		call_softirq();
+		WARN_ON_ONCE(softirq_count());
+	}
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(do_softirq);
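
The new WARN_ON_ONCE() asserts that __do_softirq() really dropped its softirq nesting before do_softirq() returns. softirq_count() is simply the softirq field of preempt_count(); a sketch of the invariant being checked (mask name per include/linux/hardirq.h of this era):

	#include <linux/hardirq.h>

	/* True if we are still (nested) in softirq context: */
	static inline int softirq_still_active(void)
	{
		return (preempt_count() & SOFTIRQ_MASK) != 0;	/* == softirq_count() */
	}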
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c
index 83fb24a0282..106076b370f 100644
--- a/arch/x86_64/kernel/machine_kexec.c
+++ b/arch/x86_64/kernel/machine_kexec.c
@@ -207,14 +207,11 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 	__flush_tlb();
 
-	/* The segment registers are funny things, they are
-	 * automatically loaded from a table, in memory wherever you
-	 * set them to a specific selector, but this table is never
-	 * accessed again unless you set the segment to a different selector.
-	 *
-	 * The more common model are caches where the behide
-	 * the scenes work is done, but is also dropped at arbitrary
-	 * times.
+	/* The segment registers are funny things, they have both a
+	 * visible and an invisible part. Whenever the visible part is
+	 * set to a specific selector, the invisible part is loaded
+	 * from a table in memory. At no other time is the
+	 * descriptor table in memory accessed.
 	 *
 	 * I take advantage of this here by force loading the
 	 * segments, before I zap the gdt with an invalid value.
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 88845674c66..4e017fb30fb 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -615,7 +615,7 @@ static __cpuinit int mce_create_device(unsigned int cpu)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static __cpuinit void mce_remove_device(unsigned int cpu)
+static void mce_remove_device(unsigned int cpu)
 {
 	int i;
 
@@ -626,10 +626,9 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
 	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
 	sysdev_unregister(&per_cpu(device_mce,cpu));
 }
-#endif
 
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static __cpuinit int
+static int
 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
@@ -638,18 +637,17 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	case CPU_ONLINE:
 		mce_create_device(cpu);
 		break;
-#ifdef CONFIG_HOTPLUG_CPU
 	case CPU_DEAD:
 		mce_remove_device(cpu);
 		break;
-#endif
 	}
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata mce_cpu_notifier = {
+static struct notifier_block mce_cpu_notifier = {
 	.notifier_call = mce_cpu_callback,
 };
+#endif
 
 static __init int mce_init_device(void)
 {
@@ -664,7 +662,7 @@ static __init int mce_init_device(void)
 		mce_create_device(i);
 	}
 
-	register_cpu_notifier(&mce_cpu_notifier);
+	register_hotcpu_notifier(&mce_cpu_notifier);
 	misc_register(&mce_log_device);
 	return err;
 }
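
The mce.c rework (and the mce_amd.c one below) is the standard shape for hotplug-aware notifiers: keep the callback and notifier_block entirely under CONFIG_HOTPLUG_CPU and register through register_hotcpu_notifier(), which is a no-op when hotplug is disabled. Distilled into a sketch ("foo" is a placeholder subsystem, mirroring the mce.c hunk above):

	#include <linux/cpu.h>
	#include <linux/init.h>
	#include <linux/notifier.h>

	#ifdef CONFIG_HOTPLUG_CPU
	static int foo_cpu_callback(struct notifier_block *nfb,
				    unsigned long action, void *hcpu)
	{
		unsigned int cpu = (unsigned long)hcpu;

		switch (action) {
		case CPU_ONLINE:
			/* bring up per-CPU state for 'cpu' */
			break;
		case CPU_DEAD:
			/* tear it down again */
			break;
		}
		return NOTIFY_OK;
	}

	static struct notifier_block foo_cpu_notifier = {
		.notifier_call = foo_cpu_callback,
	};
	#endif

	static int __init foo_init(void)
	{
		register_hotcpu_notifier(&foo_cpu_notifier); /* no-op if !HOTPLUG_CPU */
		return 0;
	}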
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
index 335200aa273..883fe747f64 100644
--- a/arch/x86_64/kernel/mce_amd.c
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -558,7 +558,7 @@ out:
  * of shared sysfs dir/files, and rest of the cores will be symlinked to it.
  */
 
-static __cpuinit void deallocate_threshold_block(unsigned int cpu,
+static void deallocate_threshold_block(unsigned int cpu,
 						 unsigned int bank)
 {
 	struct threshold_block *pos = NULL;
@@ -578,7 +578,7 @@ static __cpuinit void deallocate_threshold_block(unsigned int cpu,
 	per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
 }
 
-static __cpuinit void threshold_remove_bank(unsigned int cpu, int bank)
+static void threshold_remove_bank(unsigned int cpu, int bank)
 {
 	int i = 0;
 	struct threshold_bank *b;
@@ -597,7 +597,7 @@ static __cpuinit void threshold_remove_bank(unsigned int cpu, int bank)
 	/* sibling symlink */
 	if (shared_bank[bank] && b->blocks->cpu != cpu) {
 		sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name);
-		per_cpu(threshold_banks, i)[bank] = NULL;
+		per_cpu(threshold_banks, cpu)[bank] = NULL;
 		return;
 	}
 
@@ -618,7 +618,7 @@ free_out:
 	per_cpu(threshold_banks, cpu)[bank] = NULL;
 }
 
-static __cpuinit void threshold_remove_device(unsigned int cpu)
+static void threshold_remove_device(unsigned int cpu)
 {
 	unsigned int bank;
 
@@ -629,14 +629,8 @@ static __cpuinit void threshold_remove_device(unsigned int cpu)
 	}
 }
 
-#else /* !CONFIG_HOTPLUG_CPU */
-static void threshold_remove_device(unsigned int cpu)
-{
-}
-#endif
-
 /* get notified when a cpu comes on/off */
-static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb,
+static int threshold_cpu_callback(struct notifier_block *nfb,
 					    unsigned long action, void *hcpu)
 {
 	/* cpu was unsigned int to begin with */
@@ -659,9 +653,10 @@ static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block threshold_cpu_notifier __cpuinitdata = {
+static struct notifier_block threshold_cpu_notifier = {
 	.notifier_call = threshold_cpu_callback,
 };
+#endif /* CONFIG_HOTPLUG_CPU */
 
 static __init int threshold_init_device(void)
 {
@@ -673,7 +668,7 @@ static __init int threshold_init_device(void)
 		if (err)
 			return err;
 	}
-	register_cpu_notifier(&threshold_cpu_notifier);
+	register_hotcpu_notifier(&threshold_cpu_notifier);
 	return 0;
 }
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 476c1472fc0..5baa0c726e9 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -127,7 +127,7 @@ void __cpuinit nmi_watchdog_default(void)
 static __init void nmi_cpu_busy(void *data)
 {
 	volatile int *endflag = data;
-	local_irq_enable();
+	local_irq_enable_in_hardirq();
 	/* Intentionally don't use cpu_relax here. This is
 	   to make sure that the performance counter really ticks,
 	   even if there is a simulator or similar that catches the
diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index d91cb843f54..146924ba5df 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -1,9 +1,11 @@
 /*
  * Derived from arch/powerpc/kernel/iommu.c
  *
- * Copyright (C) 2006 Jon Mason <jdmason@us.ibm.com>, IBM Corporation
- * Copyright (C) 2006 Muli Ben-Yehuda <muli@il.ibm.com>, IBM Corporation
+ * Copyright (C) IBM Corporation, 2006
  *
+ * Author: Jon Mason <jdmason@us.ibm.com>
+ * Author: Muli Ben-Yehuda <muli@il.ibm.com>
+
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -83,7 +85,8 @@
 #define CSR_AGENT_MASK		0xffe0ffff
 
 #define MAX_NUM_OF_PHBS		8 /* how many PHBs in total? */
-#define MAX_PHB_BUS_NUM		(MAX_NUM_OF_PHBS * 2) /* max dev->bus->number */
+#define MAX_NUM_CHASSIS		8 /* max number of chassis */
+#define MAX_PHB_BUS_NUM		(MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) /* max dev->bus->number */
 #define PHBS_PER_CALGARY	4
 
 /* register offsets in Calgary's internal register space */
@@ -108,7 +111,8 @@ static const unsigned long phb_offsets[] = {
 	0xB000 /* PHB3 */
 };
 
-void* tce_table_kva[MAX_NUM_OF_PHBS * MAX_NUMNODES];
+static char bus_to_phb[MAX_PHB_BUS_NUM];
+void* tce_table_kva[MAX_PHB_BUS_NUM];
 unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
 static int translate_empty_slots __read_mostly = 0;
 static int calgary_detected __read_mostly = 0;
@@ -117,7 +121,7 @@ static int calgary_detected __read_mostly = 0;
  * the bitmap of PHBs the user requested that we disable
  * translation on.
  */
-static DECLARE_BITMAP(translation_disabled, MAX_NUMNODES * MAX_PHB_BUS_NUM);
+static DECLARE_BITMAP(translation_disabled, MAX_PHB_BUS_NUM);
 
 static void tce_cache_blast(struct iommu_table *tbl);
 
@@ -450,7 +454,7 @@ static struct dma_mapping_ops calgary_dma_ops = {
 
 static inline int busno_to_phbid(unsigned char num)
 {
-	return bus_to_phb(num) % PHBS_PER_CALGARY;
+	return bus_to_phb[num];
 }
 
 static inline unsigned long split_queue_offset(unsigned char num)
@@ -810,7 +814,7 @@ static int __init calgary_init(void)
 	int i, ret = -ENODEV;
 	struct pci_dev *dev = NULL;
 
-	for (i = 0; i <= num_online_nodes() * MAX_NUM_OF_PHBS; i++) {
+	for (i = 0; i < MAX_PHB_BUS_NUM; i++) {
 		dev = pci_get_device(PCI_VENDOR_ID_IBM,
 				     PCI_DEVICE_ID_IBM_CALGARY,
 				     dev);
@@ -820,7 +824,7 @@ static int __init calgary_init(void)
 			calgary_init_one_nontraslated(dev);
 			continue;
 		}
-		if (!tce_table_kva[i] && !translate_empty_slots) {
+		if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) {
 			pci_dev_put(dev);
 			continue;
 		}
@@ -840,7 +844,7 @@ error:
 			pci_dev_put(dev);
 			continue;
 		}
-		if (!tce_table_kva[i] && !translate_empty_slots)
+		if (!tce_table_kva[dev->bus->number] && !translate_empty_slots)
 			continue;
 		calgary_disable_translation(dev);
 		calgary_free_tar(dev);
@@ -874,9 +878,10 @@ static inline int __init determine_tce_table_size(u64 ram)
 void __init detect_calgary(void)
 {
 	u32 val;
-	int bus, table_idx;
+	int bus;
 	void *tbl;
-	int detected = 0;
+	int calgary_found = 0;
+	int phb = -1;
 
 	/*
 	 * if the user specified iommu=off or iommu=soft or we found
@@ -887,38 +892,46 @@ void __init detect_calgary(void)
 
 	specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE);
 
-	for (bus = 0, table_idx = 0;
-	     bus <= num_online_nodes() * MAX_PHB_BUS_NUM;
-	     bus++) {
-		BUG_ON(bus > MAX_NUMNODES * MAX_PHB_BUS_NUM);
+	for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
+		int dev;
+
+		tce_table_kva[bus] = NULL;
+		bus_to_phb[bus] = -1;
+
 		if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY)
 			continue;
+
+		/*
+		 * There are 4 PHBs per Calgary chip. Set phb to which phb (0-3)
+		 * it is connected to, relative to the Calgary chip.
+		 */
+		phb = (phb + 1) % PHBS_PER_CALGARY;
+
 		if (test_bit(bus, translation_disabled)) {
 			printk(KERN_INFO "Calgary: translation is disabled for "
 			       "PHB 0x%x\n", bus);
 			/* skip this phb, don't allocate a tbl for it */
-			tce_table_kva[table_idx] = NULL;
-			table_idx++;
 			continue;
 		}
 		/*
-		 * scan the first slot of the PCI bus to see if there
-		 * are any devices present
+		 * Scan the slots of the PCI bus to see if there is a device present.
+		 * The parent bus will be the zeroth device, so start at 1.
 		 */
-		val = read_pci_config(bus, 1, 0, 0);
-		if (val != 0xffffffff || translate_empty_slots) {
-			tbl = alloc_tce_table();
-			if (!tbl)
-				goto cleanup;
-			detected = 1;
-		} else
-			tbl = NULL;
-
-		tce_table_kva[table_idx] = tbl;
-		table_idx++;
+		for (dev = 1; dev < 8; dev++) {
+			val = read_pci_config(bus, dev, 0, 0);
+			if (val != 0xffffffff || translate_empty_slots) {
+				tbl = alloc_tce_table();
+				if (!tbl)
+					goto cleanup;
+				tce_table_kva[bus] = tbl;
+				bus_to_phb[bus] = phb;
+				calgary_found = 1;
+				break;
+			}
+		}
 	}
 
-	if (detected) {
+	if (calgary_found) {
 		iommu_detected = 1;
 		calgary_detected = 1;
 		printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected. "
@@ -927,9 +940,9 @@ void __init detect_calgary(void)
 	return;
 
 cleanup:
-	for (--table_idx; table_idx >= 0; --table_idx)
-		if (tce_table_kva[table_idx])
-			free_tce_table(tce_table_kva[table_idx]);
+	for (--bus; bus >= 0; --bus)
+		if (tce_table_kva[bus])
+			free_tce_table(tce_table_kva[bus]);
 }
 
 int __init calgary_iommu_init(void)
@@ -1000,7 +1013,7 @@ static int __init calgary_parse_options(char *p)
 			if (p == endp)
 				break;
 
-			if (bridge <= (num_online_nodes() * MAX_PHB_BUS_NUM)) {
+			if (bridge < MAX_PHB_BUS_NUM) {
 				printk(KERN_INFO "Calgary: disabling "
 				       "translation for PHB 0x%x\n", bridge);
 				set_bit(bridge, translation_disabled);
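
For scale: with MAX_NUM_OF_PHBS = 8 and MAX_NUM_CHASSIS = 8, the new bound works out to 8 * 8 * 2 = 128 possible bus numbers, which is what lets bus_to_phb[] and tce_table_kva[] be indexed directly by dev->bus->number instead of by the old num_online_nodes()-based arithmetic.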
diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c
index c4c3cc36ac5..aad7609d8e9 100644
--- a/arch/x86_64/kernel/pci-nommu.c
+++ b/arch/x86_64/kernel/pci-nommu.c
@@ -92,5 +92,7 @@ void __init no_iommu_init(void)
 {
 	if (dma_ops)
 		return;
+
+	force_iommu = 0; /* no HW IOMMU */
 	dma_ops = &nommu_dma_ops;
 }
diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c
index ebdb77fe205..6a55f87ba97 100644
--- a/arch/x86_64/kernel/pci-swiotlb.c
+++ b/arch/x86_64/kernel/pci-swiotlb.c
@@ -31,9 +31,10 @@ struct dma_mapping_ops swiotlb_dma_ops = {
 void pci_swiotlb_init(void)
 {
 	/* don't initialize swiotlb if iommu=off (no_iommu=1) */
-	if (!iommu_detected && !no_iommu &&
-	    (end_pfn > MAX_DMA32_PFN || force_iommu))
+	if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN)
 		swiotlb = 1;
+	if (swiotlb_force)
+		swiotlb = 1;
 	if (swiotlb) {
 		printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
 		swiotlb_init();
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index ca56e19b8b6..bb6745d13b8 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -296,7 +296,7 @@ void __show_regs(struct pt_regs * regs)
 		system_utsname.version);
 	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
 	printk_address(regs->rip);
-	printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
+	printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
 		regs->eflags);
 	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
 	       regs->rax, regs->rbx, regs->rcx);
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 0925518b58d..34afad70482 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -21,7 +21,7 @@
 #include <linux/slab.h>
 #include <linux/user.h>
 #include <linux/a.out.h>
-#include <linux/tty.h>
+#include <linux/screen_info.h>
 #include <linux/ioport.h>
 #include <linux/delay.h>
 #include <linux/init.h>
@@ -521,8 +521,6 @@ static void discover_ebda(void)
 
 void __init setup_arch(char **cmdline_p)
 {
-	unsigned long kernel_end;
-
 	ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
 	screen_info = SCREEN_INFO;
 	edid_info = EDID_INFO;
@@ -596,8 +594,8 @@ void __init setup_arch(char **cmdline_p)
 				(table_end - table_start) << PAGE_SHIFT);
 
 	/* reserve kernel */
-	kernel_end = round_up(__pa_symbol(&_end),PAGE_SIZE);
-	reserve_bootmem_generic(HIGH_MEMORY, kernel_end - HIGH_MEMORY);
+	reserve_bootmem_generic(__pa_symbol(&_text),
+				__pa_symbol(&_end) - __pa_symbol(&_text));
 
 	/*
 	 * reserve physical page 0 - it's a special BIOS page on many boxes,
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 6fe58a634b5..417de564456 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -189,6 +189,7 @@ void __cpuinit cpu_init (void)
 {
 	int cpu = stack_smp_processor_id();
 	struct tss_struct *t = &per_cpu(init_tss, cpu);
+	struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
 	unsigned long v;
 	char *estacks = NULL;
 	struct task_struct *me;
@@ -256,7 +257,7 @@ void __cpuinit cpu_init (void)
 			estacks += EXCEPTION_STKSZ;
 			break;
 		}
-		t->ist[v] = (unsigned long)estacks;
+		orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
 	}
 
 	t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 5a1c0a3bf87..06af6ca6012 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -203,7 +203,7 @@ int __cpuinit init_smp_flush(void)
 {
 	int i;
 	for_each_cpu_mask(i, cpu_possible_map) {
-		spin_lock_init(&per_cpu(flush_state.tlbstate_lock, i));
+		spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
 	}
 	return 0;
 }
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 9705a6a384f..975380207b4 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -771,7 +771,7 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
 	unsigned long start_rip;
 	struct create_idle c_idle = {
 		.cpu = cpu,
-		.done = COMPLETION_INITIALIZER(c_idle.done),
+		.done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
 	};
 	DECLARE_WORK(work, do_fork_idle, &c_idle);
diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c
new file mode 100644
index 00000000000..32cf55eb9af
--- /dev/null
+++ b/arch/x86_64/kernel/stacktrace.c
@@ -0,0 +1,221 @@
+/*
+ * arch/x86_64/kernel/stacktrace.c
+ *
+ * Stack trace management functions
+ *
+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ */
+#include <linux/sched.h>
+#include <linux/stacktrace.h>
+
+#include <asm/smp.h>
+
+static inline int
+in_range(unsigned long start, unsigned long addr, unsigned long end)
+{
+	return addr >= start && addr <= end;
+}
+
+static unsigned long
+get_stack_end(struct task_struct *task, unsigned long stack)
+{
+	unsigned long stack_start, stack_end, flags;
+	int i, cpu;
+
+	/*
+	 * The most common case is that we are in the task stack:
+	 */
+	stack_start = (unsigned long)task->thread_info;
+	stack_end = stack_start + THREAD_SIZE;
+
+	if (in_range(stack_start, stack, stack_end))
+		return stack_end;
+
+	/*
+	 * We are in an interrupt if irqstackptr is set:
+	 */
+	raw_local_irq_save(flags);
+	cpu = safe_smp_processor_id();
+	stack_end = (unsigned long)cpu_pda(cpu)->irqstackptr;
+
+	if (stack_end) {
+		stack_start = stack_end & ~(IRQSTACKSIZE-1);
+		if (in_range(stack_start, stack, stack_end))
+			goto out_restore;
+		/*
+		 * We get here if we are in an IRQ context but we
+		 * are also in an exception stack.
+		 */
+	}
+
+	/*
+	 * Iterate over all exception stacks, and figure out whether
+	 * 'stack' is in one of them:
+	 */
+	for (i = 0; i < N_EXCEPTION_STACKS; i++) {
+		/*
+		 * set 'end' to the end of the exception stack.
+		 */
+		stack_end = per_cpu(init_tss, cpu).ist[i];
+		stack_start = stack_end - EXCEPTION_STKSZ;
+
+		/*
+		 * Is 'stack' above this exception frame's end?
+		 * If yes then skip to the next frame.
+		 */
+		if (stack >= stack_end)
+			continue;
+		/*
+		 * Is 'stack' above this exception frame's start address?
+		 * If yes then we found the right frame.
+		 */
+		if (stack >= stack_start)
+			goto out_restore;
+
+		/*
+		 * If this is a debug stack, and if it has a larger size than
+		 * the usual exception stacks, then 'stack' might still
+		 * be within the lower portion of the debug stack:
+		 */
+#if DEBUG_STKSZ > EXCEPTION_STKSZ
+		if (i == DEBUG_STACK - 1 && stack >= stack_end - DEBUG_STKSZ) {
+			/*
+			 * Black magic. A large debug stack is composed of
+			 * multiple exception stack entries, which we
+			 * iterate through now. Don't look:
+			 */
+			do {
+				stack_end -= EXCEPTION_STKSZ;
+				stack_start -= EXCEPTION_STKSZ;
+			} while (stack < stack_start);
+
+			goto out_restore;
+		}
+#endif
+	}
+	/*
+	 * Ok, 'stack' is not pointing to any of the system stacks.
+	 */
+	stack_end = 0;
+
+out_restore:
+	raw_local_irq_restore(flags);
+
+	return stack_end;
+}
+
+
+/*
+ * Save stack-backtrace addresses into a stack_trace buffer:
+ */
+static inline unsigned long
+save_context_stack(struct stack_trace *trace, unsigned int skip,
+		   unsigned long stack, unsigned long stack_end)
+{
+	unsigned long addr;
+
+#ifdef CONFIG_FRAME_POINTER
+	unsigned long prev_stack = 0;
+
+	while (in_range(prev_stack, stack, stack_end)) {
+		pr_debug("stack: %p\n", (void *)stack);
+		addr = (unsigned long)(((unsigned long *)stack)[1]);
+		pr_debug("addr: %p\n", (void *)addr);
+		if (!skip)
+			trace->entries[trace->nr_entries++] = addr-1;
+		else
+			skip--;
+		if (trace->nr_entries >= trace->max_entries)
+			break;
+		if (!addr)
+			return 0;
+		/*
+		 * Stack frames must go forwards (otherwise a loop could
+		 * happen if the stackframe is corrupted), so we move
+		 * prev_stack forwards:
+		 */
+		prev_stack = stack;
+		stack = (unsigned long)(((unsigned long *)stack)[0]);
+	}
+	pr_debug("invalid: %p\n", (void *)stack);
+#else
+	while (stack < stack_end) {
+		addr = ((unsigned long *)stack)[0];
+		stack += sizeof(long);
+		if (__kernel_text_address(addr)) {
+			if (!skip)
+				trace->entries[trace->nr_entries++] = addr-1;
+			else
+				skip--;
+			if (trace->nr_entries >= trace->max_entries)
+				break;
+		}
+	}
+#endif
+	return stack;
+}
+
+#define MAX_STACKS 10
+
+/*
+ * Save stack-backtrace addresses into a stack_trace buffer.
+ * If all_contexts is set, all contexts (hardirq, softirq and process)
+ * are saved. If not set then only the current context is saved.
+ */
+void save_stack_trace(struct stack_trace *trace,
+		      struct task_struct *task, int all_contexts,
+		      unsigned int skip)
+{
+	unsigned long stack = (unsigned long)&stack;
+	int i, nr_stacks = 0, stacks_done[MAX_STACKS];
+
+	WARN_ON(trace->nr_entries || !trace->max_entries);
+
+	if (!task)
+		task = current;
+
+	pr_debug("task: %p, ti: %p\n", task, task->thread_info);
+
+	if (!task || task == current) {
+		/* Grab rbp right from our regs: */
+		asm ("mov %%rbp, %0" : "=r" (stack));
+		pr_debug("rbp: %p\n", (void *)stack);
+	} else {
+		/* rbp is the last reg pushed by switch_to(): */
+		stack = task->thread.rsp;
+		pr_debug("other task rsp: %p\n", (void *)stack);
+		stack = (unsigned long)(((unsigned long *)stack)[0]);
+		pr_debug("other task rbp: %p\n", (void *)stack);
+	}
+
+	while (1) {
+		unsigned long stack_end = get_stack_end(task, stack);
+
+		pr_debug("stack: %p\n", (void *)stack);
+		pr_debug("stack end: %p\n", (void *)stack_end);
+
+		/*
+		 * Invalid stack address?
+		 */
+		if (!stack_end)
+			return;
+		/*
+		 * Were we in this stack already? (recursion)
+		 */
+		for (i = 0; i < nr_stacks; i++)
+			if (stacks_done[i] == stack_end)
+				return;
+		stacks_done[nr_stacks] = stack_end;
+
+		stack = save_context_stack(trace, skip, stack, stack_end);
+		if (!all_contexts || !stack ||
+		    trace->nr_entries >= trace->max_entries)
+			return;
+		trace->entries[trace->nr_entries++] = ULONG_MAX;
+		if (trace->nr_entries >= trace->max_entries)
+			return;
+		if (++nr_stacks >= MAX_STACKS)
+			return;
+	}
+}
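
A hypothetical caller, to make the save_stack_trace() contract above concrete: nr_entries must start at 0, max_entries bounds the buffer, and with all_contexts set the individual stacks are delimited by ULONG_MAX entries:

	#include <linux/kernel.h>
	#include <linux/sched.h>
	#include <linux/stacktrace.h>

	static unsigned long buf[32];

	static void dump_my_stack(void)
	{
		struct stack_trace trace = {
			.nr_entries	= 0,
			.max_entries	= ARRAY_SIZE(buf),
			.entries	= buf,
		};
		unsigned int i;

		/* current task, current context only, skip no entries: */
		save_stack_trace(&trace, current, 0, 0);

		for (i = 0; i < trace.nr_entries; i++)
			printk(" [<%016lx>]\n", trace.entries[i]);
	}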
diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c
index 8d4c67f61b8..5530dda3f27 100644
--- a/arch/x86_64/kernel/tce.c
+++ b/arch/x86_64/kernel/tce.c
@@ -1,8 +1,10 @@
 /*
  * Derived from arch/powerpc/platforms/pseries/iommu.c
  *
- * Copyright (C) 2006 Jon Mason <jdmason@us.ibm.com>, IBM Corporation
- * Copyright (C) 2006 Muli Ben-Yehuda <muli@il.ibm.com>, IBM Corporation
+ * Copyright (C) IBM Corporation, 2006
+ *
+ * Author: Jon Mason <jdmason@us.ibm.com>
+ * Author: Muli Ben-Yehuda <muli@il.ibm.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -94,7 +96,6 @@ static inline unsigned int table_size_to_number_of_entries(unsigned char size)
 static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
 {
 	unsigned int bitmapsz;
-	unsigned int tce_table_index;
 	unsigned long bmppages;
 	int ret;
 
@@ -103,8 +104,7 @@ static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
 	/* set the tce table size - measured in entries */
 	tbl->it_size = table_size_to_number_of_entries(specified_table_size);
 
-	tce_table_index = bus_to_phb(tbl->it_busno);
-	tbl->it_base = (unsigned long)tce_table_kva[tce_table_index];
+	tbl->it_base = (unsigned long)tce_table_kva[dev->bus->number];
 	if (!tbl->it_base) {
 		printk(KERN_ERR "Calgary: iommu_table_setparms: "
 		       "no table allocated?!\n");
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index b9ff75992c1..7a9b1822418 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -28,6 +28,7 @@
 #include <linux/acpi.h>
 #ifdef CONFIG_ACPI
 #include <acpi/achware.h>	/* for PM timer frequency */
+#include <acpi/acpi_bus.h>
 #endif
 #include <asm/8253pit.h>
 #include <asm/pgtable.h>
@@ -193,7 +194,7 @@ unsigned long profile_pc(struct pt_regs *regs)
 	   is just accounted to the spinlock function.
	   Better would be to write these functions in assembler again
	   and check exactly. */
-	if (in_lock_functions(pc)) {
+	if (!user_mode(regs) && in_lock_functions(pc)) {
 		char *v = *(char **)regs->rsp;
 		if ((v >= _stext && v <= _etext) ||
 		    (v >= _sinittext && v <= _einittext) ||
@@ -953,11 +954,18 @@ __cpuinit int unsynchronized_tsc(void)
 #ifdef CONFIG_SMP
 	if (apic_is_clustered_box())
 		return 1;
-	/* Intel systems are normally all synchronized. Exceptions
-	   are handled in the check above. */
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
-		return 0;
 #endif
+	/* Most Intel systems have synchronized TSCs except for
+	   multi-node systems */
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+#ifdef CONFIG_ACPI
+		/* But TSC doesn't tick in C3 so don't use it there */
+		if (acpi_fadt.length > 0 && acpi_fadt.plvl3_lat < 100)
+			return 1;
+#endif
+		return 0;
+	}
+
 	/* Assume multi socket systems are not synchronized */
 	return num_present_cpus() > 1;
 }
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 5a5311d3de0..b1249774d1e 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -76,13 +76,13 @@ int register_die_notifier(struct notifier_block *nb)
 	vmalloc_sync_all();
 	return atomic_notifier_chain_register(&die_chain, nb);
 }
-EXPORT_SYMBOL(register_die_notifier);
+EXPORT_SYMBOL(register_die_notifier); /* used modularly by kdb */
 
 int unregister_die_notifier(struct notifier_block *nb)
 {
 	return atomic_notifier_chain_unregister(&die_chain, nb);
 }
-EXPORT_SYMBOL(unregister_die_notifier);
+EXPORT_SYMBOL(unregister_die_notifier); /* used modularly by kdb */
 
 static inline void conditional_sti(struct pt_regs *regs)
 {
@@ -107,31 +107,38 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 }
 
 static int kstack_depth_to_print = 12;
+#ifdef CONFIG_STACK_UNWIND
 static int call_trace = 1;
+#else
+#define call_trace (-1)
+#endif
 
 #ifdef CONFIG_KALLSYMS
-#include <linux/kallsyms.h>
-int printk_address(unsigned long address)
-{
+# include <linux/kallsyms.h>
+void printk_address(unsigned long address)
+{
 	unsigned long offset = 0, symsize;
 	const char *symname;
 	char *modname;
-	char *delim = ":";
+	char *delim = ":";
 	char namebuf[128];
 
-	symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf);
-	if (!symname)
-		return printk("[<%016lx>]", address);
-	if (!modname)
+	symname = kallsyms_lookup(address, &symsize, &offset,
+					&modname, namebuf);
+	if (!symname) {
+		printk(" [<%016lx>]\n", address);
+		return;
+	}
+	if (!modname)
 		modname = delim = "";
-	return printk("<%016lx>{%s%s%s%s%+ld}",
-		address, delim, modname, delim, symname, offset);
-}
+	printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n",
+		address, delim, modname, delim, symname, offset, symsize);
+}
 #else
-int printk_address(unsigned long address)
-{
-	return printk("[<%016lx>]", address);
-}
+void printk_address(unsigned long address)
+{
+	printk(" [<%016lx>]\n", address);
+}
 #endif
 
@@ -149,32 +156,68 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
 	};
 	unsigned k;
 
+	/*
+	 * Iterate over all exception stacks, and figure out whether
+	 * 'stack' is in one of them:
+	 */
 	for (k = 0; k < N_EXCEPTION_STACKS; k++) {
 		unsigned long end;
 
+		/*
+		 * set 'end' to the end of the exception stack.
+		 */
 		switch (k + 1) {
+		/*
+		 * TODO: this block is not needed, I think, because
+		 * setup64.c:cpu_init() sets up t->ist[DEBUG_STACK]
+		 * properly too.
+		 */
#if DEBUG_STKSZ > EXCEPTION_STKSZ
 		case DEBUG_STACK:
 			end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ;
 			break;
 #endif
 		default:
-			end = per_cpu(init_tss, cpu).ist[k];
+			end = per_cpu(orig_ist, cpu).ist[k];
 			break;
 		}
+		/*
+		 * Is 'stack' above this exception frame's end?
+		 * If yes then skip to the next frame.
+		 */
 		if (stack >= end)
 			continue;
+		/*
+		 * Is 'stack' above this exception frame's start address?
+		 * If yes then we found the right frame.
+		 */
 		if (stack >= end - EXCEPTION_STKSZ) {
+			/*
+			 * Make sure we only iterate through an exception
+			 * stack once. If it comes up for the second time
+			 * then there's something wrong going on - just
+			 * break out and return NULL:
+			 */
 			if (*usedp & (1U << k))
 				break;
 			*usedp |= 1U << k;
 			*idp = ids[k];
 			return (unsigned long *)end;
 		}
+		/*
+		 * If this is a debug stack, and if it has a larger size than
+		 * the usual exception stacks, then 'stack' might still
+		 * be within the lower portion of the debug stack:
+		 */
 #if DEBUG_STKSZ > EXCEPTION_STKSZ
 		if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
 			unsigned j = N_EXCEPTION_STACKS - 1;
 
+			/*
+			 * Black magic. A large debug stack is composed of
+			 * multiple exception stack entries, which we
+			 * iterate through now. Don't look:
+			 */
 			do {
 				++j;
 				end -= EXCEPTION_STKSZ;
@@ -193,20 +236,14 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
 
 static int show_trace_unwind(struct unwind_frame_info *info, void *context)
 {
-	int i = 11, n = 0;
+	int n = 0;
 
 	while (unwind(info) == 0 && UNW_PC(info)) {
-		++n;
-		if (i > 50) {
-			printk("\n       ");
-			i = 7;
-		} else
-			i += printk(" ");
-		i += printk_address(UNW_PC(info));
+		n++;
+		printk_address(UNW_PC(info));
 		if (arch_unw_user_mode(info))
 			break;
 	}
-	printk("\n");
 	return n;
 }
 
@@ -221,10 +258,9 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack)
 {
 	const unsigned cpu = safe_smp_processor_id();
 	unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
-	int i = 11;
 	unsigned used = 0;
 
-	printk("\nCall Trace:");
+	printk("\nCall Trace:\n");
 
 	if (!tsk)
 		tsk = current;
@@ -243,23 +279,31 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack)
 			unw_ret = show_trace_unwind(&info, NULL);
 		}
 		if (unw_ret > 0) {
-			if (call_trace > 0)
+			if (call_trace == 1 && !arch_unw_user_mode(&info)) {
+				print_symbol("DWARF2 unwinder stuck at %s\n",
+					     UNW_PC(&info));
+				if ((long)UNW_SP(&info) < 0) {
+					printk("Leftover inexact backtrace:\n");
+					stack = (unsigned long *)UNW_SP(&info);
+				} else
+					printk("Full inexact backtrace again:\n");
+			} else if (call_trace >= 1)
 				return;
-			printk("Legacy call trace:");
-			i = 18;
-		}
+			else
+				printk("Full inexact backtrace again:\n");
+		} else
+			printk("Inexact backtrace:\n");
 	}
 
+	/*
+	 * Print function call entries within a stack. 'cond' is the
+	 * "end of stackframe" condition, that the 'stack++'
+	 * iteration will eventually trigger.
+	 */
 #define HANDLE_STACK(cond) \
 	do while (cond) { \
 		unsigned long addr = *stack++; \
 		if (kernel_text_address(addr)) { \
-			if (i > 50) { \
-				printk("\n       "); \
-				i = 0; \
-			} \
-			else \
-				i += printk(" "); \
 			/* \
 			 * If the address is either in the text segment of the \
 			 * kernel, or in the region which contains vmalloc'ed \
@@ -268,20 +312,30 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack)
 			 * down the cause of the crash will be able to figure \
 			 * out the call path that was taken. \
 			 */ \
-			i += printk_address(addr); \
+			printk_address(addr); \
 		} \
 	} while (0)
 
-	for(; ; ) {
+	/*
+	 * Print function call entries in all stacks, starting at the
+	 * current stack address. If the stacks consist of nested
+	 * exceptions
+	 */
+	for ( ; ; ) {
 		const char *id;
 		unsigned long *estack_end;
 		estack_end = in_exception_stack(cpu, (unsigned long)stack,
 						&used, &id);
 
 		if (estack_end) {
-			i += printk(" <%s>", id);
+			printk(" <%s>", id);
 			HANDLE_STACK (stack < estack_end);
-			i += printk(" <EOE>");
+			printk(" <EOE>");
+			/*
+			 * We link to the next stack via the
+			 * second-to-last pointer (index -2 to end) in the
+			 * exception stack:
+			 */
 			stack = (unsigned long *) estack_end[-2];
 			continue;
 		}
@@ -291,19 +345,28 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack)
 			(IRQSTACKSIZE - 64) / sizeof(*irqstack);
 
 		if (stack >= irqstack && stack < irqstack_end) {
-			i += printk(" <IRQ>");
+			printk(" <IRQ>");
 			HANDLE_STACK (stack < irqstack_end);
+			/*
+			 * We link to the next stack (which would be
+			 * the process stack normally) via the last
+			 * pointer (index -1 to end) in the IRQ stack:
+			 */
 			stack = (unsigned long *) (irqstack_end[-1]);
 			irqstack_end = NULL;
-			i += printk(" <EOI>");
+			printk(" <EOI>");
 			continue;
 		}
 		}
 		break;
 	}
 
+	/*
+	 * This prints the process stack:
+	 */
 	HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0);
 #undef HANDLE_STACK
+
+	printk("\n");
 }
 
@@ -337,8 +400,8 @@ static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
 			break;
 		}
 		if (i && ((i % 4) == 0))
-			printk("\n       ");
-		printk("%016lx ", *stack++);
+			printk("\n");
+		printk(" %016lx", *stack++);
 		touch_nmi_watchdog();
 	}
 	show_trace(tsk, regs, rsp);
@@ -470,7 +533,7 @@ void __kprobes oops_end(unsigned long flags)
 		/* Nest count reaches zero, release the lock. */
 		spin_unlock_irqrestore(&die_lock, flags);
 	if (panic_on_oops)
-		panic("Oops");
+		panic("Fatal exception");
 }
 
 void __kprobes __die(const char * str, struct pt_regs * regs, long err)
@@ -1061,14 +1124,18 @@ static int __init kstack_setup(char *s)
 }
 __setup("kstack=", kstack_setup);
 
+#ifdef CONFIG_STACK_UNWIND
 static int __init call_trace_setup(char *s)
 {
 	if (strcmp(s, "old") == 0)
 		call_trace = -1;
 	else if (strcmp(s, "both") == 0)
 		call_trace = 0;
-	else if (strcmp(s, "new") == 0)
+	else if (strcmp(s, "newfallback") == 0)
 		call_trace = 1;
+	else if (strcmp(s, "new") == 0)
+		call_trace = 2;
 	return 1;
 }
 __setup("call_trace=", call_trace_setup);
+#endif
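
To make the printk_address() change in traps.c concrete, the same (illustrative) address renders as follows under the old and new format strings:

	/*
	 * old: "<%016lx>{%s%s%s%s%+ld}"           -> <ffffffff8010f3a2>{do_nmi+50}
	 * new: " [<%016lx>] %s%s%s%s+0x%lx/0x%lx" ->  [<ffffffff8010f3a2>] do_nmi+0x32/0x70
	 *
	 * i.e. one entry per line, with offset and symbol size in hex - which
	 * is why show_trace() no longer tracks a line-length counter 'i'.
	 */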