diff options
-rw-r--r-- | arch/powerpc/include/asm/pgtable.h | 6 | ||||
-rw-r--r-- | arch/powerpc/kernel/Makefile | 2 | ||||
-rw-r--r-- | arch/powerpc/kernel/asm-offsets.c | 2 | ||||
-rw-r--r-- | arch/powerpc/kernel/exceptions-64s.S | 19 | ||||
-rw-r--r-- | arch/powerpc/kernel/perf_callchain.c | 527 | ||||
-rw-r--r-- | arch/powerpc/mm/slb.c | 37 | ||||
-rw-r--r-- | arch/powerpc/mm/stab.c | 11 | ||||
-rw-r--r-- | include/linux/perf_counter.h | 5 | ||||
-rw-r--r-- | kernel/perf_counter.c | 143 | ||||
-rw-r--r-- | tools/perf/Documentation/Makefile | 2 | ||||
-rw-r--r-- | tools/perf/Documentation/examples.txt (renamed from tools/perf/Documentation/perf-examples.txt) | 0 | ||||
-rw-r--r-- | tools/perf/builtin-annotate.c | 14 | ||||
-rw-r--r-- | tools/perf/builtin-report.c | 32 | ||||
-rw-r--r-- | tools/perf/builtin-top.c | 17 | ||||
-rw-r--r-- | tools/perf/util/thread.c | 25 | ||||
-rw-r--r-- | tools/perf/util/util.h | 4 |
16 files changed, 798 insertions, 48 deletions
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index eb17da78112..2a5da069714 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -104,8 +104,8 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, else pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte)); -#elif defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT) && defined(CONFIG_SMP) - /* Second case is 32-bit with 64-bit PTE in SMP mode. In this case, we +#elif defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT) + /* Second case is 32-bit with 64-bit PTE. In this case, we * can just store as long as we do the two halves in the right order * with a barrier in between. This is possible because we take care, * in the hash code, to pre-invalidate if the PTE was already hashed, @@ -140,7 +140,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, #else /* Anything else just stores the PTE normally. That covers all 64-bit - * cases, and 32-bit non-hash with 64-bit PTEs in UP mode + * cases, and 32-bit non-hash with 32-bit PTEs. */ *ptep = pte; #endif diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index b73396b9390..9619285f64e 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -97,7 +97,7 @@ obj64-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o -obj-$(CONFIG_PPC_PERF_CTRS) += perf_counter.o +obj-$(CONFIG_PPC_PERF_CTRS) += perf_counter.o perf_callchain.o obj64-$(CONFIG_PPC_PERF_CTRS) += power4-pmu.o ppc970-pmu.o power5-pmu.o \ power5+-pmu.o power6-pmu.o power7-pmu.o obj32-$(CONFIG_PPC_PERF_CTRS) += mpc7450-pmu.o diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 561b6465231..197b15646ee 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -67,6 +67,8 @@ int main(void) DEFINE(MMCONTEXTID, offsetof(struct mm_struct, context.id)); #ifdef CONFIG_PPC64 DEFINE(AUDITCONTEXT, offsetof(struct task_struct, audit_context)); + DEFINE(SIGSEGV, SIGSEGV); + DEFINE(NMI_MASK, NMI_MASK); #else DEFINE(THREAD_INFO, offsetof(struct task_struct, stack)); #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index eb898112e57..8ac85e08ffa 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -729,6 +729,11 @@ BEGIN_FTR_SECTION bne- do_ste_alloc /* If so handle it */ END_FTR_SECTION_IFCLR(CPU_FTR_SLB) + clrrdi r11,r1,THREAD_SHIFT + lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */ + andis. r0,r0,NMI_MASK@h /* (i.e. an irq when soft-disabled) */ + bne 77f /* then don't call hash_page now */ + /* * On iSeries, we soft-disable interrupts here, then * hard-enable interrupts so that the hash_page code can spin on @@ -833,6 +838,20 @@ handle_page_fault: bl .low_hash_fault b .ret_from_except +/* + * We come here as a result of a DSI at a point where we don't want + * to call hash_page, such as when we are accessing memory (possibly + * user memory) inside a PMU interrupt that occurred while interrupts + * were soft-disabled. We want to invoke the exception handler for + * the access, or panic if there isn't a handler. + */ +77: bl .save_nvgprs + mr r4,r3 + addi r3,r1,STACK_FRAME_OVERHEAD + li r5,SIGSEGV + bl .bad_page_fault + b .ret_from_except + /* here we have a segment miss */ do_ste_alloc: bl .ste_allocate /* try to insert stab entry */ diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c new file mode 100644 index 00000000000..f74b62c6751 --- /dev/null +++ b/arch/powerpc/kernel/perf_callchain.c @@ -0,0 +1,527 @@ +/* + * Performance counter callchain support - powerpc architecture code + * + * Copyright © 2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/perf_counter.h> +#include <linux/percpu.h> +#include <linux/uaccess.h> +#include <linux/mm.h> +#include <asm/ptrace.h> +#include <asm/pgtable.h> +#include <asm/sigcontext.h> +#include <asm/ucontext.h> +#include <asm/vdso.h> +#ifdef CONFIG_PPC64 +#include "ppc32.h" +#endif + +/* + * Store another value in a callchain_entry. + */ +static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip) +{ + unsigned int nr = entry->nr; + + if (nr < PERF_MAX_STACK_DEPTH) { + entry->ip[nr] = ip; + entry->nr = nr + 1; + } +} + +/* + * Is sp valid as the address of the next kernel stack frame after prev_sp? + * The next frame may be in a different stack area but should not go + * back down in the same stack area. + */ +static int valid_next_sp(unsigned long sp, unsigned long prev_sp) +{ + if (sp & 0xf) + return 0; /* must be 16-byte aligned */ + if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD)) + return 0; + if (sp >= prev_sp + STACK_FRAME_OVERHEAD) + return 1; + /* + * sp could decrease when we jump off an interrupt stack + * back to the regular process stack. + */ + if ((sp & ~(THREAD_SIZE - 1)) != (prev_sp & ~(THREAD_SIZE - 1))) + return 1; + return 0; +} + +static void perf_callchain_kernel(struct pt_regs *regs, + struct perf_callchain_entry *entry) +{ + unsigned long sp, next_sp; + unsigned long next_ip; + unsigned long lr; + long level = 0; + unsigned long *fp; + + lr = regs->link; + sp = regs->gpr[1]; + callchain_store(entry, PERF_CONTEXT_KERNEL); + callchain_store(entry, regs->nip); + + if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD)) + return; + + for (;;) { + fp = (unsigned long *) sp; + next_sp = fp[0]; + + if (next_sp == sp + STACK_INT_FRAME_SIZE && + fp[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) { + /* + * This looks like an interrupt frame for an + * interrupt that occurred in the kernel + */ + regs = (struct pt_regs *)(sp + STACK_FRAME_OVERHEAD); + next_ip = regs->nip; + lr = regs->link; + level = 0; + callchain_store(entry, PERF_CONTEXT_KERNEL); + + } else { + if (level == 0) + next_ip = lr; + else + next_ip = fp[STACK_FRAME_LR_SAVE]; + + /* + * We can't tell which of the first two addresses + * we get are valid, but we can filter out the + * obviously bogus ones here. We replace them + * with 0 rather than removing them entirely so + * that userspace can tell which is which. + */ + if ((level == 1 && next_ip == lr) || + (level <= 1 && !kernel_text_address(next_ip))) + next_ip = 0; + + ++level; + } + + callchain_store(entry, next_ip); + if (!valid_next_sp(next_sp, sp)) + return; + sp = next_sp; + } +} + +#ifdef CONFIG_PPC64 + +#ifdef CONFIG_HUGETLB_PAGE +#define is_huge_psize(pagesize) (HPAGE_SHIFT && mmu_huge_psizes[pagesize]) +#else +#define is_huge_psize(pagesize) 0 +#endif + +/* + * On 64-bit we don't want to invoke hash_page on user addresses from + * interrupt context, so if the access faults, we read the page tables + * to find which page (if any) is mapped and access it directly. + */ +static int read_user_stack_slow(void __user *ptr, void *ret, int nb) +{ + pgd_t *pgdir; + pte_t *ptep, pte; + int pagesize; + unsigned long addr = (unsigned long) ptr; + unsigned long offset; + unsigned long pfn; + void *kaddr; + + pgdir = current->mm->pgd; + if (!pgdir) + return -EFAULT; + + pagesize = get_slice_psize(current->mm, addr); + + /* align address to page boundary */ + offset = addr & ((1ul << mmu_psize_defs[pagesize].shift) - 1); + addr -= offset; + + if (is_huge_psize(pagesize)) + ptep = huge_pte_offset(current->mm, addr); + else + ptep = find_linux_pte(pgdir, addr); + + if (ptep == NULL) + return -EFAULT; + pte = *ptep; + if (!pte_present(pte) || !(pte_val(pte) & _PAGE_USER)) + return -EFAULT; + pfn = pte_pfn(pte); + if (!page_is_ram(pfn)) + return -EFAULT; + + /* no highmem to worry about here */ + kaddr = pfn_to_kaddr(pfn); + memcpy(ret, kaddr + offset, nb); + return 0; +} + +static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret) +{ + if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned long) || + ((unsigned long)ptr & 7)) + return -EFAULT; + + if (!__get_user_inatomic(*ret, ptr)) + return 0; + + return read_user_stack_slow(ptr, ret, 8); +} + +static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) +{ + if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) || + ((unsigned long)ptr & 3)) + return -EFAULT; + + if (!__get_user_inatomic(*ret, ptr)) + return 0; + + return read_user_stack_slow(ptr, ret, 4); +} + +static inline int valid_user_sp(unsigned long sp, int is_64) +{ + if (!sp || (sp & 7) || sp > (is_64 ? TASK_SIZE : 0x100000000UL) - 32) + return 0; + return 1; +} + +/* + * 64-bit user processes use the same stack frame for RT and non-RT signals. + */ +struct signal_frame_64 { + char dummy[__SIGNAL_FRAMESIZE]; + struct ucontext uc; + unsigned long unused[2]; + unsigned int tramp[6]; + struct siginfo *pinfo; + void *puc; + struct siginfo info; + char abigap[288]; +}; + +static int is_sigreturn_64_address(unsigned long nip, unsigned long fp) +{ + if (nip == fp + offsetof(struct signal_frame_64, tramp)) + return 1; + if (vdso64_rt_sigtramp && current->mm->context.vdso_base && + nip == current->mm->context.vdso_base + vdso64_rt_sigtramp) + return 1; + return 0; +} + +/* + * Do some sanity checking on the signal frame pointed to by sp. + * We check the pinfo and puc pointers in the frame. + */ +static int sane_signal_64_frame(unsigned long sp) +{ + struct signal_frame_64 __user *sf; + unsigned long pinfo, puc; + + sf = (struct signal_frame_64 __user *) sp; + if (read_user_stack_64((unsigned long __user *) &sf->pinfo, &pinfo) || + read_user_stack_64((unsigned long __user *) &sf->puc, &puc)) + return 0; + return pinfo == (unsigned long) &sf->info && + puc == (unsigned long) &sf->uc; +} + +static void perf_callchain_user_64(struct pt_regs *regs, + struct perf_callchain_entry *entry) +{ + unsigned long sp, next_sp; + unsigned long next_ip; + unsigned long lr; + long level = 0; + struct signal_frame_64 __user *sigframe; + unsigned long __user *fp, *uregs; + + next_ip = regs->nip; + lr = regs->link; + sp = regs->gpr[1]; + callchain_store(entry, PERF_CONTEXT_USER); + callchain_store(entry, next_ip); + + for (;;) { + fp = (unsigned long __user *) sp; + if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp)) + return; + if (level > 0 && read_user_stack_64(&fp[2], &next_ip)) + return; + + /* + * Note: the next_sp - sp >= signal frame size check + * is true when next_sp < sp, which can happen when + * transitioning from an alternate signal stack to the + * normal stack. + */ + if (next_sp - sp >= sizeof(struct signal_frame_64) && + (is_sigreturn_64_address(next_ip, sp) || + (level <= 1 && is_sigreturn_64_address(lr, sp))) && + sane_signal_64_frame(sp)) { + /* + * This looks like an signal frame + */ + sigframe = (struct signal_frame_64 __user *) sp; + uregs = sigframe->uc.uc_mcontext.gp_regs; + if (read_user_stack_64(&uregs[PT_NIP], &next_ip) || + read_user_stack_64(&uregs[PT_LNK], &lr) || + read_user_stack_64(&uregs[PT_R1], &sp)) + return; + level = 0; + callchain_store(entry, PERF_CONTEXT_USER); + callchain_store(entry, next_ip); + continue; + } + + if (level == 0) + next_ip = lr; + callchain_store(entry, next_ip); + ++level; + sp = next_sp; + } +} + +static inline int current_is_64bit(void) +{ + /* + * We can't use test_thread_flag() here because we may be on an + * interrupt stack, and the thread flags don't get copied over + * from the thread_info on the main stack to the interrupt stack. + */ + return !test_ti_thread_flag(task_thread_info(current), TIF_32BIT); +} + +#else /* CONFIG_PPC64 */ +/* + * On 32-bit we just access the address and let hash_page create a + * HPTE if necessary, so there is no need to fall back to reading + * the page tables. Since this is called at interrupt level, + * do_page_fault() won't treat a DSI as a page fault. + */ +static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) +{ + if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) || + ((unsigned long)ptr & 3)) + return -EFAULT; + + return __get_user_inatomic(*ret, ptr); +} + +static inline void perf_callchain_user_64(struct pt_regs *regs, + struct perf_callchain_entry *entry) +{ +} + +static inline int current_is_64bit(void) +{ + return 0; +} + +static inline int valid_user_sp(unsigned long sp, int is_64) +{ + if (!sp || (sp & 7) || sp > TASK_SIZE - 32) + return 0; + return 1; +} + +#define __SIGNAL_FRAMESIZE32 __SIGNAL_FRAMESIZE +#define sigcontext32 sigcontext +#define mcontext32 mcontext +#define ucontext32 ucontext +#define compat_siginfo_t struct siginfo + +#endif /* CONFIG_PPC64 */ + +/* + * Layout for non-RT signal frames + */ +struct signal_frame_32 { + char dummy[__SIGNAL_FRAMESIZE32]; + struct sigcontext32 sctx; + struct mcontext32 mctx; + int abigap[56]; +}; + +/* + * Layout for RT signal frames + */ +struct rt_signal_frame_32 { + char dummy[__SIGNAL_FRAMESIZE32 + 16]; + compat_siginfo_t info; + struct ucontext32 uc; + int abigap[56]; +}; + +static int is_sigreturn_32_address(unsigned int nip, unsigned int fp) +{ + if (nip == fp + offsetof(struct signal_frame_32, mctx.mc_pad)) + return 1; + if (vdso32_sigtramp && current->mm->context.vdso_base && + nip == current->mm->context.vdso_base + vdso32_sigtramp) + return 1; + return 0; +} + +static int is_rt_sigreturn_32_address(unsigned int nip, unsigned int fp) +{ + if (nip == fp + offsetof(struct rt_signal_frame_32, + uc.uc_mcontext.mc_pad)) + return 1; + if (vdso32_rt_sigtramp && current->mm->context.vdso_base && + nip == current->mm->context.vdso_base + vdso32_rt_sigtramp) + return 1; + return 0; +} + +static int sane_signal_32_frame(unsigned int sp) +{ + struct signal_frame_32 __user *sf; + unsigned int regs; + + sf = (struct signal_frame_32 __user *) (unsigned long) sp; + if (read_user_stack_32((unsigned int __user *) &sf->sctx.regs, ®s)) + return 0; + return regs == (unsigned long) &sf->mctx; +} + +static int sane_rt_signal_32_frame(unsigned int sp) +{ + struct rt_signal_frame_32 __user *sf; + unsigned int regs; + + sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp; + if (read_user_stack_32((unsigned int __user *) &sf->uc.uc_regs, ®s)) + return 0; + return regs == (unsigned long) &sf->uc.uc_mcontext; +} + +static unsigned int __user *signal_frame_32_regs(unsigned int sp, + unsigned int next_sp, unsigned int next_ip) +{ + struct mcontext32 __user *mctx = NULL; + struct signal_frame_32 __user *sf; + struct rt_signal_frame_32 __user *rt_sf; + + /* + * Note: the next_sp - sp >= signal frame size check + * is true when next_sp < sp, for example, when + * transitioning from an alternate signal stack to the + * normal stack. + */ + if (next_sp - sp >= sizeof(struct signal_frame_32) && + is_sigreturn_32_address(next_ip, sp) && + sane_signal_32_frame(sp)) { + sf = (struct signal_frame_32 __user *) (unsigned long) sp; + mctx = &sf->mctx; + } + + if (!mctx && next_sp - sp >= sizeof(struct rt_signal_frame_32) && + is_rt_sigreturn_32_address(next_ip, sp) && + sane_rt_signal_32_frame(sp)) { + rt_sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp; + mctx = &rt_sf->uc.uc_mcontext; + } + + if (!mctx) + return NULL; + return mctx->mc_gregs; +} + +static void perf_callchain_user_32(struct pt_regs *regs, + struct perf_callchain_entry *entry) +{ + unsigned int sp, next_sp; + unsigned int next_ip; + unsigned int lr; + long level = 0; + unsigned int __user *fp, *uregs; + + next_ip = regs->nip; + lr = regs->link; + sp = regs->gpr[1]; + callchain_store(entry, PERF_CONTEXT_USER); + callchain_store(entry, next_ip); + + while (entry->nr < PERF_MAX_STACK_DEPTH) { + fp = (unsigned int __user *) (unsigned long) sp; + if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp)) + return; + if (level > 0 && read_user_stack_32(&fp[1], &next_ip)) + return; + + uregs = signal_frame_32_regs(sp, next_sp, next_ip); + if (!uregs && level <= 1) + uregs = signal_frame_32_regs(sp, next_sp, lr); + if (uregs) { + /* + * This looks like an signal frame, so restart + * the stack trace with the values in it. + */ + if (read_user_stack_32(&uregs[PT_NIP], &next_ip) || + read_user_stack_32(&uregs[PT_LNK], &lr) || + read_user_stack_32(&uregs[PT_R1], &sp)) + return; + level = 0; + callchain_store(entry, PERF_CONTEXT_USER); + callchain_store(entry, next_ip); + continue; + } + + if (level == 0) + next_ip = lr; + callchain_store(entry, next_ip); + ++level; + sp = next_sp; + } +} + +/* + * Since we can't get PMU interrupts inside a PMU interrupt handler, + * we don't need separate irq and nmi entries here. + */ +static DEFINE_PER_CPU(struct perf_callchain_entry, callchain); + +struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + struct perf_callchain_entry *entry = &__get_cpu_var(callchain); + + entry->nr = 0; + + if (current->pid == 0) /* idle task? */ + return entry; + + if (!user_mode(regs)) { + perf_callchain_kernel(regs, entry); + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) { + if (current_is_64bit()) + perf_callchain_user_64(regs, entry); + else + perf_callchain_user_32(regs, entry); + } + + return entry; +} diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 5b7038f248b..a685652effe 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -92,15 +92,13 @@ static inline void create_shadowed_slbe(unsigned long ea, int ssize, : "memory" ); } -void slb_flush_and_rebolt(void) +static void __slb_flush_and_rebolt(void) { /* If you change this make sure you change SLB_NUM_BOLTED * appropriately too. */ unsigned long linear_llp, vmalloc_llp, lflags, vflags; unsigned long ksp_esid_data, ksp_vsid_data; - WARN_ON(!irqs_disabled()); - linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp; lflags = SLB_VSID_KERNEL | linear_llp; @@ -117,12 +115,6 @@ void slb_flush_and_rebolt(void) ksp_vsid_data = get_slb_shadow()->save_area[2].vsid; } - /* - * We can't take a PMU exception in the following code, so hard - * disable interrupts. - */ - hard_irq_disable(); - /* We need to do this all in asm, so we're sure we don't touch * the stack between the slbia and rebolting it. */ asm volatile("isync\n" @@ -139,6 +131,21 @@ void slb_flush_and_rebolt(void) : "memory"); } +void slb_flush_and_rebolt(void) +{ + + WARN_ON(!irqs_disabled()); + + /* + * We can't take a PMU exception in the following code, so hard + * disable interrupts. + */ + hard_irq_disable(); + + __slb_flush_and_rebolt(); + get_paca()->slb_cache_ptr = 0; +} + void slb_vmalloc_update(void) { unsigned long vflags; @@ -180,12 +187,20 @@ static inline int esids_match(unsigned long addr1, unsigned long addr2) /* Flush all user entries from the segment table of the current processor. */ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) { - unsigned long offset = get_paca()->slb_cache_ptr; + unsigned long offset; unsigned long slbie_data = 0; unsigned long pc = KSTK_EIP(tsk); unsigned long stack = KSTK_ESP(tsk); unsigned long unmapped_base; + /* + * We need interrupts hard-disabled here, not just soft-disabled, + * so that a PMU interrupt can't occur, which might try to access + * user memory (to get a stack trace) and possible cause an SLB miss + * which would update the slb_cache/slb_cache_ptr fields in the PACA. + */ + hard_irq_disable(); + offset = get_paca()->slb_cache_ptr; if (!cpu_has_feature(CPU_FTR_NO_SLBIE_B) && offset <= SLB_CACHE_ENTRIES) { int i; @@ -200,7 +215,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) } asm volatile("isync" : : : "memory"); } else { - slb_flush_and_rebolt(); + __slb_flush_and_rebolt(); } /* Workaround POWER5 < DD2.1 issue */ diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c index 98cd1dc2ae7..ab5fb48b3e9 100644 --- a/arch/powerpc/mm/stab.c +++ b/arch/powerpc/mm/stab.c @@ -164,7 +164,7 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm) { struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr; struct stab_entry *ste; - unsigned long offset = __get_cpu_var(stab_cache_ptr); + unsigned long offset; unsigned long pc = KSTK_EIP(tsk); unsigned long stack = KSTK_ESP(tsk); unsigned long unmapped_base; @@ -172,6 +172,15 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm) /* Force previous translations to complete. DRENG */ asm volatile("isync" : : : "memory"); + /* + * We need interrupts hard-disabled here, not just soft-disabled, + * so that a PMU interrupt can't occur, which might try to access + * user memory (to get a stack trace) and possible cause an STAB miss + * which would update the stab_cache/stab_cache_ptr per-cpu variables. + */ + hard_irq_disable(); + + offset = __get_cpu_var(stab_cache_ptr); if (offset <= NR_STAB_CACHE_ENTRIES) { int i; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index b53f7006cc4..e022b847c90 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -216,6 +216,7 @@ struct perf_counter_attr { #define PERF_COUNTER_IOC_REFRESH _IO ('$', 2) #define PERF_COUNTER_IOC_RESET _IO ('$', 3) #define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64) +#define PERF_COUNTER_IOC_SET_OUTPUT _IO ('$', 5) enum perf_counter_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, @@ -415,6 +416,9 @@ enum perf_callchain_context { PERF_CONTEXT_MAX = (__u64)-4095, }; +#define PERF_FLAG_FD_NO_GROUP (1U << 0) +#define PERF_FLAG_FD_OUTPUT (1U << 1) + #ifdef __KERNEL__ /* * Kernel-internal data types and definitions: @@ -536,6 +540,7 @@ struct perf_counter { struct list_head sibling_list; int nr_siblings; struct perf_counter *group_leader; + struct perf_counter *output; const struct pmu *pmu; enum perf_counter_active_state state; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 534e20d14d6..53abcbefa0b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -469,7 +469,8 @@ static void update_counter_times(struct perf_counter *counter) struct perf_counter_context *ctx = counter->ctx; u64 run_end; - if (counter->state < PERF_COUNTER_STATE_INACTIVE) + if (counter->state < PERF_COUNTER_STATE_INACTIVE || + counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE) return; counter->total_time_enabled = ctx->time - counter->tstamp_enabled; @@ -518,7 +519,7 @@ static void __perf_counter_disable(void *info) */ if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { update_context_time(ctx); - update_counter_times(counter); + update_group_times(counter); if (counter == counter->group_leader) group_sched_out(counter, cpuctx, ctx); else @@ -573,7 +574,7 @@ static void perf_counter_disable(struct perf_counter *counter) * in, so we can change the state safely. */ if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_counter_times(counter); + update_group_times(counter); counter->state = PERF_COUNTER_STATE_OFF; } @@ -851,6 +852,27 @@ retry: } /* + * Put a counter into inactive state and update time fields. + * Enabling the leader of a group effectively enables all + * the group members that aren't explicitly disabled, so we + * have to update their ->tstamp_enabled also. + * Note: this works for group members as well as group leaders + * since the non-leader members' sibling_lists will be empty. + */ +static void __perf_counter_mark_enabled(struct perf_counter *counter, + struct perf_counter_context *ctx) +{ + struct perf_counter *sub; + + counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = ctx->time - counter->total_time_enabled; + list_for_each_entry(sub, &counter->sibling_list, list_entry) + if (sub->state >= PERF_COUNTER_STATE_INACTIVE) + sub->tstamp_enabled = + ctx->time - sub->total_time_enabled; +} + +/* * Cross CPU call to enable a performance counter */ static void __perf_counter_enable(void *info) @@ -877,8 +899,7 @@ static void __perf_counter_enable(void *info) if (counter->state >= PERF_COUNTER_STATE_INACTIVE) goto unlock; - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = ctx->time - counter->total_time_enabled; + __perf_counter_mark_enabled(counter, ctx); /* * If the counter is in a group and isn't the group leader, @@ -971,11 +992,9 @@ static void perf_counter_enable(struct perf_counter *counter) * Since we have the lock this context can't be scheduled * in, so we can change the state safely. */ - if (counter->state == PERF_COUNTER_STATE_OFF) { - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = - ctx->time - counter->total_time_enabled; - } + if (counter->state == PERF_COUNTER_STATE_OFF) + __perf_counter_mark_enabled(counter, ctx); + out: spin_unlock_irq(&ctx->lock); } @@ -1479,9 +1498,7 @@ static void perf_counter_enable_on_exec(struct task_struct *task) counter->attr.enable_on_exec = 0; if (counter->state >= PERF_COUNTER_STATE_INACTIVE) continue; - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = - ctx->time - counter->total_time_enabled; + __perf_counter_mark_enabled(counter, ctx); enabled = 1; } @@ -1503,10 +1520,21 @@ static void perf_counter_enable_on_exec(struct task_struct *task) */ static void __perf_counter_read(void *info) { + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; unsigned long flags; + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. In that case + * counter->count would have been updated to a recent sample + * when the counter was scheduled out. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + local_irq_save(flags); if (ctx->is_active) update_context_time(ctx); @@ -1664,6 +1692,11 @@ static void free_counter(struct perf_counter *counter) atomic_dec(&nr_task_counters); } + if (counter->output) { + fput(counter->output->filp); + counter->output = NULL; + } + if (counter->destroy) counter->destroy(counter); @@ -1780,7 +1813,7 @@ static int perf_counter_read_group(struct perf_counter *counter, size += err; list_for_each_entry(sub, &leader->sibling_list, list_entry) { - err = perf_counter_read_entry(counter, read_format, + err = perf_counter_read_entry(sub, read_format, buf + size); if (err < 0) return err; @@ -1949,6 +1982,8 @@ unlock: return ret; } +int perf_counter_set_output(struct perf_counter *counter, int output_fd); + static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct perf_counter *counter = file->private_data; @@ -1972,6 +2007,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_COUNTER_IOC_PERIOD: return perf_counter_period(counter, (u64 __user *)arg); + case PERF_COUNTER_IOC_SET_OUTPUT: + return perf_counter_set_output(counter, arg); + default: return -ENOTTY; } @@ -2008,6 +2046,10 @@ int perf_counter_task_disable(void) return 0; } +#ifndef PERF_COUNTER_INDEX_OFFSET +# define PERF_COUNTER_INDEX_OFFSET 0 +#endif + static int perf_counter_index(struct perf_counter *counter) { if (counter->state != PERF_COUNTER_STATE_ACTIVE) @@ -2238,6 +2280,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON_ONCE(counter->ctx->parent_ctx); mutex_lock(&counter->mmap_mutex); + if (counter->output) { + ret = -EINVAL; + goto unlock; + } + if (atomic_inc_not_zero(&counter->mmap_count)) { if (nr_pages != counter->data->nr_pages) ret = -EINVAL; @@ -2623,6 +2670,7 @@ static int perf_output_begin(struct perf_output_handle *handle, struct perf_counter *counter, unsigned int size, int nmi, int sample) { + struct perf_counter *output_counter; struct perf_mmap_data *data; unsigned int offset, head; int have_lost; @@ -2632,13 +2680,17 @@ static int perf_output_begin(struct perf_output_handle *handle, u64 lost; } lost_event; + rcu_read_lock(); /* * For inherited counters we send all the output towards the parent. */ if (counter->parent) counter = counter->parent; - rcu_read_lock(); + output_counter = rcu_dereference(counter->output); + if (output_counter) + counter = output_counter; + data = rcu_dereference(counter->data); if (!data) goto out; @@ -4186,6 +4238,57 @@ err_size: goto out; } +int perf_counter_set_output(struct perf_counter *counter, int output_fd) +{ + struct perf_counter *output_counter = NULL; + struct file *output_file = NULL; + struct perf_counter *old_output; + int fput_needed = 0; + int ret = -EINVAL; + + if (!output_fd) + goto set; + + output_file = fget_light(output_fd, &fput_needed); + if (!output_file) + return -EBADF; + + if (output_file->f_op != &perf_fops) + goto out; + + output_counter = output_file->private_data; + + /* Don't chain output fds */ + if (output_counter->output) + goto out; + + /* Don't set an output fd when we already have an output channel */ + if (counter->data) + goto out; + + atomic_long_inc(&output_file->f_count); + +set: + mutex_lock(&counter->mmap_mutex); + old_output = counter->output; + rcu_assign_pointer(counter->output, output_counter); + mutex_unlock(&counter->mmap_mutex); + + if (old_output) { + /* + * we need to make sure no existing perf_output_*() + * is still referencing this counter. + */ + synchronize_rcu(); + fput(old_output->filp); + } + + ret = 0; +out: + fput_light(output_file, fput_needed); + return ret; +} + /** * sys_perf_counter_open - open a performance counter, associate it to a task/cpu * @@ -4208,7 +4311,7 @@ SYSCALL_DEFINE5(perf_counter_open, int ret; /* for future expandability... */ - if (flags) + if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) return -EINVAL; ret = perf_copy_attr(attr_uptr, &attr); @@ -4236,7 +4339,7 @@ SYSCALL_DEFINE5(perf_counter_open, * Look up the group leader (we will attach this counter to it): */ group_leader = NULL; - if (group_fd != -1) { + if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) { ret = -EINVAL; group_file = fget_light(group_fd, &fput_needed); if (!group_file) @@ -4278,6 +4381,12 @@ SYSCALL_DEFINE5(perf_counter_open, if (!counter_file) goto err_free_put_context; + if (flags & PERF_FLAG_FD_OUTPUT) { + ret = perf_counter_set_output(counter, group_fd); + if (ret) + goto err_free_put_context; + } + counter->filp = counter_file; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile index 5457192e1b4..bdd3b7ecad0 100644 --- a/tools/perf/Documentation/Makefile +++ b/tools/perf/Documentation/Makefile @@ -35,7 +35,7 @@ man7dir=$(mandir)/man7 # DESTDIR= ASCIIDOC=asciidoc -ASCIIDOC_EXTRA = +ASCIIDOC_EXTRA = --unsafe MANPAGE_XSL = manpage-normal.xsl XMLTO_EXTRA = INSTALL?=install diff --git a/tools/perf/Documentation/perf-examples.txt b/tools/perf/Documentation/examples.txt index 8eb6c489fb1..8eb6c489fb1 100644 --- a/tools/perf/Documentation/perf-examples.txt +++ b/tools/perf/Documentation/examples.txt diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 96d421f7161..4c7bc443623 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -28,6 +28,7 @@ static char const *input_name = "perf.data"; static char default_sort_order[] = "comm,symbol"; static char *sort_order = default_sort_order; +static int force; static int input; static int show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV; @@ -629,6 +630,13 @@ process_fork_event(event_t *event, unsigned long offset, unsigned long head) (void *)(long)(event->header.size), event->fork.pid, event->fork.ppid); + /* + * A thread clone will have the same PID for both + * parent and child. + */ + if (thread == parent) + return 0; + if (!thread || !parent || thread__fork(thread, parent)) { dump_printf("problem processing PERF_EVENT_FORK, skipping event.\n"); return -1; @@ -976,6 +984,11 @@ static int __cmd_annotate(void) exit(-1); } + if (!force && input_stat.st_uid && (input_stat.st_uid != geteuid())) { + fprintf(stderr, "file: %s not owned by current user or root\n", input_name); + exit(-1); + } + if (!input_stat.st_size) { fprintf(stderr, "zero-sized file, nothing to do!\n"); exit(0); @@ -1081,6 +1094,7 @@ static const struct option options[] = { "input file name"), OPT_STRING('s', "symbol", &sym_hist_filter, "symbol", "symbol to annotate"), + OPT_BOOLEAN('f', "force", &force, "don't complain, do it"), OPT_BOOLEAN('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"), OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index ed1fdab3a1f..cdd46ab11bd 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -37,6 +37,7 @@ static char *dso_list_str, *comm_list_str, *sym_list_str, static struct strlist *dso_list, *comm_list, *sym_list; static char *field_sep; +static int force; static int input; static int show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV; @@ -665,6 +666,27 @@ static void dso__calc_col_width(struct dso *self) self->slen_calculated = 1; } +static int thread__set_comm_adjust(struct thread *self, const char *comm) +{ + int ret = thread__set_comm(self, comm); + + if (ret) + return ret; + + if (!col_width_list_str && !field_sep && + (!comm_list || strlist__has_entry(comm_list, comm))) { + unsigned int slen = strlen(comm); + + if (slen > comms__col_width) { + comms__col_width = slen; + threads__col_width = slen + 6; + } + } + + return 0; +} + + static struct symbol * resolve_symbol(struct thread *thread, struct map **mapp, struct dso **dsop, u64 *ipp) @@ -1056,7 +1078,7 @@ static void register_idle_thread(void) struct thread *thread = threads__findnew(0, &threads, &last_match); if (thread == NULL || - thread__set_comm(thread, "[idle]")) { + thread__set_comm_adjust(thread, "[idle]")) { fprintf(stderr, "problem inserting idle task.\n"); exit(-1); } @@ -1226,7 +1248,7 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head) event->comm.comm, event->comm.pid); if (thread == NULL || - thread__set_comm(thread, event->comm.comm)) { + thread__set_comm_adjust(thread, event->comm.comm)) { dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n"); return -1; } @@ -1383,6 +1405,11 @@ static int __cmd_report(void) exit(-1); } + if (!force && input_stat.st_uid && (input_stat.st_uid != geteuid())) { + fprintf(stderr, "file: %s not owned by current user or root\n", input_name); + exit(-1); + } + if (!input_stat.st_size) { fprintf(stderr, "zero-sized file, nothing to do!\n"); exit(0); @@ -1594,6 +1621,7 @@ static const struct option options[] = { OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"), OPT_STRING('k', "vmlinux", &vmlinux_name, "file", "vmlinux pathname"), + OPT_BOOLEAN('f', "force", &force, "don't complain, do it"), OPT_BOOLEAN('m', "modules", &modules, "load module symbols - WARNING: use only with -k and LIVE kernel"), OPT_BOOLEAN('n', "show-nr-samples", &show_nr_samples, diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 62b55ecab2c..4002ccb3675 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -483,11 +483,16 @@ static void print_sym_table(void) if (nr_counters == 1) printf(" samples pcnt"); else - printf(" weight samples pcnt"); + printf(" weight samples pcnt"); - printf(" RIP kernel function\n" - " ______ _______ _____ ________________ _______________\n\n" - ); + if (verbose) + printf(" RIP "); + printf(" kernel function\n"); + printf(" %s _______ _____", + nr_counters == 1 ? " " : "______"); + if (verbose) + printf(" ________________"); + printf(" _______________\n\n"); for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) { struct symbol *sym; @@ -508,7 +513,9 @@ static void print_sym_table(void) printf("%9.1f %10ld - ", syme->weight, syme->snap_count); percent_color_fprintf(stdout, "%4.1f%%", pcnt); - printf(" - %016llx : %s", sym->start, sym->name); + if (verbose) + printf(" - %016llx", sym->start); + printf(" : %s", sym->name); if (sym->module) printf("\t[%s]", sym->module->name); printf("\n"); diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index 00c14b98d65..f98032c135c 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -4,6 +4,7 @@ #include <string.h> #include "thread.h" #include "util.h" +#include "debug.h" static struct thread *thread__new(pid_t pid) { @@ -85,9 +86,27 @@ void thread__insert_map(struct thread *self, struct map *map) list_for_each_entry_safe(pos, tmp, &self->maps, node) { if (map__overlap(pos, map)) { - list_del_init(&pos->node); - /* XXX leaks dsos */ - free(pos); + if (verbose >= 2) { + printf("overlapping maps:\n"); + map__fprintf(map, stdout); + map__fprintf(pos, stdout); + } + + if (map->start <= pos->start && map->end > pos->start) + pos->start = map->end; + + if (map->end >= pos->end && map->start < pos->end) + pos->end = map->start; + + if (verbose >= 2) { + printf("after collision:\n"); + map__fprintf(pos, stdout); + } + + if (pos->start >= pos->end) { + list_del_init(&pos->node); + free(pos); + } } } diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index c62ef9720c6..9de2329dd44 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -39,10 +39,6 @@ /* Approximation of the length of the decimal representation of this type. */ #define decimal_length(x) ((int)(sizeof(x) * 2.56 + 0.5) + 1) -#if !defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__USLC__) && !defined(_M_UNIX) -#define _XOPEN_SOURCE 600 /* glibc2 and AIX 5.3L need 500, OpenBSD needs 600 for S_ISLNK() */ -#define _XOPEN_SOURCE_EXTENDED 1 /* AIX 5.3L needs this */ -#endif #define _ALL_SOURCE 1 #define _GNU_SOURCE 1 #define _BSD_SOURCE 1 |