diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-01-15 11:26:35 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-01-15 11:26:35 -0800 |
commit | 83c2f912b43c3a7babbb6cb7ae2a5276c1ed2a3e (patch) | |
tree | eaa7f50dea154d9f19721db69c7adde64d48848f | |
parent | f0ed5b9a28536b8be2f578a9450cfa42ab31ccf8 (diff) | |
parent | 172d1b0b73256551f100fc00c69e356d047103f5 (diff) |
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (39 commits)
perf tools: Fix compile error on x86_64 Ubuntu
perf report: Fix --stdio output alignment when --showcpuutilization used
perf annotate: Get rid of field_sep check
perf annotate: Fix usage string
perf kmem: Fix a memory leak
perf kmem: Add missing closedir() calls
perf top: Add error message for EMFILE
perf test: Change type of '-v' option to INCR
perf script: Add missing closedir() calls
tracing: Fix compile error when static ftrace is enabled
recordmcount: Fix handling of elf64 big-endian objects.
perf tools: Add const.h to MANIFEST to make perf-tar-src-pkg work again
perf tools: Add support for guest/host-only profiling
perf kvm: Do guest-only counting by default
perf top: Don't update total_period on process_sample
perf hists: Stop using 'self' for struct hist_entry
perf hists: Rename total_session to total_period
x86: Add counter when debug stack is used with interrupts enabled
x86: Allow NMIs to hit breakpoints in i386
x86: Keep current stack in NMI breakpoints
...
29 files changed, 1262 insertions, 468 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index eb93fd0ec73..b29f3c41629 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2475,6 +2475,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted. stacktrace [FTRACE] Enabled the stack tracer on boot up. + stacktrace_filter=[function-list] + [FTRACE] Limit the functions that the stack tracer + will trace at boot up. function-list is a comma separated + list of functions. This list can be changed at run + time by the stack_trace_filter file in the debugfs + tracing directory. Note, this enables stack tracing + and the stacktrace above is not needed. + sti= [PARISC,HW] Format: <num> Set the STI (builtin display/keyboard on the HP-PARISC diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 078ad0caefc..b903d5ea394 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -101,6 +101,28 @@ extern void aout_dump_debugregs(struct user *dump); extern void hw_breakpoint_restore(void); +#ifdef CONFIG_X86_64 +DECLARE_PER_CPU(int, debug_stack_usage); +static inline void debug_stack_usage_inc(void) +{ + __get_cpu_var(debug_stack_usage)++; +} +static inline void debug_stack_usage_dec(void) +{ + __get_cpu_var(debug_stack_usage)--; +} +int is_debug_stack(unsigned long addr); +void debug_stack_set_zero(void); +void debug_stack_reset(void); +#else /* !X86_64 */ +static inline int is_debug_stack(unsigned long addr) { return 0; } +static inline void debug_stack_set_zero(void) { } +static inline void debug_stack_reset(void) { } +static inline void debug_stack_usage_inc(void) { } +static inline void debug_stack_usage_dec(void) { } +#endif /* X86_64 */ + + #endif /* __KERNEL__ */ #endif /* _ASM_X86_DEBUGREG_H */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 41935fadfdf..e95822d683f 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -35,6 +35,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in extern struct desc_ptr idt_descr; extern gate_desc idt_table[]; +extern struct desc_ptr nmi_idt_descr; +extern gate_desc nmi_idt_table[]; struct gdt_page { struct desc_struct gdt[GDT_ENTRIES]; @@ -307,6 +309,16 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) desc->limit = (limit >> 16) & 0xf; } +#ifdef CONFIG_X86_64 +static inline void set_nmi_gate(int gate, void *addr) +{ + gate_desc s; + + pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS); + write_idt_entry(nmi_idt_table, gate, &s); +} +#endif + static inline void _set_gate(int gate, unsigned type, void *addr, unsigned dpl, unsigned ist, unsigned seg) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 850f2963a42..d43cad74f16 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1021,6 +1021,8 @@ __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; +struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1, + (unsigned long) nmi_idt_table }; DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE); @@ -1085,6 +1087,26 @@ unsigned long kernel_eflags; */ DEFINE_PER_CPU(struct orig_ist, orig_ist); +static DEFINE_PER_CPU(unsigned long, debug_stack_addr); +DEFINE_PER_CPU(int, debug_stack_usage); + +int is_debug_stack(unsigned long addr) +{ + return __get_cpu_var(debug_stack_usage) || + (addr <= __get_cpu_var(debug_stack_addr) && + addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ)); +} + +void debug_stack_set_zero(void) +{ + load_idt((const struct desc_ptr *)&nmi_idt_descr); +} + +void debug_stack_reset(void) +{ + load_idt((const struct desc_ptr *)&idt_descr); +} + #else /* CONFIG_X86_64 */ DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; @@ -1212,6 +1234,8 @@ void __cpuinit cpu_init(void) estacks += exception_stack_sizes[v]; oist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; + if (v == DEBUG_STACK-1) + per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; } } diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a20e1cb9dc8..940ba711fc2 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1480,62 +1480,214 @@ ENTRY(error_exit) CFI_ENDPROC END(error_exit) +/* + * Test if a given stack is an NMI stack or not. + */ + .macro test_in_nmi reg stack nmi_ret normal_ret + cmpq %\reg, \stack + ja \normal_ret + subq $EXCEPTION_STKSZ, %\reg + cmpq %\reg, \stack + jb \normal_ret + jmp \nmi_ret + .endm /* runs on exception stack */ ENTRY(nmi) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq_cfi $-1 + /* + * We allow breakpoints in NMIs. If a breakpoint occurs, then + * the iretq it performs will take us out of NMI context. + * This means that we can have nested NMIs where the next + * NMI is using the top of the stack of the previous NMI. We + * can't let it execute because the nested NMI will corrupt the + * stack of the previous NMI. NMI handlers are not re-entrant + * anyway. + * + * To handle this case we do the following: + * Check the a special location on the stack that contains + * a variable that is set when NMIs are executing. + * The interrupted task's stack is also checked to see if it + * is an NMI stack. + * If the variable is not set and the stack is not the NMI + * stack then: + * o Set the special variable on the stack + * o Copy the interrupt frame into a "saved" location on the stack + * o Copy the interrupt frame into a "copy" location on the stack + * o Continue processing the NMI + * If the variable is set or the previous stack is the NMI stack: + * o Modify the "copy" location to jump to the repeate_nmi + * o return back to the first NMI + * + * Now on exit of the first NMI, we first clear the stack variable + * The NMI stack will tell any nested NMIs at that point that it is + * nested. Then we pop the stack normally with iret, and if there was + * a nested NMI that updated the copy interrupt stack frame, a + * jump will be made to the repeat_nmi code that will handle the second + * NMI. + */ + + /* Use %rdx as out temp variable throughout */ + pushq_cfi %rdx + + /* + * Check the special variable on the stack to see if NMIs are + * executing. + */ + cmp $1, -8(%rsp) + je nested_nmi + + /* + * Now test if the previous stack was an NMI stack. + * We need the double check. We check the NMI stack to satisfy the + * race when the first NMI clears the variable before returning. + * We check the variable because the first NMI could be in a + * breakpoint routine using a breakpoint stack. + */ + lea 6*8(%rsp), %rdx + test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi + +nested_nmi: + /* + * Do nothing if we interrupted the fixup in repeat_nmi. + * It's about to repeat the NMI handler, so we are fine + * with ignoring this one. + */ + movq $repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja 1f + movq $end_repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja nested_nmi_out + +1: + /* Set up the interrupted NMIs stack to jump to repeat_nmi */ + leaq -6*8(%rsp), %rdx + movq %rdx, %rsp + CFI_ADJUST_CFA_OFFSET 6*8 + pushq_cfi $__KERNEL_DS + pushq_cfi %rdx + pushfq_cfi + pushq_cfi $__KERNEL_CS + pushq_cfi $repeat_nmi + + /* Put stack back */ + addq $(11*8), %rsp + CFI_ADJUST_CFA_OFFSET -11*8 + +nested_nmi_out: + popq_cfi %rdx + + /* No need to check faults here */ + INTERRUPT_RETURN + +first_nmi: + /* + * Because nested NMIs will use the pushed location that we + * stored in rdx, we must keep that space available. + * Here's what our stack frame will look like: + * +-------------------------+ + * | original SS | + * | original Return RSP | + * | original RFLAGS | + * | original CS | + * | original RIP | + * +-------------------------+ + * | temp storage for rdx | + * +-------------------------+ + * | NMI executing variable | + * +-------------------------+ + * | Saved SS | + * | Saved Return RSP | + * | Saved RFLAGS | + * | Saved CS | + * | Saved RIP | + * +-------------------------+ + * | copied SS | + * | copied Return RSP | + * | copied RFLAGS | + * | copied CS | + * | copied RIP | + * +-------------------------+ + * | pt_regs | + * +-------------------------+ + * + * The saved RIP is used to fix up the copied RIP that a nested + * NMI may zero out. The original stack frame and the temp storage + * is also used by nested NMIs and can not be trusted on exit. + */ + /* Set the NMI executing variable on the stack. */ + pushq_cfi $1 + + /* Copy the stack frame to the Saved frame */ + .rept 5 + pushq_cfi 6*8(%rsp) + .endr + + /* Make another copy, this one may be modified by nested NMIs */ + .rept 5 + pushq_cfi 4*8(%rsp) + .endr + + /* Do not pop rdx, nested NMIs will corrupt it */ + movq 11*8(%rsp), %rdx + + /* + * Everything below this point can be preempted by a nested + * NMI if the first NMI took an exception. Repeated NMIs + * caused by an exception and nested NMI will start here, and + * can still be preempted by another NMI. + */ +restart_nmi: + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 + /* + * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit + * as we should not be calling schedule in NMI context. + * Even with normal interrupts enabled. An NMI should not be + * setting NEED_RESCHED or anything that normal interrupts and + * exceptions might do. + */ call save_paranoid DEFAULT_FRAME 0 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ movq %rsp,%rdi movq $-1,%rsi call do_nmi -#ifdef CONFIG_TRACE_IRQFLAGS - /* paranoidexit; without TRACE_IRQS_OFF */ - /* ebx: no swapgs flag */ - DISABLE_INTERRUPTS(CLBR_NONE) testl %ebx,%ebx /* swapgs needed? */ jnz nmi_restore - testl $3,CS(%rsp) - jnz nmi_userspace nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: RESTORE_ALL 8 + /* Clear the NMI executing stack variable */ + movq $0, 10*8(%rsp) jmp irq_return -nmi_userspace: - GET_THREAD_INFO(%rcx) - movl TI_flags(%rcx),%ebx - andl $_TIF_WORK_MASK,%ebx - jz nmi_swapgs - movq %rsp,%rdi /* &pt_regs */ - call sync_regs - movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED,%ebx - jnz nmi_schedule - movl %ebx,%edx /* arg3: thread flags */ - ENABLE_INTERRUPTS(CLBR_NONE) - xorl %esi,%esi /* arg2: oldset */ - movq %rsp,%rdi /* arg1: &pt_regs */ - call do_notify_resume - DISABLE_INTERRUPTS(CLBR_NONE) - jmp nmi_userspace -nmi_schedule: - ENABLE_INTERRUPTS(CLBR_ANY) - call schedule - DISABLE_INTERRUPTS(CLBR_ANY) - jmp nmi_userspace - CFI_ENDPROC -#else - jmp paranoid_exit CFI_ENDPROC -#endif END(nmi) + /* + * If an NMI hit an iret because of an exception or breakpoint, + * it can lose its NMI context, and a nested NMI may come in. + * In that case, the nested NMI will change the preempted NMI's + * stack to jump to here when it does the final iret. + */ +repeat_nmi: + INTR_FRAME + /* Update the stack variable to say we are still in NMI */ + movq $1, 5*8(%rsp) + + /* copy the saved stack back to copy stack */ + .rept 5 + pushq_cfi 4*8(%rsp) + .endr + + jmp restart_nmi + CFI_ENDPROC +end_repeat_nmi: + ENTRY(ignore_sysret) CFI_STARTPROC mov $-ENOSYS,%eax diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index e11e39478a4..40f4eb3766d 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -417,6 +417,10 @@ ENTRY(phys_base) ENTRY(idt_table) .skip IDT_ENTRIES * 16 + .align L1_CACHE_BYTES +ENTRY(nmi_idt_table) + .skip IDT_ENTRIES * 16 + __PAGE_ALIGNED_BSS .align PAGE_SIZE ENTRY(empty_zero_page) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index e88f37b58dd..47acaf31916 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -405,9 +405,108 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) unknown_nmi_error(reason, regs); } +/* + * NMIs can hit breakpoints which will cause it to lose its + * NMI context with the CPU when the breakpoint does an iret. + */ +#ifdef CONFIG_X86_32 +/* + * For i386, NMIs use the same stack as the kernel, and we can + * add a workaround to the iret problem in C. Simply have 3 states + * the NMI can be in. + * + * 1) not running + * 2) executing + * 3) latched + * + * When no NMI is in progress, it is in the "not running" state. + * When an NMI comes in, it goes into the "executing" state. + * Normally, if another NMI is triggered, it does not interrupt + * the running NMI and the HW will simply latch it so that when + * the first NMI finishes, it will restart the second NMI. + * (Note, the latch is binary, thus multiple NMIs triggering, + * when one is running, are ignored. Only one NMI is restarted.) + * + * If an NMI hits a breakpoint that executes an iret, another + * NMI can preempt it. We do not want to allow this new NMI + * to run, but we want to execute it when the first one finishes. + * We set the state to "latched", and the first NMI will perform + * an cmpxchg on the state, and if it doesn't successfully + * reset the state to "not running" it will restart the next + * NMI. + */ +enum nmi_states { + NMI_NOT_RUNNING, + NMI_EXECUTING, + NMI_LATCHED, +}; +static DEFINE_PER_CPU(enum nmi_states, nmi_state); + +#define nmi_nesting_preprocess(regs) \ + do { \ + if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) { \ + __get_cpu_var(nmi_state) = NMI_LATCHED; \ + return; \ + } \ + nmi_restart: \ + __get_cpu_var(nmi_state) = NMI_EXECUTING; \ + } while (0) + +#define nmi_nesting_postprocess() \ + do { \ + if (cmpxchg(&__get_cpu_var(nmi_state), \ + NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING) \ + goto nmi_restart; \ + } while (0) +#else /* x86_64 */ +/* + * In x86_64 things are a bit more difficult. This has the same problem + * where an NMI hitting a breakpoint that calls iret will remove the + * NMI context, allowing a nested NMI to enter. What makes this more + * difficult is that both NMIs and breakpoints have their own stack. + * When a new NMI or breakpoint is executed, the stack is set to a fixed + * point. If an NMI is nested, it will have its stack set at that same + * fixed address that the first NMI had, and will start corrupting the + * stack. This is handled in entry_64.S, but the same problem exists with + * the breakpoint stack. + * + * If a breakpoint is being processed, and the debug stack is being used, + * if an NMI comes in and also hits a breakpoint, the stack pointer + * will be set to the same fixed address as the breakpoint that was + * interrupted, causing that stack to be corrupted. To handle this case, + * check if the stack that was interrupted is the debug stack, and if + * so, change the IDT so that new breakpoints will use the current stack + * and not switch to the fixed address. On return of the NMI, switch back + * to the original IDT. + */ +static DEFINE_PER_CPU(int, update_debug_stack); + +static inline void nmi_nesting_preprocess(struct pt_regs *regs) +{ + /* + * If we interrupted a breakpoint, it is possible that + * the nmi handler will have breakpoints too. We need to + * change the IDT such that breakpoints that happen here + * continue to use the NMI stack. + */ + if (unlikely(is_debug_stack(regs->sp))) { + debug_stack_set_zero(); + __get_cpu_var(update_debug_stack) = 1; + } +} + +static inline void nmi_nesting_postprocess(void) +{ + if (unlikely(__get_cpu_var(update_debug_stack))) + debug_stack_reset(); +} +#endif + dotraplinkage notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) { + nmi_nesting_preprocess(regs); + nmi_enter(); inc_irq_stat(__nmi_count); @@ -416,6 +515,9 @@ do_nmi(struct pt_regs *regs, long error_code) default_do_nmi(regs); nmi_exit(); + + /* On i386, may loop back to preprocess */ + nmi_nesting_postprocess(); } void stop_nmi(void) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index fa1191fb679..482ec3af206 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -311,9 +311,15 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) == NOTIFY_STOP) return; + /* + * Let others (NMI) know that the debug stack is in use + * as we may switch to the interrupt stack. + */ + debug_stack_usage_inc(); preempt_conditional_sti(regs); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); preempt_conditional_cli(regs); + debug_stack_usage_dec(); } #ifdef CONFIG_X86_64 @@ -406,6 +412,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) SIGTRAP) == NOTIFY_STOP) return; + /* + * Let others (NMI) know that the debug stack is in use + * as we may switch to the interrupt stack. + */ + debug_stack_usage_inc(); + /* It's safe to allow irq's after DR6 has been saved */ preempt_conditional_sti(regs); @@ -413,6 +425,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); preempt_conditional_cli(regs); + debug_stack_usage_dec(); return; } @@ -432,6 +445,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(tsk, regs, error_code, si_code); preempt_conditional_cli(regs); + debug_stack_usage_dec(); return; } @@ -718,4 +732,10 @@ void __init trap_init(void) cpu_init(); x86_init.irqs.trap_init(); + +#ifdef CONFIG_X86_64 + memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16); + set_nmi_gate(1, &debug); + set_nmi_gate(3, &int3); +#endif } diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 59e4028e833..3fd17c24922 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -50,6 +50,11 @@ # define inline inline __attribute__((always_inline)) # define __inline__ __inline__ __attribute__((always_inline)) # define __inline __inline __attribute__((always_inline)) +#else +/* A lot of inline functions can cause havoc with function tracing */ +# define inline inline notrace +# define __inline__ __inline__ notrace +# define __inline __inline notrace #endif #define __deprecated __attribute__((deprecated)) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 26eafcef75b..028e26f0bf0 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -133,6 +133,8 @@ struct ftrace_func_command { int ftrace_arch_code_modify_prepare(void); int ftrace_arch_code_modify_post_process(void); +void ftrace_bug(int err, unsigned long ip); + struct seq_file; struct ftrace_probe_ops { @@ -161,7 +163,6 @@ extern int ftrace_text_reserved(void *start, void *end); enum { FTRACE_FL_ENABLED = (1 << 30), - FTRACE_FL_FREE = (1 << 31), }; #define FTRACE_FL_MASK (0x3UL << 30) @@ -172,10 +173,7 @@ struct dyn_ftrace { unsigned long ip; /* address of mcount call-site */ struct dyn_ftrace *freelist; }; - union { - unsigned long flags; - struct dyn_ftrace *newlist; - }; + unsigned long flags; struct dyn_arch_ftrace arch; }; @@ -190,6 +188,56 @@ void ftrace_set_global_notrace(unsigned char *buf, int len, int reset); int register_ftrace_command(struct ftrace_func_command *cmd); int unregister_ftrace_command(struct ftrace_func_command *cmd); +enum { + FTRACE_UPDATE_CALLS = (1 << 0), + FTRACE_DISABLE_CALLS = (1 << 1), + FTRACE_UPDATE_TRACE_FUNC = (1 << 2), + FTRACE_START_FUNC_RET = (1 << 3), + FTRACE_STOP_FUNC_RET = (1 << 4), +}; + +enum { + FTRACE_UPDATE_IGNORE, + FTRACE_UPDATE_MAKE_CALL, + FTRACE_UPDATE_MAKE_NOP, +}; + +enum { + FTRACE_ITER_FILTER = (1 << 0), + FTRACE_ITER_NOTRACE = (1 << 1), + FTRACE_ITER_PRINTALL = (1 << 2), + FTRACE_ITER_DO_HASH = (1 << 3), + FTRACE_ITER_HASH = (1 << 4), + FTRACE_ITER_ENABLED = (1 << 5), +}; + +void arch_ftrace_update_code(int command); + +struct ftrace_rec_iter; + +struct ftrace_rec_iter *ftrace_rec_iter_start(void); +struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter); +struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter); + +int ftrace_update_record(struct dyn_ftrace *rec, int enable); +int ftrace_test_record(struct dyn_ftrace *rec, int enable); +void ftrace_run_stop_machine(int command); +int ftrace_location(unsigned long ip); + +extern ftrace_func_t ftrace_trace_function; + +int ftrace_regex_open(struct ftrace_ops *ops, int flag, + struct inode *inode, struct file *file); +ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos); +ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos); +loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin); +int ftrace_regex_release(struct inode *inode, struct file *file); + +void __init +ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable); + /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern int ftrace_dyn_arch_init(void *data); @@ -284,6 +332,25 @@ static inline int ftrace_text_reserved(void *start, void *end) { return 0; } + +/* + * Again users of functions that have ftrace_ops may not + * have them defined when ftrace is not enabled, but these + * functions may still be called. Use a macro instead of inline. + */ +#define ftrace_regex_open(ops, flag, inod, file) ({ -ENODEV; }) +#define ftrace_set_early_filter(ops, buf, enable) do { } while (0) + +static inline ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) { return -ENODEV; } +static inline ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) { return -ENODEV; } +static inline loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin) +{ + return -ENODEV; +} +static inline int +ftrace_regex_release(struct inode *inode, struct file *file) { return -ENODEV; } #endif /* CONFIG_DYNAMIC_FTRACE */ /* totally disable ftrace - can not re-enable after this */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b1e8943fed1..683d559a0ee 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -22,11 +22,13 @@ #include <linux/hardirq.h> #include <linux/kthread.h> #include <linux/uaccess.h> +#include <linux/bsearch.h> #include <linux/module.h> #include <linux/ftrace.h> #include <linux/sysctl.h> #include <linux/slab.h> #include <linux/ctype.h> +#include <linux/sort.h> #include <linux/list.h> #include <linux/hash.h> #include <linux/rcupdate.h> @@ -947,13 +949,6 @@ struct ftrace_func_probe { struct rcu_head rcu; }; -enum { - FTRACE_ENABLE_CALLS = (1 << 0), - FTRACE_DISABLE_CALLS = (1 << 1), - FTRACE_UPDATE_TRACE_FUNC = (1 << 2), - FTRACE_START_FUNC_RET = (1 << 3), - FTRACE_STOP_FUNC_RET = (1 << 4), -}; struct ftrace_func_entry { struct hlist_node hlist; unsigned long ip; @@ -984,18 +979,19 @@ static struct ftrace_ops global_ops = { .filter_hash = EMPTY_HASH, }; -static struct dyn_ftrace *ftrace_new_addrs; - static DEFINE_MUTEX(ftrace_regex_lock); struct ftrace_page { struct ftrace_page *next; + struct dyn_ftrace *records; int index; - struct dyn_ftrace records[]; + int size; }; -#define ENTRIES_PER_PAGE \ - ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) +static struct ftrace_page *ftrace_new_pgs; + +#define ENTRY_SIZE sizeof(struct dyn_ftrace) +#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) /* estimate from running different kernels */ #define NR_TO_INIT 10000 @@ -1003,7 +999,10 @@ struct ftrace_page { static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; -static struct dyn_ftrace *ftrace_free_records; +static bool ftrace_hash_empty(struct ftrace_hash *hash) +{ + return !hash || !hash->count; +} static struct ftrace_func_entry * ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) @@ -1013,7 +1012,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) struct hlist_head *hhd; struct hlist_node *n; - if (!hash->count) + if (ftrace_hash_empty(hash)) return NULL; if (hash->size_bits > 0) @@ -1157,7 +1156,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) return NULL; /* Empty hash? */ - if (!hash || !hash->count) + if (ftrace_hash_empty(hash)) return new_hash; size = 1 << hash->size_bits; @@ -1282,9 +1281,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) filter_hash = rcu_dereference_raw(ops->filter_hash); notrace_hash = rcu_dereference_raw(ops->notrace_hash); - if ((!filter_hash || !filter_hash->count || + if ((ftrace_hash_empty(filter_hash) || ftrace_lookup_ip(filter_hash, ip)) && - (!notrace_hash || !notrace_hash->count || + (ftrace_hash_empty(notrace_hash) || !ftrace_lookup_ip(notrace_hash, ip))) ret = 1; else @@ -1307,6 +1306,47 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) } \ } + +static int ftrace_cmp_recs(const void *a, const void *b) +{ + const struct dyn_ftrace *reca = a; + const struct dyn_ftrace *recb = b; + + if (reca->ip > recb->ip) + return 1; + if (reca->ip < recb->ip) + return -1; + return 0; +} + +/** + * ftrace_location - return true if the ip giving is a traced location + * @ip: the instruction pointer to check + * + * Returns 1 if @ip given is a pointer to a ftrace location. + * That is, the instruction that is either a NOP or call to + * the function tracer. It checks the ftrace internal tables to + * determine if the address belongs or not. + */ +int ftrace_location(unsigned long ip) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec; + struct dyn_ftrace key; + + key.ip = ip; + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + rec = bsearch(&key, pg->records, pg->index, + sizeof(struct dyn_ftrace), + ftrace_cmp_recs); + if (rec) + return 1; + } + + return 0; +} + static void __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) @@ -1336,7 +1376,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (filter_hash) { hash = ops->filter_hash; other_hash = ops->notrace_hash; - if (!hash || !hash->count) + if (ftrace_hash_empty(hash)) all = 1; } else { inc = !inc; @@ -1346,7 +1386,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * If the notrace hash has no items, * then there's nothing to do. */ - if (hash && !hash->count) + if (ftrace_hash_empty(hash)) return; } @@ -1363,8 +1403,8 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) match = 1; } else { - in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip); - in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip); + in_hash = !!ftrace_lookup_ip(hash, rec->ip); + in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); /* * @@ -1372,7 +1412,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (filter_hash && in_hash && !in_other_hash) match = 1; else if (!filter_hash && in_hash && - (in_other_hash || !other_hash->count)) + (in_other_hash || ftrace_hash_empty(other_hash))) match = 1; } if (!match) @@ -1406,40 +1446,12 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, __ftrace_hash_rec_update(ops, filter_hash, 1); } -static void ftrace_free_rec(struct dyn_ftrace *rec) -{ - rec->freelist = ftrace_free_records; - ftrace_free_records = rec; - rec->flags |= FTRACE_FL_FREE; -} - static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { - struct dyn_ftrace *rec; - - /* First check for freed records */ - if (ftrace_free_records) { - rec = ftrace_free_records; - - if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { - FTRACE_WARN_ON_ONCE(1); - ftrace_free_records = NULL; + if (ftrace_pages->index == ftrace_pages->size) { + /* We should have allocated enough */ + if (WARN_ON(!ftrace_pages->next)) return NULL; - } - - ftrace_free_records = rec->freelist; - memset(rec, 0, sizeof(*rec)); - return rec; - } - - if (ftrace_pages->index == ENTRIES_PER_PAGE) { - if (!ftrace_pages->next) { - /* allocate another page */ - ftrace_pages->next = - (void *)get_zeroed_page(GFP_KERNEL); - if (!ftrace_pages->next) - return NULL; - } ftrace_pages = ftrace_pages->next; } @@ -1459,8 +1471,6 @@ ftrace_record_ip(unsigned long ip) return NULL; rec->ip = ip; - rec->newlist = ftrace_new_addrs; - ftrace_new_addrs = rec; return rec; } @@ -1475,7 +1485,19 @@ static void print_ip_ins(const char *fmt, unsigned char *p) printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); } -static void ftrace_bug(int failed, unsigned long ip) +/** + * ftrace_bug - report and shutdown function tracer + * @failed: The failed type (EFAULT, EINVAL, EPERM) + * @ip: The address that failed + * + * The arch code that enables or disables the function tracing + * can call ftrace_bug() when it has detected a problem in + * modifying the code. @failed should be one of either: + * EFAULT - if the problem happens on reading the @ip address + * EINVAL - if what is read at @ip is not what was expected + * EPERM - if the problem happens on writting to the @ip address + */ +void ftrace_bug(int failed, unsigned long ip) { switch (failed) { case -EFAULT: @@ -1517,24 +1539,19 @@ int ftrace_text_reserved(void *start, void *end) return 0; } - -static int -__ftrace_replace_code(struct dyn_ftrace *rec, int enable) +static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) { - unsigned long ftrace_addr; unsigned long flag = 0UL; - ftrace_addr = (unsigned long)FTRACE_ADDR; - /* - * If we are enabling tracing: + * If we are updating calls: * * If the record has a ref count, then we need to enable it * because someone is using it. * * Otherwise we make sure its disabled. * - * If we are disabling tracing, then disable all records that + * If we are disabling calls, then disable all records that * are enabled. */ if (enable && (rec->flags & ~FTRACE_FL_MASK)) @@ -1542,18 +1559,72 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) /* If the state of this record hasn't changed, then do nothing */ if ((rec->flags & FTRACE_FL_ENABLED) == flag) - return 0; + return FTRACE_UPDATE_IGNORE; if (flag) { - rec->flags |= FTRACE_FL_ENABLED; + if (update) + rec->flags |= FTRACE_FL_ENABLED; + return FTRACE_UPDATE_MAKE_CALL; + } + + if (update) + rec->flags &= ~FTRACE_FL_ENABLED; + + return FTRACE_UPDATE_MAKE_NOP; +} + +/** + * ftrace_update_record, set a record that now is tracing or not + * @rec: the record to update + * @enable: set to 1 if the record is tracing, zero to force disable + * + * The records that represent all functions that can be traced need + * to be updated when tracing has been enabled. + */ +int ftrace_update_record(struct dyn_ftrace *rec, int enable) +{ + return ftrace_check_record(rec, enable, 1); +} + +/** + * ftrace_test_record, check if the record has been enabled or not + * @rec: the record to test + * @enable: set to 1 to check if enabled, 0 if it is disabled + * + * The arch code may need to test if a record is already set to + * tracing to determine how to modify the function code that it + * represents. + */ +int ftrace_test_record(struct dyn_ftrace *rec, int enable) +{ + return ftrace_check_record(rec, enable, 0); +} + +static int +__ftrace_replace_code(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ftrace_addr = (unsigned long)FTRACE_ADDR; + + ret = ftrace_update_record(rec, enable); + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: return ftrace_make_call(rec, ftrace_addr); + + case FTRACE_UPDATE_MAKE_NOP: + return ftrace_make_nop(NULL, rec, ftrace_addr); } - rec->flags &= ~FTRACE_FL_ENABLED; - return ftrace_make_nop(NULL, rec, ftrace_addr); + return -1; /* unknow ftrace bug */ } -static void ftrace_replace_code(int enable) +static void ftrace_replace_code(int update) { struct dyn_ftrace *rec; struct ftrace_page *pg; @@ -1563,11 +1634,7 @@ static void ftrace_replace_code(int enable) return; do_for_each_ftrace_rec(pg, rec) { - /* Skip over free records */ - if (rec->flags & FTRACE_FL_FREE) - continue; - - failed = __ftrace_replace_code(rec, enable); + failed = __ftrace_replace_code(rec, update); if (failed) { ftrace_bug(failed, rec->ip); /* Stop processing */ @@ -1576,6 +1643,78 @@ static void ftrace_replace_code(int enable) } while_for_each_ftrace_rec(); } +struct ftrace_rec_iter { + struct ftrace_page *pg; + int index; +}; + +/** + * ftrace_rec_iter_start, start up iterating over traced functions + * + * Returns an iterator handle that is used to iterate over all + * the records that represent address locations where functions + * are traced. + * + * May return NULL if no records are available. + */ +struct ftrace_rec_iter *ftrace_rec_iter_start(void) +{ + /* + * We only use a single iterator. + * Protected by the ftrace_lock mutex. + */ + static struct ftrace_rec_iter ftrace_rec_iter; + struct ftrace_rec_iter *iter = &ftrace_rec_iter; + + iter->pg = ftrace_pages_start; + iter->index = 0; + + /* Could have empty pages */ + while (iter->pg && !iter->pg->index) + iter->pg = iter->pg->next; + + if (!iter->pg) + return NULL; + + return iter; +} + +/** + * ftrace_rec_iter_next, get the next record to process. + * @iter: The handle to the iterator. + * + * Returns the next iterator after the given iterator @iter. + */ +struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter) +{ + iter->index++; + + if (iter->index >= iter->pg->index) { + iter->pg = iter->pg->next; + iter->index = 0; + + /* Could have empty pages */ + while (iter->pg && !iter->pg->index) + iter->pg = iter->pg->next; + } + + if (!iter->pg) + return NULL; + + return iter; +} + +/** + * ftrace_rec_iter_record, get the record at the iterator location + * @iter: The current iterator location + * + * Returns the record that the current @iter is at. + */ +struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) +{ + return &iter->pg->records[iter->index]; +} + static int ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) { @@ -1617,13 +1756,7 @@ static int __ftrace_modify_code(void *data) { int *command = data; - /* - * Do not call function tracer while we update the code. - * We are in stop machine, no worrying about races. - */ - function_trace_stop++; - - if (*command & FTRACE_ENABLE_CALLS) + if (*command & FTRACE_UPDATE_CALLS) ftrace_replace_code(1); else if (*command & FTRACE_DISABLE_CALLS) ftrace_replace_code(0); @@ -1636,21 +1769,33 @@ static int __ftrace_modify_code(void *data) else if (*command & FTRACE_STOP_FUNC_RET) ftrace_disable_ftrace_graph_caller(); -#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - /* - * For archs that call ftrace_test_stop_func(), we must - * wait till after we update all the function callers - * before we update the callback. This keeps different - * ops that record different functions from corrupting - * each other. - */ - __ftrace_trace_function = __ftrace_trace_function_delay; -#endif - function_trace_stop--; - return 0; } +/** + * ftrace_run_stop_machine, go back to the stop machine method + * @command: The command to tell ftrace what to do + * + * If an arch needs to fall back to the stop machine method, the + * it can call this function. + */ +void ftrace_run_stop_machine(int command) +{ + stop_machine(__ftrace_modify_code, &command, NULL); +} + +/** + * arch_ftrace_update_code, modify the code to trace or not trace + * @command: The command that needs to be done + * + * Archs can override this function if it does not need to + * run stop_machine() to modify code. + */ +void __weak arch_ftrace_update_code(int command) +{ + ftrace_run_stop_machine(command); +} + static void ftrace_run_update_code(int command) { int ret; @@ -1659,8 +1804,31 @@ static void ftrace_run_update_code(int command) FTRACE_WARN_ON(ret); if (ret) return; + /* + * Do not call function tracer while we update the code. + * We are in stop machine. + */ + function_trace_stop++; - stop_machine(__ftrace_modify_code, &command, NULL); + /* + * By default we use stop_machine() to modify the code. + * But archs can do what ever they want as long as it + * is safe. The stop_machine() is the safest, but also + * produces the most overhead. + */ + arch_ftrace_update_code(command); + +#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + /* + * For archs that call ftrace_test_stop_func(), we must + * wait till after we update all the function callers + * before we update the callback. This keeps different + * ops that record different functions from corrupting + * each other. + */ + __ftrace_trace_function = __ftrace_trace_function_delay; +#endif + function_trace_stop--; ret = ftrace_arch_code_modify_post_process(); FTRACE_WARN_ON(ret); @@ -1691,7 +1859,7 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) return -ENODEV; ftrace_start_up++; - command |= FTRACE_ENABLE_CALLS; + command |= FTRACE_UPDATE_CALLS; /* ops marked global share the filter hashes */ if (ops->flags & FTRACE_OPS_FL_GLOBAL) { @@ -1743,8 +1911,7 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command) if (ops != &global_ops || !global_start_up) ops->flags &= ~FTRACE_OPS_FL_ENABLED; - if (!ftrace_start_up) - command |= FTRACE_DISABLE_CALLS; + command |= FTRACE_UPDATE_CALLS; if (saved_ftrace_func != ftrace_trace_function) { saved_ftrace_func = ftrace_trace_function; @@ -1766,7 +1933,7 @@ static void ftrace_startup_sysctl(void) saved_ftrace_func = NULL; /* ftrace_start_up is true if we want ftrace running */ if (ftrace_start_up) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); + ftrace_run_update_code(FTRACE_UPDATE_CALLS); } static void ftrace_shutdown_sysctl(void) @@ -1788,14 +1955,16 @@ static int ops_traces_mod(struct ftrace_ops *ops) struct ftrace_hash *hash; hash = ops->filter_hash; - return !!(!hash || !hash->count); + return ftrace_hash_empty(hash); } static int ftrace_update_code(struct module *mod) { + struct ftrace_page *pg; struct dyn_ftrace *p; cycle_t start, stop; unsigned long ref = 0; + int i; /* * When adding a module, we need to check if tracers are @@ -1817,46 +1986,44 @@ static int ftrace_update_code(struct module *mod) start = ftrace_now(raw_smp_processor_id()); ftrace_update_cnt = 0; - while (ftrace_new_addrs) { + for (pg = ftrace_new_pgs; pg; pg = pg->next) { - /* If something went wrong, bail without enabling anything */ - if (unlikely(ftrace_disabled)) - return -1; + for (i = 0; i < pg->index; i++) { + /* If something went wrong, bail without enabling anything */ + if (unlikely(ftrace_disabled)) + return -1; - p = ftrace_new_addrs; - ftrace_new_addrs = p->newlist; - p->flags = ref; + p = &pg->records[i]; + p->flags = ref; - /* - * Do the initial record conversion from mcount jump - * to the NOP instructions. - */ - if (!ftrace_code_disable(mod, p)) { - ftrace_free_rec(p); - /* Game over */ - break; - } + /* + * Do the initial record conversion from mcount jump + * to the NOP instructions. + */ + if (!ftrace_code_disable(mod, p)) + break; - ftrace_update_cnt++; + ftrace_update_cnt++; - /* - * If the tracing is enabled, go ahead and enable the record. - * - * The reason not to enable the record immediatelly is the - * inherent check of ftrace_make_nop/ftrace_make_call for - * correct previous instructions. Making first the NOP - * conversion puts the module to the correct state, thus - * passing the ftrace_make_call check. - */ - if (ftrace_start_up && ref) { - int failed = __ftrace_replace_code(p, 1); - if (failed) { - ftrace_bug(failed, p->ip); - ftrace_free_rec(p); + /* + * If the tracing is enabled, go ahead and enable the record. + * + * The reason not to enable the record immediatelly is the + * inherent check of ftrace_make_nop/ftrace_make_call for + * correct previous instructions. Making first the NOP + * conversion puts the module to the correct state, thus + * passing the ftrace_make_call check. + */ + if (ftrace_start_up && ref) { + int failed = __ftrace_replace_code(p, 1); + if (failed) + ftrace_bug(failed, p->ip); } } } + ftrace_new_pgs = NULL; + stop = ftrace_now(raw_smp_processor_id()); ftrace_update_time = stop - start; ftrace_update_tot_cnt += ftrace_update_cnt; @@ -1864,57 +2031,108 @@ static int ftrace_update_code(struct module *mod) return 0; } -static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) +static int ftrace_allocate_records(struct ftrace_page *pg, int count) { - struct ftrace_page *pg; + int order; int cnt; - int i; - /* allocate a few pages */ - ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); - if (!ftrace_pages_start) - return -1; + if (WARN_ON(!count)) + return -EINVAL; + + order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); /* - * Allocate a few more pages. - * - * TODO: have some parser search vmlinux before - * final linking to find all calls to ftrace. - * Then we can: - * a) know how many pages to allocate. - * and/or - * b) set up the table then. - * - * The dynamic code is still necessary for - * modules. + * We want to fill as much as possible. No more than a page + * may be empty. */ + while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE) + order--; - pg = ftrace_pages = ftrace_pages_start; + again: + pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); - cnt = num_to_init / ENTRIES_PER_PAGE; - pr_info("ftrace: allocating %ld entries in %d pages\n", - num_to_init, cnt + 1); + if (!pg->records) { + /* if we can't allocate this size, try something smaller */ + if (!order) + return -ENOMEM; + order >>= 1; + goto again; + } - for (i = 0; i < cnt; i++) { - pg->next = (void *)get_zeroed_page(GFP_KERNEL); + cnt = (PAGE_SIZE << order) / ENTRY_SIZE; + pg->size = cnt; - /* If we fail, we'll try later anyway */ - if (!pg->next) + if (cnt > count) + cnt = count; + + return cnt; +} + +static struct ftrace_page * +ftrace_allocate_pages(unsigned long num_to_init) +{ + struct ftrace_page *start_pg; + struct ftrace_page *pg; + int order; + int cnt; + + if (!num_to_init) + return 0; + + start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL); + if (!pg) + return NULL; + + /* + * Try to allocate as much as possible in one continues + * location that fills in all of the space. We want to + * waste as little space as possible. + */ + for (;;) { + cnt = ftrace_allocate_records(pg, num_to_init); + if (cnt < 0) + goto free_pages; + + num_to_init -= cnt; + if (!num_to_init) break; + pg->next = kzalloc(sizeof(*pg), GFP_KERNEL); + if (!pg->next) + goto free_pages; + pg = pg->next; } - return 0; + return start_pg; + + free_pages: + while (start_pg) { + order = get_count_order(pg->size / ENTRIES_PER_PAGE); + free_pages((unsigned long)pg->records, order); + start_pg = pg->next; + kfree(pg); + pg = start_pg; + } + pr_info("ftrace: FAILED to allocate memory for functions\n"); + return NULL; } -enum { - FTRACE_ITER_FILTER = (1 << 0), - FTRACE_ITER_NOTRACE = (1 << 1), - FTRACE_ITER_PRINTALL = (1 << 2), - FTRACE_ITER_HASH = (1 << 3), - FTRACE_ITER_ENABLED = (1 << 4), -}; +static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) +{ + int cnt; + + if (!num_to_init) { + pr_info("ftrace: No functions to be traced?\n"); + return -1; + } + + cnt = num_to_init / ENTRIES_PER_PAGE; + pr_info("ftrace: allocating %ld entries in %d pages\n", + num_to_init, cnt + 1); + + return 0; +} #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ @@ -1980,6 +2198,9 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) void *p = NULL; loff_t l; + if (!(iter->flags & FTRACE_ITER_DO_HASH)) + return NULL; + if (iter->func_pos > *pos) return NULL; @@ -2023,7 +2244,7 @@ static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct ftrace_ops *ops = &global_ops; + struct ftrace_ops *ops = iter->ops; struct dyn_ftrace *rec = NULL; if (unlikely(ftrace_disabled)) @@ -2047,9 +2268,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } } else { rec = &iter->pg->records[iter->idx++]; - if ((rec->flags & FTRACE_FL_FREE) || - - ((iter->flags & FTRACE_ITER_FILTER) && + if (((iter->flags & FTRACE_ITER_FILTER) && !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || ((iter->flags & FTRACE_ITER_NOTRACE) && @@ -2081,7 +2300,7 @@ static void reset_iter_read(struct ftrace_iterator *iter) static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct ftrace_ops *ops = &global_ops; + struct ftrace_ops *ops = iter->ops; void *p = NULL; loff_t l; @@ -2101,7 +2320,8 @@ static void *t_start(struct seq_file *m, loff_t *pos) * off, we can short cut and just print out that all * functions are enabled. */ - if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) { + if (iter->flags & FTRACE_ITER_FILTER && + ftrace_hash_empty(ops->filter_hash)) { if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; @@ -2126,12 +2346,8 @@ static void *t_start(struct seq_file *m, loff_t *pos) break; } - if (!p) { - if (iter->flags & FTRACE_ITER_FILTER) - return t_hash_start(m, pos); - - return NULL; - } + if (!p) + return t_hash_start(m, pos); return iter; } @@ -2189,6 +2405,7 @@ ftrace_avail_open(struct inode *inode, struct file *file) return -ENOMEM; iter->pg = ftrace_pages_start; + iter->ops = &global_ops; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -2217,6 +2434,7 @@ ftrace_enabled_open(struct inode *inode, struct file *file) iter->pg = ftrace_pages_start; iter->flags = FTRACE_ITER_ENABLED; + iter->ops = &global_ops; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -2237,7 +2455,23 @@ static void ftrace_filter_reset(struct ftrace_hash *hash) mutex_unlock(&ftrace_lock); } -static int +/** + * ftrace_regex_open - initialize function tracer filter files + * @ops: The ftrace_ops that hold the hash filters + * @flag: The type of filter to process + * @inode: The inode, usually passed in to your open routine + * @file: The file, usually passed in to your open routine + * + * ftrace_regex_open() initializes the filter files for the + * @ops. Depending on @flag it may process the filter hash or + * the notrace hash of @ops. With this called from the open + * routine, you can use ftrace_filter_write() for the write + * routine if @flag has FTRACE_ITER_FILTER set, or + * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. + * ftrace_regex_lseek() should be used as the lseek routine, and + * release must call ftrace_regex_release(). + */ +int ftrace_regex_open(struct ftrace_ops *ops, int flag, struct inode *inode, struct file *file) { @@ -2306,8 +2540,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, static int ftrace_filter_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER, - inode, file); + return ftrace_regex_open(&global_ops, + FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, + inode, file); } static int @@ -2317,7 +2552,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file) inode, file); } -static loff_t +loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin) { loff_t ret; @@ -2426,7 +2661,6 @@ match_records(struct ftrace_hash *hash, char *buff, goto out_unlock; do_for_each_ftrace_rec(pg, rec) { - if (ftrace_match_record(rec, mod, search, search_len, type)) { ret = enter_record(hash, rec, not); if (ret < 0) { @@ -2871,14 +3105,14 @@ out_unlock: return ret; } -static ssize_t +ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { return ftrace_regex_write(file, ubuf, cnt, ppos, 1); } -static ssize_t +ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -2919,7 +3153,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, ret = ftrace_hash_move(ops, enable, orig_hash, hash); if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); + ftrace_run_update_code(FTRACE_UPDATE_CALLS); mutex_unlock(&ftrace_lock); @@ -3045,8 +3279,8 @@ static void __init set_ftrace_early_graph(char *buf) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -static void __init -set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) +void __init +ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable) { char *func; @@ -3059,17 +3293,16 @@ set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) static void __init set_ftrace_early_filters(void) { if (ftrace_filter_buf[0]) - set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1); + ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1); if (ftrace_notrace_buf[0]) - set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0); + ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (ftrace_graph_buf[0]) set_ftrace_early_graph(ftrace_graph_buf); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ } -static int -ftrace_regex_release(struct inode *inode, struct file *file) +int ftrace_regex_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; struct ftrace_iterator *iter; @@ -3107,7 +3340,7 @@ ftrace_regex_release(struct inode *inode, struct file *file) orig_hash, iter->hash); if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) && ftrace_enabled) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); + ftrace_run_update_code(FTRACE_UPDATE_CALLS); mutex_unlock(&ftrace_lock); } @@ -3270,9 +3503,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & FTRACE_FL_FREE) - continue; - if (ftrace_match_record(rec, NULL, search, search_len, type)) { /* if it is in the array */ exists = false; @@ -3381,15 +3611,62 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) return 0; } +static void ftrace_swap_recs(void *a, void *b, int size) +{ + struct dyn_ftrace *reca = a; + struct dyn_ftrace *recb = b; + struct dyn_ftrace t; + + t = *reca; + *reca = *recb; + *recb = t; +} + static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) { + struct ftrace_page *pg; + unsigned long count; unsigned long *p; unsigned long addr; unsigned long flags = 0; /* Shut up gcc */ + int ret = -ENOMEM; + + count = end - start; + + if (!count) + return 0; + + pg = ftrace_allocate_pages(count); + if (!pg) + return -ENOMEM; mutex_lock(&ftrace_lock); + + /* + * Core and each module needs their own pages, as + * modules will free them when they are removed. + * Force a new page to be allocated for modules. + */ + if (!mod) { + WARN_ON(ftrace_pages || ftrace_pages_start); + /* First initialization */ + ftrace_pages = ftrace_pages_start = pg; + } else { + if (!ftrace_pages) + goto out; + + if (WARN_ON(ftrace_pages->next)) { + /* Hmm, we have free pages? */ + while (ftrace_pages->next) + ftrace_pages = ftrace_pages->next; + } + + ftrace_pages->next = pg; + ftrace_pages = pg; + } + p = start; while (p < end) { addr = ftrace_call_adjust(*p++); @@ -3401,9 +3678,18 @@ static int ftrace_process_locs(struct module *mod, */ if (!addr) continue; - ftrace_record_ip(addr); + if (!ftrace_record_ip(addr)) + break; } + /* These new locations need to be initialized */ + ftrace_new_pgs = pg; + + /* Make each individual set of pages sorted by ips */ + for (; pg; pg = pg->next) + sort(pg->records, pg->index, sizeof(struct dyn_ftrace), + ftrace_cmp_recs, ftrace_swap_recs); + /* * We only need to disable interrupts on start up * because we are modifying code that an interrupt @@ -3417,32 +3703,55 @@ static int ftrace_process_locs(struct module *mod, ftrace_update_code(mod); if (!mod) local_irq_restore(flags); + ret = 0; + out: mutex_unlock(&ftrace_lock); - return 0; + return ret; } #ifdef CONFIG_MODULES + +#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) + void ftrace_release_mod(struct module *mod) { struct dyn_ftrace *rec; + struct ftrace_page **last_pg; struct ftrace_page *pg; + int order; mutex_lock(&ftrace_lock); if (ftrace_disabled) goto out_unlock; - do_for_each_ftrace_rec(pg, rec) { + /* + * Each module has its own ftrace_pages, remove + * them from the list. + */ + last_pg = &ftrace_pages_start; + for (pg = ftrace_pages_start; pg; pg = *last_pg) { + rec = &pg->records[0]; if (within_module_core(rec->ip, mod)) { /* - * rec->ip is changed in ftrace_free_rec() - * It should not between s and e if record was freed. + * As core pages are first, the first + * page should never be a module page. */ - FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); - ftrace_free_rec(rec); - } - } while_for_each_ftrace_rec(); + if (WARN_ON(pg == ftrace_pages_start)) + goto out_unlock; + + /* Check if we are deleting the last page */ + if (pg == ftrace_pages) + ftrace_pages = next_to_ftrace_page(last_pg); + + *last_pg = pg->next; + order = get_count_order(pg->size / ENTRIES_PER_PAGE); + free_pages((unsigned long)pg->records, order); + kfree(pg); + } else + last_pg = &pg->next; + } out_unlock: mutex_unlock(&ftrace_lock); } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f04cc3136bd..24aee712745 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1738,11 +1738,121 @@ static int replace_system_preds(struct event_subsystem *system, return -ENOMEM; } +static int create_filter_start(char *filter_str, bool set_str, + struct filter_parse_state **psp, + struct event_filter **filterp) +{ + struct event_filter *filter; + struct filter_parse_state *ps = NULL; + int err = 0; + + WARN_ON_ONCE(*psp || *filterp); + + /* allocate everything, and if any fails, free all and fail */ + filter = __alloc_filter(); + if (filter && set_str) + err = replace_filter_string(filter, filter_str); + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + + if (!filter || !ps || err) { + kfree(ps); + __free_filter(filter); + return -ENOMEM; + } + + /* we're committed to creating a new filter */ + *filterp = filter; + *psp = ps; + + parse_init(ps, filter_ops, filter_str); + err = filter_parse(ps); + if (err && set_str) + append_filter_err(ps, filter); + return err; +} + +static void create_filter_finish(struct filter_parse_state *ps) +{ + if (ps) { + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + } +} + +/** + * create_filter - create a filter for a ftrace_event_call + * @call: ftrace_event_call to create a filter for + * @filter_str: filter string + * @set_str: remember @filter_str and enable detailed error in filter + * @filterp: out param for created filter (always updated on return) + * + * Creates a filter for @call with @filter_str. If @set_str is %true, + * @filter_str is copied and recorded in the new filter. + * + * On success, returns 0 and *@filterp points to the new filter. On + * failure, returns -errno and *@filterp may point to %NULL or to a new + * filter. In the latter case, the returned filter contains error + * information if @set_str is %true and the caller is responsible for + * freeing it. + */ +static int create_filter(struct ftrace_event_call *call, + char *filter_str, bool set_str, + struct event_filter **filterp) +{ + struct event_filter *filter = NULL; + struct filter_parse_state *ps = NULL; + int err; + + err = create_filter_start(filter_str, set_str, &ps, &filter); + if (!err) { + err = replace_preds(call, filter, ps, filter_str, false); + if (err && set_str) + append_filter_err(ps, filter); + } + create_filter_finish(ps); + + *filterp = filter; + return err; +} + +/** + * create_system_filter - create a filter for an event_subsystem + * @system: event_subsystem to create a filter for + * @filter_str: filter string + * @filterp: out param for created filter (always updated on return) + * + * Identical to create_filter() except that it creates a subsystem filter + * and always remembers @filter_str. + */ +static int create_system_filter(struct event_subsystem *system, + char *filter_str, struct event_filter **filterp) +{ + struct event_filter *filter = NULL; + struct filter_parse_state *ps = NULL; + int err; + + err = create_filter_start(filter_str, true, &ps, &filter); + if (!err) { + err = replace_system_preds(system, ps, filter_str); + if (!err) { + /* System filters just show a default message */ + kfree(filter->filter_string); + filter->filter_string = NULL; + } else { + append_filter_err(ps, filter); + } + } + create_filter_finish(ps); + + *filterp = filter; + return err; +} + int apply_event_filter(struct ftrace_event_call *call, char *filter_string) { - struct filter_parse_state *ps; struct event_filter *filter; - struct event_filter *tmp; int err = 0; mutex_lock(&event_mutex); @@ -1759,49 +1869,30 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) goto out_unlock; } - err = -ENOMEM; - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - goto out_unlock; - - filter = __alloc_filter(); - if (!filter) { - kfree(ps); - goto out_unlock; - } - - replace_filter_string(filter, filter_string); - - parse_init(ps, filter_ops, filter_string); - err = filter_parse(ps); - if (err) { - append_filter_err(ps, filter); - goto out; - } + err = create_filter(call, filter_string, true, &filter); - err = replace_preds(call, filter, ps, filter_string, false); - if (err) { - filter_disable(call); - append_filter_err(ps, filter); - } else - call->flags |= TRACE_EVENT_FL_FILTERED; -out: /* * Always swap the call filter with the new filter * even if there was an error. If there was an error * in the filter, we disable the filter and show the error * string */ - tmp = call->filter; - rcu_assign_pointer(call->filter, filter); - if (tmp) { - /* Make sure the call is done with the filter */ - synchronize_sched(); - __free_filter(tmp); + if (filter) { + struct event_filter *tmp = call->filter; + + if (!err) + call->flags |= TRACE_EVENT_FL_FILTERED; + else + filter_disable(call); + + rcu_assign_pointer(call->filter, filter); + + if (tmp) { + /* Make sure the call is done with the filter */ + synchronize_sched(); + __free_filter(tmp); + } } - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); out_unlock: mutex_unlock(&event_mutex); @@ -1811,7 +1902,6 @@ out_unlock: int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string) { - struct filter_parse_state *ps; struct event_filter *filter; int err = 0; @@ -1835,48 +1925,19 @@ int apply_subsystem_event_filter(struct event_subsystem *system, goto out_unlock; } - err = -ENOMEM; - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - goto out_unlock; - - filter = __alloc_filter(); - if (!filter) - goto out; - - /* System filters just show a default message */ - kfree(filter->filter_string); - filter->filter_string = NULL; - - /* - * No event actually uses the system filter - * we can free it without synchronize_sched(). - */ - __free_filter(system->filter); - system->filter = filter; - - parse_init(ps, filter_ops, filter_string); - err = filter_parse(ps); - if (err) - goto err_filter; - - err = replace_system_preds(system, ps, filter_string); - if (err) - goto err_filter; - -out: - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); + err = create_system_filter(system, filter_string, &filter); + if (filter) { + /* + * No event actually uses the system filter + * we can free it without synchronize_sched(). + */ + __free_filter(system->filter); + system->filter = filter; + } out_unlock: mutex_unlock(&event_mutex); return err; - -err_filter: - replace_filter_string(filter, filter_string); - append_filter_err(ps, system->filter); - goto out; } #ifdef CONFIG_PERF_EVENTS @@ -1894,7 +1955,6 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, { int err; struct event_filter *filter; - struct filter_parse_state *ps; struct ftrace_event_call *call; mutex_lock(&event_mutex); @@ -1909,33 +1969,10 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, if (event->filter) goto out_unlock; - filter = __alloc_filter(); - if (!filter) { - err = PTR_ERR(filter); - goto out_unlock; - } - - err = -ENOMEM; - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - goto free_filter; - - parse_init(ps, filter_ops, filter_str); - err = filter_parse(ps); - if (err) - goto free_ps; - - err = replace_preds(call, filter, ps, filter_str, false); + err = create_filter(call, filter_str, false, &filter); if (!err) event->filter = filter; - -free_ps: - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); - -free_filter: - if (err) + else __free_filter(filter); out_unlock: @@ -1954,43 +1991,6 @@ out_unlock: #define CREATE_TRACE_POINTS #include "trace_events_filter_test.h" -static int test_get_filter(char *filter_str, struct ftrace_event_call *call, - struct event_filter **pfilter) -{ - struct event_filter *filter; - struct filter_parse_state *ps; - int err = -ENOMEM; - - filter = __alloc_filter(); - if (!filter) - goto out; - - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - goto free_filter; - - parse_init(ps, filter_ops, filter_str); - err = filter_parse(ps); - if (err) - goto free_ps; - - err = replace_preds(call, filter, ps, filter_str, false); - if (!err) - *pfilter = filter; - - free_ps: - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); - - free_filter: - if (err) - __free_filter(filter); - - out: - return err; -} - #define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \ { \ .filter = FILTER, \ @@ -2109,12 +2109,13 @@ static __init int ftrace_test_event_filter(void) struct test_filter_data_t *d = &test_filter_data[i]; int err; - err = test_get_filter(d->filter, &event_ftrace_test_filter, - &filter); + err = create_filter(&event_ftrace_test_filter, d->filter, + false, &filter); if (err) { printk(KERN_INFO "Failed to get filter for '%s', err %d\n", d->filter, err); + __free_filter(filter); break; } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 77575b386d9..d4545f49242 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -13,6 +13,9 @@ #include <linux/sysctl.h> #include <linux/init.h> #include <linux/fs.h> + +#include <asm/setup.h> + #include "trace.h" #define STACK_TRACE_ENTRIES 500 @@ -133,7 +136,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = stack_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL, }; static ssize_t @@ -311,6 +313,21 @@ static const struct file_operations stack_trace_fops = { .release = seq_release, }; +static int +stack_trace_filter_open(struct inode *inode, struct file *file) +{ + return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER, + inode, file); +} + +static const struct file_operations stack_trace_filter_fops = { + .open = stack_trace_filter_open, + .read = seq_read, + .write = ftrace_filter_write, + .llseek = ftrace_regex_lseek, + .release = ftrace_regex_release, +}; + int stack_trace_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -338,8 +355,13 @@ stack_trace_sysctl(struct ctl_table *table, int write, return ret; } +static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata; + static __init int enable_stacktrace(char *str) { + if (strncmp(str, "_filter=", 8) == 0) + strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE); + stack_tracer_enabled = 1; last_stack_tracer_enabled = 1; return 1; @@ -358,6 +380,12 @@ static __init int stack_trace_init(void) trace_create_file("stack_trace", 0444, d_tracer, NULL, &stack_trace_fops); + trace_create_file("stack_trace_filter", 0444, d_tracer, + NULL, &stack_trace_filter_fops); + + if (stack_trace_filter_buf[0]) + ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); + if (stack_tracer_enabled) register_ftrace_function(&trace_ops); diff --git a/scripts/recordmcount.h b/scripts/recordmcount.h index f40a6af6bf4..54e35c1e594 100644 --- a/scripts/recordmcount.h +++ b/scripts/recordmcount.h @@ -462,7 +462,7 @@ __has_rel_mcount(Elf_Shdr const *const relhdr, /* is SHT_REL or SHT_RELA */ succeed_file(); } if (w(txthdr->sh_type) != SHT_PROGBITS || - !(w(txthdr->sh_flags) & SHF_EXECINSTR)) + !(_w(txthdr->sh_flags) & SHF_EXECINSTR)) return NULL; return txtname; } diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt index 7a527f7e9da..ddc22525228 100644 --- a/tools/perf/Documentation/perf-list.txt +++ b/tools/perf/Documentation/perf-list.txt @@ -21,6 +21,8 @@ EVENT MODIFIERS Events can optionally have a modifer by appending a colon and one or more modifiers. Modifiers allow the user to restrict when events are counted with 'u' for user-space, 'k' for kernel, 'h' for hypervisor. +Additional modifiers are 'G' for guest counting (in KVM guests) and 'H' +for host counting (not in KVM guests). The 'p' modifier can be used for specifying how precise the instruction address should be. The 'p' modifier is currently only implemented for diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index c12659d8cb2..1078c5fadd5 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -1,4 +1,5 @@ tools/perf +include/linux/const.h include/linux/perf_event.h include/linux/rbtree.h include/linux/list.h diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 214ba7f9f57..806e0a28663 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -235,7 +235,7 @@ out_delete: } static const char * const annotate_usage[] = { - "perf annotate [<options>] <command>", + "perf annotate [<options>]", NULL }; @@ -313,10 +313,5 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __used) annotate.sym_hist_filter = argv[0]; } - if (field_sep && *field_sep == '.') { - pr_err("'.' is the only non valid --field-separator argument\n"); - return -1; - } - return __cmd_annotate(&annotate); } diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index fe1ad8f2196..39104c0beea 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -108,7 +108,9 @@ static void setup_cpunode_map(void) continue; cpunode_map[cpu] = mem; } + closedir(dir2); } + closedir(dir1); } static void insert_alloc_stat(unsigned long call_site, unsigned long ptr, @@ -645,6 +647,7 @@ static int setup_sorting(struct list_head *sort_list, const char *arg) break; if (sort_dimension__add(tok, sort_list) < 0) { error("Unknown --sort key: '%s'", tok); + free(str); return -1; } } diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 032324a76b8..9fc6e0fa3dc 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -22,9 +22,6 @@ static const char *file_name; static char name_buffer[256]; -bool perf_host = 1; -bool perf_guest; - static const char * const kvm_usage[] = { "perf kvm [<options>] {top|record|report|diff|buildid-list}", NULL @@ -107,7 +104,8 @@ static int __cmd_buildid_list(int argc, const char **argv) int cmd_kvm(int argc, const char **argv, const char *prefix __used) { - perf_host = perf_guest = 0; + perf_host = 0; + perf_guest = 1; argc = parse_options(argc, argv, kvm_options, kvm_usage, PARSE_OPT_STOP_AT_NON_OPTION); diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index fd1909afcfd..bb68ddf257b 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1018,13 +1018,17 @@ static char *get_script_path(const char *script_root, const char *suffix) __script_root = get_script_root(&script_dirent, suffix); if (__script_root && !strcmp(script_root, __script_root)) { free(__script_root); + closedir(lang_dir); + closedir(scripts_dir); snprintf(script_path, MAXPATHLEN, "%s/%s", lang_path, script_dirent.d_name); return strdup(script_path); } free(__script_root); } + closedir(lang_dir); } + closedir(scripts_dir); return NULL; } diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index 2b9a7f497a2..3854e869dce 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -1396,7 +1396,7 @@ int cmd_test(int argc, const char **argv, const char *prefix __used) NULL, }; const struct option test_options[] = { - OPT_INTEGER('v', "verbose", &verbose, + OPT_INCR('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"), OPT_END() }; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 4f81eeb9987..8f80df89603 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -235,7 +235,6 @@ static struct hist_entry *perf_evsel__add_hist_entry(struct perf_evsel *evsel, if (he == NULL) return NULL; - evsel->hists.stats.total_period += sample->period; hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE); return he; } @@ -889,6 +888,10 @@ try_again: ui__warning("The %s event is not supported.\n", event_name(counter)); goto out_err; + } else if (err == EMFILE) { + ui__warning("Too many events are opened.\n" + "Try again after reducing the number of events\n"); + goto out_err; } ui__warning("The sys_perf_event_open() syscall " diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index fa1837088ca..3f16e08a5c8 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -111,8 +111,11 @@ int perf_evlist__add_default(struct perf_evlist *evlist) .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, }; - struct perf_evsel *evsel = perf_evsel__new(&attr, 0); + struct perf_evsel *evsel; + + event_attr_init(&attr); + evsel = perf_evsel__new(&attr, 0); if (evsel == NULL) goto error; diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index abef2703cd2..6f505d1abac 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -76,21 +76,21 @@ static void hists__calc_col_len(struct hists *hists, struct hist_entry *h) } } -static void hist_entry__add_cpumode_period(struct hist_entry *self, +static void hist_entry__add_cpumode_period(struct hist_entry *he, unsigned int cpumode, u64 period) { switch (cpumode) { case PERF_RECORD_MISC_KERNEL: - self->period_sys += period; + he->period_sys += period; break; case PERF_RECORD_MISC_USER: - self->period_us += period; + he->period_us += period; break; case PERF_RECORD_MISC_GUEST_KERNEL: - self->period_guest_sys += period; + he->period_guest_sys += period; break; case PERF_RECORD_MISC_GUEST_USER: - self->period_guest_us += period; + he->period_guest_us += period; break; default: break; @@ -165,18 +165,18 @@ void hists__decay_entries_threaded(struct hists *hists, static struct hist_entry *hist_entry__new(struct hist_entry *template) { size_t callchain_size = symbol_conf.use_callchain ? sizeof(struct callchain_root) : 0; - struct hist_entry *self = malloc(sizeof(*self) + callchain_size); + struct hist_entry *he = malloc(sizeof(*he) + callchain_size); - if (self != NULL) { - *self = *template; - self->nr_events = 1; - if (self->ms.map) - self->ms.map->referenced = true; + if (he != NULL) { + *he = *template; + he->nr_events = 1; + if (he->ms.map) + he->ms.map->referenced = true; if (symbol_conf.use_callchain) - callchain_init(self->callchain); + callchain_init(he->callchain); } - return self; + return he; } static void hists__inc_nr_entries(struct hists *hists, struct hist_entry *h) @@ -677,15 +677,16 @@ static size_t callchain__fprintf_flat(FILE *fp, struct callchain_node *self, return ret; } -static size_t hist_entry_callchain__fprintf(FILE *fp, struct hist_entry *self, - u64 total_samples, int left_margin) +static size_t hist_entry_callchain__fprintf(struct hist_entry *he, + u64 total_samples, int left_margin, + FILE *fp) { struct rb_node *rb_node; struct callchain_node *chain; size_t ret = 0; u32 entries_printed = 0; - rb_node = rb_first(&self->sorted_chain); + rb_node = rb_first(&he->sorted_chain); while (rb_node) { double percent; @@ -730,35 +731,35 @@ void hists__output_recalc_col_len(struct hists *hists, int max_rows) } } -static int hist_entry__pcnt_snprintf(struct hist_entry *self, char *s, +static int hist_entry__pcnt_snprintf(struct hist_entry *he, char *s, size_t size, struct hists *pair_hists, bool show_displacement, long displacement, - bool color, u64 session_total) + bool color, u64 total_period) { u64 period, total, period_sys, period_us, period_guest_sys, period_guest_us; u64 nr_events; const char *sep = symbol_conf.field_sep; int ret; - if (symbol_conf.exclude_other && !self->parent) + if (symbol_conf.exclude_other && !he->parent) return 0; if (pair_hists) { - period = self->pair ? self->pair->period : 0; - nr_events = self->pair ? self->pair->nr_events : 0; + period = he->pair ? he->pair->period : 0; + nr_events = he->pair ? he->pair->nr_events : 0; total = pair_hists->stats.total_period; - period_sys = self->pair ? self->pair->period_sys : 0; - period_us = self->pair ? self->pair->period_us : 0; - period_guest_sys = self->pair ? self->pair->period_guest_sys : 0; - period_guest_us = self->pair ? self->pair->period_guest_us : 0; + period_sys = he->pair ? he->pair->period_sys : 0; + period_us = he->pair ? he->pair->period_us : 0; + period_guest_sys = he->pair ? he->pair->period_guest_sys : 0; + period_guest_us = he->pair ? he->pair->period_guest_us : 0; } else { - period = self->period; - nr_events = self->nr_events; - total = session_total; - period_sys = self->period_sys; - period_us = self->period_us; - period_guest_sys = self->period_guest_sys; - period_guest_us = self->period_guest_us; + period = he->period; + nr_events = he->nr_events; + total = total_period; + period_sys = he->period_sys; + period_us = he->period_us; + period_guest_sys = he->period_guest_sys; + period_guest_us = he->period_guest_us; } if (total) { @@ -812,8 +813,8 @@ static int hist_entry__pcnt_snprintf(struct hist_entry *self, char *s, if (total > 0) old_percent = (period * 100.0) / total; - if (session_total > 0) - new_percent = (self->period * 100.0) / session_total; + if (total_period > 0) + new_percent = (he->period * 100.0) / total_period; diff = new_percent - old_percent; @@ -862,9 +863,10 @@ int hist_entry__snprintf(struct hist_entry *he, char *s, size_t size, return ret; } -int hist_entry__fprintf(struct hist_entry *he, size_t size, struct hists *hists, - struct hists *pair_hists, bool show_displacement, - long displacement, FILE *fp, u64 session_total) +static int hist_entry__fprintf(struct hist_entry *he, size_t size, + struct hists *hists, struct hists *pair_hists, + bool show_displacement, long displacement, + u64 total_period, FILE *fp) { char bf[512]; int ret; @@ -874,14 +876,14 @@ int hist_entry__fprintf(struct hist_entry *he, size_t size, struct hists *hists, ret = hist_entry__pcnt_snprintf(he, bf, size, pair_hists, show_displacement, displacement, - true, session_total); + true, total_period); hist_entry__snprintf(he, bf + ret, size - ret, hists); return fprintf(fp, "%s\n", bf); } -static size_t hist_entry__fprintf_callchain(struct hist_entry *self, - struct hists *hists, FILE *fp, - u64 session_total) +static size_t hist_entry__fprintf_callchain(struct hist_entry *he, + struct hists *hists, + u64 total_period, FILE *fp) { int left_margin = 0; @@ -889,11 +891,10 @@ static size_t hist_entry__fprintf_callchain(struct hist_entry *self, struct sort_entry *se = list_first_entry(&hist_entry__sort_list, typeof(*se), list); left_margin = hists__col_len(hists, se->se_width_idx); - left_margin -= thread__comm_len(self->thread); + left_margin -= thread__comm_len(he->thread); } - return hist_entry_callchain__fprintf(fp, self, session_total, - left_margin); + return hist_entry_callchain__fprintf(he, total_period, left_margin, fp); } size_t hists__fprintf(struct hists *hists, struct hists *pair, @@ -903,6 +904,7 @@ size_t hists__fprintf(struct hists *hists, struct hists *pair, struct sort_entry *se; struct rb_node *nd; size_t ret = 0; + u64 total_period; unsigned long position = 1; long displacement = 0; unsigned int width; @@ -917,20 +919,6 @@ size_t hists__fprintf(struct hists *hists, struct hists *pair, fprintf(fp, "# %s", pair ? "Baseline" : "Overhead"); - if (symbol_conf.show_nr_samples) { - if (sep) - fprintf(fp, "%cSamples", *sep); - else - fputs(" Samples ", fp); - } - - if (symbol_conf.show_total_period) { - if (sep) - ret += fprintf(fp, "%cPeriod", *sep); - else - ret += fprintf(fp, " Period "); - } - if (symbol_conf.show_cpu_utilization) { if (sep) { ret += fprintf(fp, "%csys", *sep); @@ -940,8 +928,8 @@ size_t hists__fprintf(struct hists *hists, struct hists *pair, ret += fprintf(fp, "%cguest us", *sep); } } else { - ret += fprintf(fp, " sys "); - ret += fprintf(fp, " us "); + ret += fprintf(fp, " sys "); + ret += fprintf(fp, " us "); if (perf_guest) { ret += fprintf(fp, " guest sys "); ret += fprintf(fp, " guest us "); @@ -949,6 +937,20 @@ size_t hists__fprintf(struct hists *hists, struct hists *pair, } } + if (symbol_conf.show_nr_samples) { + if (sep) + fprintf(fp, "%cSamples", *sep); + else + fputs(" Samples ", fp); + } + + if (symbol_conf.show_total_period) { + if (sep) + ret += fprintf(fp, "%cPeriod", *sep); + else + ret += fprintf(fp, " Period "); + } + if (pair) { if (sep) ret += fprintf(fp, "%cDelta", *sep); @@ -993,6 +995,8 @@ size_t hists__fprintf(struct hists *hists, struct hists *pair, goto print_entries; fprintf(fp, "# ........"); + if (symbol_conf.show_cpu_utilization) + fprintf(fp, " ....... ......."); if (symbol_conf.show_nr_samples) fprintf(fp, " .........."); if (symbol_conf.show_total_period) @@ -1025,6 +1029,8 @@ size_t hists__fprintf(struct hists *hists, struct hists *pair, goto out; print_entries: + total_period = hists->stats.total_period; + for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) { struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node); @@ -1040,11 +1046,10 @@ print_entries: ++position; } ret += hist_entry__fprintf(h, max_cols, hists, pair, show_displacement, - displacement, fp, hists->stats.total_period); + displacement, total_period, fp); if (symbol_conf.use_callchain) - ret += hist_entry__fprintf_callchain(h, hists, fp, - hists->stats.total_period); + ret += hist_entry__fprintf_callchain(h, hists, total_period, fp); if (max_rows && ++nr_rows >= max_rows) goto out; diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index ff6f9d56ea4..f55f0a8d1f8 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -66,11 +66,8 @@ struct hists { struct hist_entry *__hists__add_entry(struct hists *self, struct addr_location *al, struct symbol *parent, u64 period); -extern int64_t hist_entry__cmp(struct hist_entry *, struct hist_entry *); -extern int64_t hist_entry__collapse(struct hist_entry *, struct hist_entry *); -int hist_entry__fprintf(struct hist_entry *he, size_t size, struct hists *hists, - struct hists *pair_hists, bool show_displacement, - long displacement, FILE *fp, u64 session_total); +int64_t hist_entry__cmp(struct hist_entry *left, struct hist_entry *right); +int64_t hist_entry__collapse(struct hist_entry *left, struct hist_entry *right); int hist_entry__snprintf(struct hist_entry *self, char *bf, size_t size, struct hists *hists); void hist_entry__free(struct hist_entry *); diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 531c283fc0c..b029296d20d 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -735,8 +735,8 @@ static int parse_event_modifier(const char **strp, struct perf_event_attr *attr) { const char *str = *strp; - int exclude = 0; - int eu = 0, ek = 0, eh = 0, precise = 0; + int exclude = 0, exclude_GH = 0; + int eu = 0, ek = 0, eh = 0, eH = 0, eG = 0, precise = 0; if (!*str) return 0; @@ -760,6 +760,14 @@ parse_event_modifier(const char **strp, struct perf_event_attr *attr) if (!exclude) exclude = eu = ek = eh = 1; eh = 0; + } else if (*str == 'G') { + if (!exclude_GH) + exclude_GH = eG = eH = 1; + eG = 0; + } else if (*str == 'H') { + if (!exclude_GH) + exclude_GH = eG = eH = 1; + eH = 0; } else if (*str == 'p') { precise++; } else @@ -776,6 +784,8 @@ parse_event_modifier(const char **strp, struct perf_event_attr *attr) attr->exclude_kernel = ek; attr->exclude_hv = eh; attr->precise_ip = precise; + attr->exclude_host = eH; + attr->exclude_guest = eG; return 0; } @@ -838,6 +848,7 @@ int parse_events(struct perf_evlist *evlist , const char *str, int unset __used) for (;;) { ostr = str; memset(&attr, 0, sizeof(attr)); + event_attr_init(&attr); ret = parse_event_symbols(evlist, &str, &attr); if (ret == EVT_FAILED) return -1; diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c index ac6830d8292..fc22cf5c605 100644 --- a/tools/perf/util/trace-event-info.c +++ b/tools/perf/util/trace-event-info.c @@ -18,7 +18,6 @@ * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -#include <ctype.h> #include "util.h" #include <dirent.h> #include <mntent.h> diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 5b3ea49aa63..813141047fc 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -1,6 +1,21 @@ +#include "../perf.h" #include "util.h" #include <sys/mman.h> +/* + * XXX We need to find a better place for these things... + */ +bool perf_host = true; +bool perf_guest = true; + +void event_attr_init(struct perf_event_attr *attr) +{ + if (!perf_host) + attr->exclude_host = 1; + if (!perf_guest) + attr->exclude_guest = 1; +} + int mkdir_p(char *path, mode_t mode) { struct stat st; diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 37be34dff79..b9c530cce79 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -242,6 +242,10 @@ int strtailcmp(const char *s1, const char *s2); unsigned long convert_unit(unsigned long value, char *unit); int readn(int fd, void *buf, size_t size); +struct perf_event_attr; + +void event_attr_init(struct perf_event_attr *attr); + #define _STR(x) #x #define STR(x) _STR(x) |