From 16444a8a40d4c7b4f6de34af0cae1f76a4f6c901 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: add basic support for gcc profiler instrumentation If CONFIG_FTRACE is selected and /proc/sys/kernel/ftrace_enabled is set to a non-zero value the ftrace routine will be called everytime we enter a kernel function that is not marked with the "notrace" attribute. The ftrace routine will then call a registered function if a function happens to be registered. [ This code has been highly hacked by Steven Rostedt and Ingo Molnar, so don't blame Arnaldo for all of this ;-) ] Update: It is now possible to register more than one ftrace function. If only one ftrace function is registered, that will be the function that ftrace calls directly. If more than one function is registered, then ftrace will call a function that will loop through the functions to call. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/Makefile') diff --git a/kernel/Makefile b/kernel/Makefile index 1c9938addb9..fa05f6d8bdb 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_FTRACE) += trace/ ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is -- cgit v1.2.3-70-g09d2 From bc0c38d139ec7fcd5c030aea16b008f3732e42ac Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: latency tracer infrastructure This patch adds the latency tracer infrastructure. This patch does not add anything that will select and turn it on, but will be used by later patches. If it were to be compiled, it would add the following files to the debugfs: The root tracing directory: /debugfs/tracing/ This patch also adds the following files: available_tracers list of available tracers. Currently no tracers are available. Looking into this file only shows "none" which is used to unregister all tracers. current_tracer The trace that is currently active. Empty on start up. To switch to a tracer simply echo one of the tracers that are listed in available_tracers: example: (used with later patches) echo function > /debugfs/tracing/current_tracer To disable the tracer: echo disable > /debugfs/tracing/current_tracer tracing_enabled echoing "1" into this file starts the ftrace function tracing (if sysctl kernel.ftrace_enabled=1) echoing "0" turns it off. latency_trace This file is readonly and holds the result of the trace. trace This file outputs a easier to read version of the trace. iter_ctrl Controls the way the output of traces look. So far there's two controls: echoing in "symonly" will only show the kallsyms variables without the addresses (if kallsyms was configured) echoing in "verbose" will change the output to show a lot more data, but not very easy to understand by humans. echoing in "nosymonly" turns off symonly. echoing in "noverbose" turns off verbose. Signed-off-by: Steven Rostedt Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/Makefile | 1 + kernel/trace/Kconfig | 5 + kernel/trace/Makefile | 2 + kernel/trace/trace.c | 1547 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 184 ++++++ 5 files changed, 1739 insertions(+) create mode 100644 kernel/trace/trace.c create mode 100644 kernel/trace/trace.h (limited to 'kernel/Makefile') diff --git a/kernel/Makefile b/kernel/Makefile index fa05f6d8bdb..7e344e7b0cb 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -70,6 +70,7 @@ obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_LATENCYTOP) += latencytop.o obj-$(CONFIG_FTRACE) += trace/ +obj-$(CONFIG_TRACING) += trace/ ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8185c91417b..ce70677afbf 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -3,3 +3,8 @@ # config HAVE_FTRACE bool + +config TRACING + bool + select DEBUG_FS + diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index bf4fd215a6a..7af40317525 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -1,3 +1,5 @@ obj-$(CONFIG_FTRACE) += libftrace.o +obj-$(CONFIG_TRACING) += trace.o + libftrace-y := ftrace.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c new file mode 100644 index 00000000000..1b8eca7650d --- /dev/null +++ b/kernel/trace/trace.c @@ -0,0 +1,1547 @@ +/* + * ring buffer based function tracer + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * Originally taken from the RT patch by: + * Arnaldo Carvalho de Melo + * + * Based on code from the latency_tracer, that is: + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; +unsigned long __read_mostly tracing_thresh; + +static long notrace +ns2usecs(cycle_t nsec) +{ + nsec += 500; + do_div(nsec, 1000); + return nsec; +} + +static atomic_t tracer_counter; +static struct trace_array global_trace; + +static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); + +static struct trace_array max_tr; + +static DEFINE_PER_CPU(struct trace_array_cpu, max_data); + +static int tracer_enabled; +static unsigned long trace_nr_entries = 4096UL; + +static struct tracer *trace_types __read_mostly; +static struct tracer *current_trace __read_mostly; +static int max_tracer_type_len; + +static DEFINE_MUTEX(trace_types_lock); + +static int __init set_nr_entries(char *str) +{ + if (!str) + return 0; + trace_nr_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("trace_entries=", set_nr_entries); + +enum trace_type { + __TRACE_FIRST_TYPE = 0, + + TRACE_FN, + TRACE_CTX, + + __TRACE_LAST_TYPE +}; + +enum trace_flag_type { + TRACE_FLAG_IRQS_OFF = 0x01, + TRACE_FLAG_NEED_RESCHED = 0x02, + TRACE_FLAG_HARDIRQ = 0x04, + TRACE_FLAG_SOFTIRQ = 0x08, +}; + +enum trace_iterator_flags { + TRACE_ITER_PRINT_PARENT = 0x01, + TRACE_ITER_SYM_OFFSET = 0x02, + TRACE_ITER_SYM_ADDR = 0x04, + TRACE_ITER_VERBOSE = 0x08, +}; + +#define TRACE_ITER_SYM_MASK \ + (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) + +/* These must match the bit postions above */ +static const char *trace_options[] = { + "print-parent", + "sym-offset", + "sym-addr", + "verbose", + NULL +}; + +static unsigned trace_flags; + + +/* + * Copy the new maximum trace into the separate maximum-trace + * structure. (this way the maximum trace is permanently saved, + * for later retrieval via /debugfs/tracing/latency_trace) + */ +static void notrace +__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct trace_array_cpu *data = tr->data[cpu]; + + max_tr.cpu = cpu; + max_tr.time_start = data->preempt_timestamp; + + data = max_tr.data[cpu]; + data->saved_latency = tracing_max_latency; + + memcpy(data->comm, tsk->comm, TASK_COMM_LEN); + data->pid = tsk->pid; + data->uid = tsk->uid; + data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; + data->policy = tsk->policy; + data->rt_priority = tsk->rt_priority; + + /* record this tasks comm */ + tracing_record_cmdline(current); +} + +notrace void +update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct trace_array_cpu *data; + void *save_trace; + int i; + + /* clear out all the previous traces */ + for_each_possible_cpu(i) { + data = tr->data[i]; + save_trace = max_tr.data[i]->trace; + memcpy(max_tr.data[i], data, sizeof(*data)); + data->trace = save_trace; + } + + __update_max_tr(tr, tsk, cpu); +} + +/** + * update_max_tr_single - only copy one trace over, and reset the rest + * @tr - tracer + * @tsk - task with the latency + * @cpu - the cpu of the buffer to copy. + */ +notrace void +update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct trace_array_cpu *data = tr->data[cpu]; + void *save_trace; + int i; + + for_each_possible_cpu(i) + tracing_reset(max_tr.data[i]); + + save_trace = max_tr.data[cpu]->trace; + memcpy(max_tr.data[cpu], data, sizeof(*data)); + data->trace = save_trace; + + __update_max_tr(tr, tsk, cpu); +} + +int register_tracer(struct tracer *type) +{ + struct tracer *t; + int len; + int ret = 0; + + if (!type->name) { + pr_info("Tracer must have a name\n"); + return -1; + } + + mutex_lock(&trace_types_lock); + for (t = trace_types; t; t = t->next) { + if (strcmp(type->name, t->name) == 0) { + /* already found */ + pr_info("Trace %s already registered\n", + type->name); + ret = -1; + goto out; + } + } + + type->next = trace_types; + trace_types = type; + len = strlen(type->name); + if (len > max_tracer_type_len) + max_tracer_type_len = len; + out: + mutex_unlock(&trace_types_lock); + + return ret; +} + +void unregister_tracer(struct tracer *type) +{ + struct tracer **t; + int len; + + mutex_lock(&trace_types_lock); + for (t = &trace_types; *t; t = &(*t)->next) { + if (*t == type) + goto found; + } + pr_info("Trace %s not registered\n", type->name); + goto out; + + found: + *t = (*t)->next; + if (strlen(type->name) != max_tracer_type_len) + goto out; + + max_tracer_type_len = 0; + for (t = &trace_types; *t; t = &(*t)->next) { + len = strlen((*t)->name); + if (len > max_tracer_type_len) + max_tracer_type_len = len; + } + out: + mutex_unlock(&trace_types_lock); +} + +void notrace tracing_reset(struct trace_array_cpu *data) +{ + data->trace_idx = 0; + atomic_set(&data->underrun, 0); +} + +#ifdef CONFIG_FTRACE +static void notrace +function_trace_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (unlikely(!tracer_enabled)) + return; + + raw_local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + ftrace(tr, data, ip, parent_ip, flags); + + atomic_dec(&data->disabled); + raw_local_irq_restore(flags); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = function_trace_call, +}; +#endif + +notrace void tracing_start_function_trace(void) +{ + register_ftrace_function(&trace_ops); +} + +notrace void tracing_stop_function_trace(void) +{ + unregister_ftrace_function(&trace_ops); +} + +#define SAVED_CMDLINES 128 +static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; +static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; +static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; +static int cmdline_idx; +static DEFINE_SPINLOCK(trace_cmdline_lock); +atomic_t trace_record_cmdline_disabled; + +static void trace_init_cmdlines(void) +{ + memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline)); + memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid)); + cmdline_idx = 0; +} + +notrace void trace_stop_cmdline_recording(void); + +static void notrace trace_save_cmdline(struct task_struct *tsk) +{ + unsigned map; + unsigned idx; + + if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) + return; + + /* + * It's not the end of the world if we don't get + * the lock, but we also don't want to spin + * nor do we want to disable interrupts, + * so if we miss here, then better luck next time. + */ + if (!spin_trylock(&trace_cmdline_lock)) + return; + + idx = map_pid_to_cmdline[tsk->pid]; + if (idx >= SAVED_CMDLINES) { + idx = (cmdline_idx + 1) % SAVED_CMDLINES; + + map = map_cmdline_to_pid[idx]; + if (map <= PID_MAX_DEFAULT) + map_pid_to_cmdline[map] = (unsigned)-1; + + map_pid_to_cmdline[tsk->pid] = idx; + + cmdline_idx = idx; + } + + memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); + + spin_unlock(&trace_cmdline_lock); +} + +static notrace char *trace_find_cmdline(int pid) +{ + char *cmdline = "<...>"; + unsigned map; + + if (!pid) + return ""; + + if (pid > PID_MAX_DEFAULT) + goto out; + + map = map_pid_to_cmdline[pid]; + if (map >= SAVED_CMDLINES) + goto out; + + cmdline = saved_cmdlines[map]; + + out: + return cmdline; +} + +notrace void tracing_record_cmdline(struct task_struct *tsk) +{ + if (atomic_read(&trace_record_cmdline_disabled)) + return; + + trace_save_cmdline(tsk); +} + +static inline notrace struct trace_entry * +tracing_get_trace_entry(struct trace_array *tr, + struct trace_array_cpu *data) +{ + unsigned long idx, idx_next; + struct trace_entry *entry; + + idx = data->trace_idx; + idx_next = idx + 1; + + if (unlikely(idx_next >= tr->entries)) { + atomic_inc(&data->underrun); + idx_next = 0; + } + + data->trace_idx = idx_next; + + if (unlikely(idx_next != 0 && atomic_read(&data->underrun))) + atomic_inc(&data->underrun); + + entry = data->trace + idx * TRACE_ENTRY_SIZE; + + return entry; +} + +static inline notrace void +tracing_generic_entry_update(struct trace_entry *entry, + unsigned long flags) +{ + struct task_struct *tsk = current; + unsigned long pc; + + pc = preempt_count(); + + entry->idx = atomic_inc_return(&tracer_counter); + entry->preempt_count = pc & 0xff; + entry->pid = tsk->pid; + entry->t = now(raw_smp_processor_id()); + entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | + ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | + ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | + (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); +} + +notrace void +ftrace(struct trace_array *tr, struct trace_array_cpu *data, + unsigned long ip, unsigned long parent_ip, + unsigned long flags) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_FN; + entry->fn.ip = ip; + entry->fn.parent_ip = parent_ip; +} + +notrace void +tracing_sched_switch_trace(struct trace_array *tr, + struct trace_array_cpu *data, + struct task_struct *prev, struct task_struct *next, + unsigned long flags) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_CTX; + entry->ctx.prev_pid = prev->pid; + entry->ctx.prev_prio = prev->prio; + entry->ctx.prev_state = prev->state; + entry->ctx.next_pid = next->pid; + entry->ctx.next_prio = next->prio; +} + +enum trace_file_type { + TRACE_FILE_LAT_FMT = 1, +}; + +static struct trace_entry * +trace_entry_idx(struct trace_array *tr, unsigned long idx, int cpu) +{ + struct trace_entry *array = tr->data[cpu]->trace; + unsigned long underrun; + + if (idx >= tr->entries) + return NULL; + + underrun = atomic_read(&tr->data[cpu]->underrun); + if (underrun) + idx = ((underrun - 1) + idx) % tr->entries; + else if (idx >= tr->data[cpu]->trace_idx) + return NULL; + + return &array[idx]; +} + +static struct notrace trace_entry * +find_next_entry(struct trace_iterator *iter, int *ent_cpu) +{ + struct trace_array *tr = iter->tr; + struct trace_entry *ent, *next = NULL; + int next_cpu = -1; + int cpu; + + for_each_possible_cpu(cpu) { + if (!tr->data[cpu]->trace) + continue; + ent = trace_entry_idx(tr, iter->next_idx[cpu], cpu); + if (ent && + (!next || (long)(next->idx - ent->idx) > 0)) { + next = ent; + next_cpu = cpu; + } + } + + if (ent_cpu) + *ent_cpu = next_cpu; + + return next; +} + +static void *find_next_entry_inc(struct trace_iterator *iter) +{ + struct trace_entry *next; + int next_cpu = -1; + + next = find_next_entry(iter, &next_cpu); + + if (next) { + iter->next_idx[next_cpu]++; + iter->idx++; + } + iter->ent = next; + iter->cpu = next_cpu; + + return next ? iter : NULL; +} + +static void notrace * +s_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_iterator *iter = m->private; + void *ent; + void *last_ent = iter->ent; + int i = (int)*pos; + + (*pos)++; + + /* can't go backwards */ + if (iter->idx > i) + return NULL; + + if (iter->idx < 0) + ent = find_next_entry_inc(iter); + else + ent = iter; + + while (ent && iter->idx < i) + ent = find_next_entry_inc(iter); + + iter->pos = *pos; + + if (last_ent && !ent) + seq_puts(m, "\n\nvim:ft=help\n"); + + return ent; +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + struct trace_iterator *iter = m->private; + void *p = NULL; + loff_t l = 0; + int i; + + mutex_lock(&trace_types_lock); + + if (!current_trace || current_trace != iter->trace) + return NULL; + + atomic_inc(&trace_record_cmdline_disabled); + + /* let the tracer grab locks here if needed */ + if (current_trace->start) + current_trace->start(iter); + + if (*pos != iter->pos) { + iter->ent = NULL; + iter->cpu = 0; + iter->idx = -1; + + for (i = 0; i < NR_CPUS; i++) + iter->next_idx[i] = 0; + + for (p = iter; p && l < *pos; p = s_next(m, p, &l)) + ; + + } else { + l = *pos; + p = s_next(m, p, &l); + } + + return p; +} + +static void s_stop(struct seq_file *m, void *p) +{ + struct trace_iterator *iter = m->private; + + atomic_dec(&trace_record_cmdline_disabled); + + /* let the tracer release locks here if needed */ + if (current_trace && current_trace == iter->trace && iter->trace->stop) + iter->trace->stop(iter); + + mutex_unlock(&trace_types_lock); +} + +static void +seq_print_sym_short(struct seq_file *m, const char *fmt, unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + + kallsyms_lookup(address, NULL, NULL, NULL, str); + + seq_printf(m, fmt, str); +#endif +} + +static void +seq_print_sym_offset(struct seq_file *m, const char *fmt, unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + + sprint_symbol(str, address); + seq_printf(m, fmt, str); +#endif +} + +#ifndef CONFIG_64BIT +# define IP_FMT "%08lx" +#else +# define IP_FMT "%016lx" +#endif + +static void notrace +seq_print_ip_sym(struct seq_file *m, unsigned long ip, unsigned long sym_flags) +{ + if (!ip) { + seq_printf(m, "0"); + return; + } + + if (sym_flags & TRACE_ITER_SYM_OFFSET) + seq_print_sym_offset(m, "%s", ip); + else + seq_print_sym_short(m, "%s", ip); + + if (sym_flags & TRACE_ITER_SYM_ADDR) + seq_printf(m, " <" IP_FMT ">", ip); +} + +static void notrace print_lat_help_header(struct seq_file *m) +{ + seq_puts(m, "# _------=> CPU# \n"); + seq_puts(m, "# / _-----=> irqs-off \n"); + seq_puts(m, "# | / _----=> need-resched \n"); + seq_puts(m, "# || / _---=> hardirq/softirq \n"); + seq_puts(m, "# ||| / _--=> preempt-depth \n"); + seq_puts(m, "# |||| / \n"); + seq_puts(m, "# ||||| delay \n"); + seq_puts(m, "# cmd pid ||||| time | caller \n"); + seq_puts(m, "# \\ / ||||| \\ | / \n"); +} + +static void notrace print_func_help_header(struct seq_file *m) +{ + seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); + seq_puts(m, "# | | | | |\n"); +} + + +static void notrace +print_trace_header(struct seq_file *m, struct trace_iterator *iter) +{ + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_array *tr = iter->tr; + struct trace_array_cpu *data = tr->data[tr->cpu]; + struct tracer *type = current_trace; + unsigned long underruns = 0; + unsigned long underrun; + unsigned long entries = 0; + int cpu; + const char *name = "preemption"; + + if (type) + name = type->name; + + for_each_possible_cpu(cpu) { + if (tr->data[cpu]->trace) { + underrun = atomic_read(&tr->data[cpu]->underrun); + if (underrun) { + underruns += underrun; + entries += tr->entries; + } else + entries += tr->data[cpu]->trace_idx; + } + } + + seq_printf(m, "%s latency trace v1.1.5 on %s\n", + name, UTS_RELEASE); + seq_puts(m, "-----------------------------------" + "---------------------------------\n"); + seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |" + " (M:%s VP:%d, KP:%d, SP:%d HP:%d", + data->saved_latency, + entries, + (entries + underruns), + tr->cpu, +#if defined(CONFIG_PREEMPT_NONE) + "server", +#elif defined(CONFIG_PREEMPT_VOLUNTARY) + "desktop", +#elif defined(CONFIG_PREEMPT_DESKTOP) + "preempt", +#else + "unknown", +#endif + /* These are reserved for later use */ + 0, 0, 0, 0); +#ifdef CONFIG_SMP + seq_printf(m, " #P:%d)\n", num_online_cpus()); +#else + seq_puts(m, ")\n"); +#endif + seq_puts(m, " -----------------\n"); + seq_printf(m, " | task: %.16s-%d " + "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", + data->comm, data->pid, data->uid, data->nice, + data->policy, data->rt_priority); + seq_puts(m, " -----------------\n"); + + if (data->critical_start) { + seq_puts(m, " => started at: "); + seq_print_ip_sym(m, data->critical_start, sym_flags); + seq_puts(m, "\n => ended at: "); + seq_print_ip_sym(m, data->critical_end, sym_flags); + seq_puts(m, "\n"); + } + + seq_puts(m, "\n"); +} + +unsigned long nsecs_to_usecs(unsigned long nsecs) +{ + return nsecs / 1000; +} + +static void notrace +lat_print_generic(struct seq_file *m, struct trace_entry *entry, int cpu) +{ + int hardirq, softirq; + char *comm; + + comm = trace_find_cmdline(entry->pid); + + seq_printf(m, "%8.8s-%-5d ", comm, entry->pid); + seq_printf(m, "%d", cpu); + seq_printf(m, "%c%c", + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.', + ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + if (hardirq && softirq) + seq_putc(m, 'H'); + else { + if (hardirq) + seq_putc(m, 'h'); + else { + if (softirq) + seq_putc(m, 's'); + else + seq_putc(m, '.'); + } + } + + if (entry->preempt_count) + seq_printf(m, "%x", entry->preempt_count); + else + seq_puts(m, "."); +} + +unsigned long preempt_mark_thresh = 100; + +static void notrace +lat_print_timestamp(struct seq_file *m, unsigned long long abs_usecs, + unsigned long rel_usecs) +{ + seq_printf(m, " %4lldus", abs_usecs); + if (rel_usecs > preempt_mark_thresh) + seq_puts(m, "!: "); + else if (rel_usecs > 1) + seq_puts(m, "+: "); + else + seq_puts(m, " : "); +} + +static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; + +static void notrace +print_lat_fmt(struct seq_file *m, struct trace_iterator *iter, + unsigned int trace_idx, int cpu) +{ + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_entry *next_entry = find_next_entry(iter, NULL); + unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); + struct trace_entry *entry = iter->ent; + unsigned long abs_usecs; + unsigned long rel_usecs; + char *comm; + int S; + + if (!next_entry) + next_entry = entry; + rel_usecs = ns2usecs(next_entry->t - entry->t); + abs_usecs = ns2usecs(entry->t - iter->tr->time_start); + + if (verbose) { + comm = trace_find_cmdline(entry->pid); + seq_printf(m, "%16s %5d %d %d %08x %08x [%08lx]" + " %ld.%03ldms (+%ld.%03ldms): ", + comm, + entry->pid, cpu, entry->flags, + entry->preempt_count, trace_idx, + ns2usecs(entry->t), + abs_usecs/1000, + abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000); + } else { + lat_print_generic(m, entry, cpu); + lat_print_timestamp(m, abs_usecs, rel_usecs); + } + switch (entry->type) { + case TRACE_FN: + seq_print_ip_sym(m, entry->fn.ip, sym_flags); + seq_puts(m, " ("); + seq_print_ip_sym(m, entry->fn.parent_ip, sym_flags); + seq_puts(m, ")\n"); + break; + case TRACE_CTX: + S = entry->ctx.prev_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.prev_state] : 'X'; + comm = trace_find_cmdline(entry->ctx.next_pid); + seq_printf(m, " %d:%d:%c --> %d:%d %s\n", + entry->ctx.prev_pid, + entry->ctx.prev_prio, + S, + entry->ctx.next_pid, + entry->ctx.next_prio, + comm); + break; + } +} + +static void notrace +print_trace_fmt(struct seq_file *m, struct trace_iterator *iter) +{ + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_entry *entry = iter->ent; + unsigned long usec_rem; + unsigned long long t; + unsigned long secs; + char *comm; + int S; + + comm = trace_find_cmdline(iter->ent->pid); + + t = ns2usecs(entry->t); + usec_rem = do_div(t, 1000000ULL); + secs = (unsigned long)t; + + seq_printf(m, "%16s-%-5d ", comm, entry->pid); + seq_printf(m, "[%02d] ", iter->cpu); + seq_printf(m, "%5lu.%06lu: ", secs, usec_rem); + + switch (entry->type) { + case TRACE_FN: + seq_print_ip_sym(m, entry->fn.ip, sym_flags); + if ((sym_flags & TRACE_ITER_PRINT_PARENT) && + entry->fn.parent_ip) { + seq_printf(m, " <-"); + seq_print_ip_sym(m, entry->fn.parent_ip, sym_flags); + } + break; + case TRACE_CTX: + S = entry->ctx.prev_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.prev_state] : 'X'; + seq_printf(m, " %d:%d:%c ==> %d:%d\n", + entry->ctx.prev_pid, + entry->ctx.prev_prio, + S, + entry->ctx.next_pid, + entry->ctx.next_prio); + break; + } + seq_printf(m, "\n"); +} + +static int trace_empty(struct trace_iterator *iter) +{ + struct trace_array_cpu *data; + int cpu; + + for_each_possible_cpu(cpu) { + data = iter->tr->data[cpu]; + + if (data->trace && + (data->trace_idx || + atomic_read(&data->underrun))) + return 0; + } + return 1; +} + +static int s_show(struct seq_file *m, void *v) +{ + struct trace_iterator *iter = v; + + if (iter->ent == NULL) { + if (iter->tr) { + seq_printf(m, "# tracer: %s\n", iter->trace->name); + seq_puts(m, "#\n"); + } + if (iter->iter_flags & TRACE_FILE_LAT_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return 0; + print_trace_header(m, iter); + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_lat_help_header(m); + } else { + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_func_help_header(m); + } + } else { + if (iter->iter_flags & TRACE_FILE_LAT_FMT) + print_lat_fmt(m, iter, iter->idx, iter->cpu); + else + print_trace_fmt(m, iter); + } + + return 0; +} + +static struct seq_operations tracer_seq_ops = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +static struct trace_iterator notrace * +__tracing_open(struct inode *inode, struct file *file, int *ret) +{ + struct trace_iterator *iter; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) { + *ret = -ENOMEM; + goto out; + } + + mutex_lock(&trace_types_lock); + if (current_trace && current_trace->print_max) + iter->tr = &max_tr; + else + iter->tr = inode->i_private; + iter->trace = current_trace; + iter->pos = -1; + + /* TODO stop tracer */ + *ret = seq_open(file, &tracer_seq_ops); + if (!*ret) { + struct seq_file *m = file->private_data; + m->private = iter; + + /* stop the trace while dumping */ + if (iter->tr->ctrl) + tracer_enabled = 0; + + if (iter->trace && iter->trace->open) + iter->trace->open(iter); + } else { + kfree(iter); + iter = NULL; + } + mutex_unlock(&trace_types_lock); + + out: + return iter; +} + +int tracing_open_generic(struct inode *inode, struct file *filp) +{ + filp->private_data = inode->i_private; + return 0; +} + +int tracing_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct trace_iterator *iter = m->private; + + mutex_lock(&trace_types_lock); + if (iter->trace && iter->trace->close) + iter->trace->close(iter); + + /* reenable tracing if it was previously enabled */ + if (iter->tr->ctrl) + tracer_enabled = 1; + mutex_unlock(&trace_types_lock); + + seq_release(inode, file); + kfree(iter); + return 0; +} + +static int tracing_open(struct inode *inode, struct file *file) +{ + int ret; + + __tracing_open(inode, file, &ret); + + return ret; +} + +static int tracing_lt_open(struct inode *inode, struct file *file) +{ + struct trace_iterator *iter; + int ret; + + iter = __tracing_open(inode, file, &ret); + + if (!ret) + iter->iter_flags |= TRACE_FILE_LAT_FMT; + + return ret; +} + + +static void notrace * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct tracer *t = m->private; + + (*pos)++; + + if (t) + t = t->next; + + m->private = t; + + return t; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + struct tracer *t = m->private; + loff_t l = 0; + + mutex_lock(&trace_types_lock); + for (; t && l < *pos; t = t_next(m, t, &l)) + ; + + return t; +} + +static void t_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&trace_types_lock); +} + +static int t_show(struct seq_file *m, void *v) +{ + struct tracer *t = v; + + if (!t) + return 0; + + seq_printf(m, "%s", t->name); + if (t->next) + seq_putc(m, ' '); + else + seq_putc(m, '\n'); + + return 0; +} + +static struct seq_operations show_traces_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int show_traces_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &show_traces_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = trace_types; + } + + return ret; +} + +static struct file_operations tracing_fops = { + .open = tracing_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_release, +}; + +static struct file_operations tracing_lt_fops = { + .open = tracing_lt_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_release, +}; + +static struct file_operations show_traces_fops = { + .open = show_traces_open, + .read = seq_read, + .release = seq_release, +}; + +static ssize_t +tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char *buf; + int r = 0; + int len = 0; + int i; + + /* calulate max size */ + for (i = 0; trace_options[i]; i++) { + len += strlen(trace_options[i]); + len += 3; /* "no" and space */ + } + + /* +2 for \n and \0 */ + buf = kmalloc(len + 2, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + for (i = 0; trace_options[i]; i++) { + if (trace_flags & (1 << i)) + r += sprintf(buf + r, "%s ", trace_options[i]); + else + r += sprintf(buf + r, "no%s ", trace_options[i]); + } + + r += sprintf(buf + r, "\n"); + WARN_ON(r >= len + 2); + + r = simple_read_from_buffer(ubuf, cnt, ppos, + buf, r); + + kfree(buf); + + return r; +} + +static ssize_t +tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp = buf; + int neg = 0; + int i; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (strncmp(buf, "no", 2) == 0) { + neg = 1; + cmp += 2; + } + + for (i = 0; trace_options[i]; i++) { + int len = strlen(trace_options[i]); + + if (strncmp(cmp, trace_options[i], len) == 0) { + if (neg) + trace_flags &= ~(1 << i); + else + trace_flags |= (1 << i); + break; + } + } + + filp->f_pos += cnt; + + return cnt; +} + +static struct file_operations tracing_iter_fops = { + .open = tracing_open_generic, + .read = tracing_iter_ctrl_read, + .write = tracing_iter_ctrl_write, +}; + +static ssize_t +tracing_ctrl_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r; + + r = sprintf(buf, "%ld\n", tr->ctrl); + return simple_read_from_buffer(ubuf, cnt, ppos, + buf, r); +} + +static ssize_t +tracing_ctrl_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + long val; + char buf[64]; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + val = simple_strtoul(buf, NULL, 10); + + val = !!val; + + mutex_lock(&trace_types_lock); + if (tr->ctrl ^ val) { + if (val) + tracer_enabled = 1; + else + tracer_enabled = 0; + + tr->ctrl = val; + + if (current_trace && current_trace->ctrl_update) + current_trace->ctrl_update(tr); + } + mutex_unlock(&trace_types_lock); + + filp->f_pos += cnt; + + return cnt; +} + +static ssize_t +tracing_set_trace_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[max_tracer_type_len+2]; + int r; + + mutex_lock(&trace_types_lock); + if (current_trace) + r = sprintf(buf, "%s\n", current_trace->name); + else + r = sprintf(buf, "\n"); + mutex_unlock(&trace_types_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, + buf, r); +} + +static ssize_t +tracing_set_trace_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = &global_trace; + struct tracer *t; + char buf[max_tracer_type_len+1]; + int i; + + if (cnt > max_tracer_type_len) + cnt = max_tracer_type_len; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + /* strip ending whitespace. */ + for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) + buf[i] = 0; + + mutex_lock(&trace_types_lock); + for (t = trace_types; t; t = t->next) { + if (strcmp(t->name, buf) == 0) + break; + } + if (!t || t == current_trace) + goto out; + + if (current_trace && current_trace->reset) + current_trace->reset(tr); + + current_trace = t; + if (t->init) + t->init(tr); + + out: + mutex_unlock(&trace_types_lock); + + filp->f_pos += cnt; + + return cnt; +} + +static ssize_t +tracing_max_lat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long *ptr = filp->private_data; + char buf[64]; + int r; + + r = snprintf(buf, 64, "%ld\n", + *ptr == (unsigned long)-1 ? -1 : nsecs_to_usecs(*ptr)); + if (r > 64) + r = 64; + return simple_read_from_buffer(ubuf, cnt, ppos, + buf, r); +} + +static ssize_t +tracing_max_lat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + long *ptr = filp->private_data; + long val; + char buf[64]; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + val = simple_strtoul(buf, NULL, 10); + + *ptr = val * 1000; + + return cnt; +} + +static struct file_operations tracing_max_lat_fops = { + .open = tracing_open_generic, + .read = tracing_max_lat_read, + .write = tracing_max_lat_write, +}; + +static struct file_operations tracing_ctrl_fops = { + .open = tracing_open_generic, + .read = tracing_ctrl_read, + .write = tracing_ctrl_write, +}; + +static struct file_operations set_tracer_fops = { + .open = tracing_open_generic, + .read = tracing_set_trace_read, + .write = tracing_set_trace_write, +}; + +#ifdef CONFIG_DYNAMIC_FTRACE + +static ssize_t +tracing_read_long(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long *p = filp->private_data; + char buf[64]; + int r; + + r = sprintf(buf, "%ld\n", *p); + return simple_read_from_buffer(ubuf, cnt, ppos, + buf, r); +} + +static struct file_operations tracing_read_long_fops = { + .open = tracing_open_generic, + .read = tracing_read_long, +}; +#endif + +static struct dentry *d_tracer; + +struct dentry *tracing_init_dentry(void) +{ + static int once; + + if (d_tracer) + return d_tracer; + + d_tracer = debugfs_create_dir("tracing", NULL); + + if (!d_tracer && !once) { + once = 1; + pr_warning("Could not create debugfs directory 'tracing'\n"); + return NULL; + } + + return d_tracer; +} + +static __init void tracer_init_debugfs(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("tracing_enabled", 0644, d_tracer, + &global_trace, &tracing_ctrl_fops); + if (!entry) + pr_warning("Could not create debugfs 'tracing_enabled' entry\n"); + + entry = debugfs_create_file("iter_ctrl", 0644, d_tracer, + NULL, &tracing_iter_fops); + if (!entry) + pr_warning("Could not create debugfs 'iter_ctrl' entry\n"); + + entry = debugfs_create_file("latency_trace", 0444, d_tracer, + &global_trace, &tracing_lt_fops); + if (!entry) + pr_warning("Could not create debugfs 'latency_trace' entry\n"); + + entry = debugfs_create_file("trace", 0444, d_tracer, + &global_trace, &tracing_fops); + if (!entry) + pr_warning("Could not create debugfs 'trace' entry\n"); + + entry = debugfs_create_file("available_tracers", 0444, d_tracer, + &global_trace, &show_traces_fops); + if (!entry) + pr_warning("Could not create debugfs 'trace' entry\n"); + + entry = debugfs_create_file("current_tracer", 0444, d_tracer, + &global_trace, &set_tracer_fops); + if (!entry) + pr_warning("Could not create debugfs 'trace' entry\n"); + + entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer, + &tracing_max_latency, + &tracing_max_lat_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'tracing_max_latency' entry\n"); + + entry = debugfs_create_file("tracing_thresh", 0644, d_tracer, + &tracing_thresh, &tracing_max_lat_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'tracing_threash' entry\n"); + +#ifdef CONFIG_DYNAMIC_FTRACE + entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, + &ftrace_update_tot_cnt, + &tracing_read_long_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'dyn_ftrace_total_info' entry\n"); +#endif +} + +/* dummy trace to disable tracing */ +static struct tracer no_tracer __read_mostly = +{ + .name = "none", +}; + +static inline notrace int page_order(const unsigned long size) +{ + const unsigned long nr_pages = DIV_ROUND_UP(size, PAGE_SIZE); + return ilog2(roundup_pow_of_two(nr_pages)); +} + +__init static int tracer_alloc_buffers(void) +{ + const int order = page_order(trace_nr_entries * TRACE_ENTRY_SIZE); + const unsigned long size = (1UL << order) << PAGE_SHIFT; + struct trace_entry *array; + int i; + + for_each_possible_cpu(i) { + global_trace.data[i] = &per_cpu(global_trace_cpu, i); + max_tr.data[i] = &per_cpu(max_data, i); + + array = (struct trace_entry *) + __get_free_pages(GFP_KERNEL, order); + if (array == NULL) { + printk(KERN_ERR "tracer: failed to allocate" + " %ld bytes for trace buffer!\n", size); + goto free_buffers; + } + global_trace.data[i]->trace = array; + +/* Only allocate if we are actually using the max trace */ +#ifdef CONFIG_TRACER_MAX_TRACE + array = (struct trace_entry *) + __get_free_pages(GFP_KERNEL, order); + if (array == NULL) { + printk(KERN_ERR "wakeup tracer: failed to allocate" + " %ld bytes for trace buffer!\n", size); + goto free_buffers; + } + max_tr.data[i]->trace = array; +#endif + } + + /* + * Since we allocate by orders of pages, we may be able to + * round up a bit. + */ + global_trace.entries = size / TRACE_ENTRY_SIZE; + max_tr.entries = global_trace.entries; + + pr_info("tracer: %ld bytes allocated for %ld", + size, trace_nr_entries); + pr_info(" entries of %ld bytes\n", (long)TRACE_ENTRY_SIZE); + pr_info(" actual entries %ld\n", global_trace.entries); + + tracer_init_debugfs(); + + trace_init_cmdlines(); + + register_tracer(&no_tracer); + current_trace = &no_tracer; + + return 0; + + free_buffers: + for (i-- ; i >= 0; i--) { + struct trace_array_cpu *data = global_trace.data[i]; + + if (data && data->trace) { + free_pages((unsigned long)data->trace, order); + data->trace = NULL; + } + +#ifdef CONFIG_TRACER_MAX_TRACE + data = max_tr.data[i]; + if (data && data->trace) { + free_pages((unsigned long)data->trace, order); + data->trace = NULL; + } +#endif + } + return -ENOMEM; +} + +device_initcall(tracer_alloc_buffers); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h new file mode 100644 index 00000000000..3173a93561d --- /dev/null +++ b/kernel/trace/trace.h @@ -0,0 +1,184 @@ +#ifndef _LINUX_KERNEL_TRACE_H +#define _LINUX_KERNEL_TRACE_H + +#include +#include +#include +#include + +/* + * Function trace entry - function address and parent function addres: + */ +struct ftrace_entry { + unsigned long ip; + unsigned long parent_ip; +}; + +/* + * Context switch trace entry - which task (and prio) we switched from/to: + */ +struct ctx_switch_entry { + unsigned int prev_pid; + unsigned char prev_prio; + unsigned char prev_state; + unsigned int next_pid; + unsigned char next_prio; +}; + +/* + * The trace entry - the most basic unit of tracing. This is what + * is printed in the end as a single line in the trace output, such as: + * + * bash-15816 [01] 235.197585: idle_cpu <- irq_enter + */ +struct trace_entry { + char type; + char cpu; + char flags; + char preempt_count; + int pid; + cycle_t t; + unsigned long idx; + union { + struct ftrace_entry fn; + struct ctx_switch_entry ctx; + }; +}; + +#define TRACE_ENTRY_SIZE sizeof(struct trace_entry) + +/* + * The CPU trace array - it consists of thousands of trace entries + * plus some other descriptor data: (for example which task started + * the trace, etc.) + */ +struct trace_array_cpu { + void *trace; + unsigned long trace_idx; + atomic_t disabled; + atomic_t underrun; + unsigned long saved_latency; + unsigned long critical_start; + unsigned long critical_end; + unsigned long critical_sequence; + unsigned long nice; + unsigned long policy; + unsigned long rt_priority; + cycle_t preempt_timestamp; + pid_t pid; + uid_t uid; + char comm[TASK_COMM_LEN]; +}; + +struct trace_iterator; + +/* + * The trace array - an array of per-CPU trace arrays. This is the + * highest level data structure that individual tracers deal with. + * They have on/off state as well: + */ +struct trace_array { + unsigned long entries; + long ctrl; + int cpu; + cycle_t time_start; + struct trace_array_cpu *data[NR_CPUS]; +}; + +/* + * A specific tracer, represented by methods that operate on a trace array: + */ +struct tracer { + const char *name; + void (*init)(struct trace_array *tr); + void (*reset)(struct trace_array *tr); + void (*open)(struct trace_iterator *iter); + void (*close)(struct trace_iterator *iter); + void (*start)(struct trace_iterator *iter); + void (*stop)(struct trace_iterator *iter); + void (*ctrl_update)(struct trace_array *tr); + struct tracer *next; + int print_max; +}; + +/* + * Trace iterator - used by printout routines who present trace + * results to users and which routines might sleep, etc: + */ +struct trace_iterator { + struct trace_array *tr; + struct tracer *trace; + struct trace_entry *ent; + unsigned long iter_flags; + loff_t pos; + unsigned long next_idx[NR_CPUS]; + int cpu; + int idx; +}; + +void notrace tracing_reset(struct trace_array_cpu *data); +int tracing_open_generic(struct inode *inode, struct file *filp); +struct dentry *tracing_init_dentry(void); +void ftrace(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags); +void tracing_sched_switch_trace(struct trace_array *tr, + struct trace_array_cpu *data, + struct task_struct *prev, + struct task_struct *next, + unsigned long flags); +void tracing_record_cmdline(struct task_struct *tsk); + +void tracing_start_function_trace(void); +void tracing_stop_function_trace(void); +int register_tracer(struct tracer *type); +void unregister_tracer(struct tracer *type); + +extern unsigned long nsecs_to_usecs(unsigned long nsecs); + +extern unsigned long tracing_max_latency; +extern unsigned long tracing_thresh; + +void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); +void update_max_tr_single(struct trace_array *tr, + struct task_struct *tsk, int cpu); + +static inline notrace cycle_t now(int cpu) +{ + return cpu_clock(cpu); +} + +#ifdef CONFIG_SCHED_TRACER +extern void notrace +wakeup_sched_switch(struct task_struct *prev, struct task_struct *next); +#else +static inline void +wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) +{ +} +#endif + +#ifdef CONFIG_CONTEXT_SWITCH_TRACER +typedef void +(*tracer_switch_func_t)(void *private, + struct task_struct *prev, + struct task_struct *next); + +struct tracer_switch_ops { + tracer_switch_func_t func; + void *private; + struct tracer_switch_ops *next; +}; + +extern int register_tracer_switch(struct tracer_switch_ops *ops); +extern int unregister_tracer_switch(struct tracer_switch_ops *ops); + +#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ + +#ifdef CONFIG_DYNAMIC_FTRACE +extern unsigned long ftrace_update_tot_cnt; +#endif + +#endif /* _LINUX_KERNEL_TRACE_H */ -- cgit v1.2.3-70-g09d2 From 1d09daa55d2e9bab7e7d30f0d05e5a7bc60b2a4a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:55 +0200 Subject: ftrace: use Makefile to remove tracing from lockdep This patch removes the "notrace" annotation from lockdep and adds the debugging files in the kernel director to those that should not be compiled with "-pg" mcount tracing. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/Makefile | 8 ++++++++ kernel/lockdep.c | 26 +++++++++++++------------- 2 files changed, 21 insertions(+), 13 deletions(-) (limited to 'kernel/Makefile') diff --git a/kernel/Makefile b/kernel/Makefile index 7e344e7b0cb..d2f80ea4cd9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,6 +11,14 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o +ifdef CONFIG_FTRACE +# Do not profile debug utilities +ORIG_CFLAGS := $(KBUILD_CFLAGS) +KBUILD_CFLAGS = $(if $(filter-out lockdep% %debug,$(basename $(notdir $@))), \ + $(ORIG_CFLAGS), \ + $(subst -pg,,$(ORIG_CFLAGS))) +endif + obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index ac46847ba0c..90a440cbd6d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -271,14 +271,14 @@ static struct list_head chainhash_table[CHAINHASH_SIZE]; ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ (key2)) -notrace void lockdep_off(void) +void lockdep_off(void) { current->lockdep_recursion++; } EXPORT_SYMBOL(lockdep_off); -notrace void lockdep_on(void) +void lockdep_on(void) { current->lockdep_recursion--; } @@ -1041,7 +1041,7 @@ find_usage_forwards(struct lock_class *source, unsigned int depth) * Return 1 otherwise and keep unchanged. * Return 0 on error. */ -static noinline notrace int +static noinline int find_usage_backwards(struct lock_class *source, unsigned int depth) { struct lock_list *entry; @@ -1591,7 +1591,7 @@ static inline int validate_chain(struct task_struct *curr, * We are building curr_chain_key incrementally, so double-check * it from scratch, to make sure that it's done correctly: */ -static notrace void check_chain_key(struct task_struct *curr) +static void check_chain_key(struct task_struct *curr) { #ifdef CONFIG_DEBUG_LOCKDEP struct held_lock *hlock, *prev_hlock = NULL; @@ -1967,7 +1967,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, /* * Mark all held locks with a usage bit: */ -static notrace int +static int mark_held_locks(struct task_struct *curr, int hardirq) { enum lock_usage_bit usage_bit; @@ -2014,7 +2014,7 @@ void early_boot_irqs_on(void) /* * Hardirqs will be enabled: */ -void notrace trace_hardirqs_on_caller(unsigned long a0) +void trace_hardirqs_on_caller(unsigned long a0) { struct task_struct *curr = current; unsigned long ip; @@ -2060,7 +2060,7 @@ void notrace trace_hardirqs_on_caller(unsigned long a0) } EXPORT_SYMBOL(trace_hardirqs_on_caller); -void notrace trace_hardirqs_on(void) +void trace_hardirqs_on(void) { trace_hardirqs_on_caller(CALLER_ADDR0); } @@ -2069,7 +2069,7 @@ EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -void notrace trace_hardirqs_off_caller(unsigned long a0) +void trace_hardirqs_off_caller(unsigned long a0) { struct task_struct *curr = current; @@ -2094,7 +2094,7 @@ void notrace trace_hardirqs_off_caller(unsigned long a0) } EXPORT_SYMBOL(trace_hardirqs_off_caller); -void notrace trace_hardirqs_off(void) +void trace_hardirqs_off(void) { trace_hardirqs_off_caller(CALLER_ADDR0); } @@ -2260,7 +2260,7 @@ static inline int separate_irq_context(struct task_struct *curr, /* * Mark a lock with a usage bit, and validate the state transition: */ -static notrace int mark_lock(struct task_struct *curr, struct held_lock *this, +static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit) { unsigned int new_mask = 1 << new_bit, ret = 1; @@ -2663,7 +2663,7 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) /* * Check whether we follow the irq-flags state precisely: */ -static notrace void check_flags(unsigned long flags) +static void check_flags(unsigned long flags) { #if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS) if (!debug_locks) @@ -2700,7 +2700,7 @@ static notrace void check_flags(unsigned long flags) * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: */ -notrace void lock_acquire(struct lockdep_map *lock, unsigned int subclass, +void lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, unsigned long ip) { unsigned long flags; @@ -2723,7 +2723,7 @@ notrace void lock_acquire(struct lockdep_map *lock, unsigned int subclass, EXPORT_SYMBOL_GPL(lock_acquire); -notrace void lock_release(struct lockdep_map *lock, int nested, +void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { unsigned long flags; -- cgit v1.2.3-70-g09d2 From 6ec562328fda585be2d7f472cfac99d3b44d362a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 14 May 2008 21:30:30 -0400 Subject: ftrace: use the new kbuild CFLAGS_REMOVE for kernel directory This patch removes the Makefile turd and uses the nice CFLAGS_REMOVE macro in the kernel directory. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/Makefile | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel/Makefile') diff --git a/kernel/Makefile b/kernel/Makefile index d2f80ea4cd9..ca2433e8487 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,12 +11,16 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o +CFLAGS_REMOVE_sched.o = -pg -mno-spe + ifdef CONFIG_FTRACE -# Do not profile debug utilities -ORIG_CFLAGS := $(KBUILD_CFLAGS) -KBUILD_CFLAGS = $(if $(filter-out lockdep% %debug,$(basename $(notdir $@))), \ - $(ORIG_CFLAGS), \ - $(subst -pg,,$(ORIG_CFLAGS))) +# Do not trace debug files and internal ftrace files +CFLAGS_REMOVE_lockdep.o = -pg +CFLAGS_REMOVE_lockdep_proc.o = -pg +CFLAGS_REMOVE_mutex-debug.o = -pg +CFLAGS_REMOVE_rtmutex-debug.o = -pg +CFLAGS_REMOVE_cgroup-debug.o = -pg +CFLAGS_REMOVE_sched_clock.o = -pg endif obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o -- cgit v1.2.3-70-g09d2 From 6e0534f278199f1e3dd1049b9bc19a7a5b87ada1 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 12 May 2008 21:21:01 +0200 Subject: sched: use a 2-d bitmap for searching lowest-pri CPU The current code use a linear algorithm which causes scaling issues on larger SMP machines. This patch replaces that algorithm with a 2-dimensional bitmap to reduce latencies in the wake-up path. Signed-off-by: Gregory Haskins Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/Makefile | 1 + kernel/sched.c | 7 ++ kernel/sched_cpupri.c | 174 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched_cpupri.h | 36 +++++++++++ kernel/sched_rt.c | 98 ++++++---------------------- 5 files changed, 239 insertions(+), 77 deletions(-) create mode 100644 kernel/sched_cpupri.c create mode 100644 kernel/sched_cpupri.h (limited to 'kernel/Makefile') diff --git a/kernel/Makefile b/kernel/Makefile index 1c9938addb9..ecdd2d33563 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_SMP) += sched_cpupri.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/sched.c b/kernel/sched.c index aa960b84b88..8a1257b6556 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -74,6 +74,8 @@ #include #include +#include "sched_cpupri.h" + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -450,6 +452,9 @@ struct root_domain { */ cpumask_t rto_mask; atomic_t rto_count; +#ifdef CONFIG_SMP + struct cpupri cpupri; +#endif }; /* @@ -6392,6 +6397,8 @@ static void init_rootdomain(struct root_domain *rd) cpus_clear(rd->span); cpus_clear(rd->online); + + cpupri_init(&rd->cpupri); } static void init_defrootdomain(void) diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c new file mode 100644 index 00000000000..52154fefab7 --- /dev/null +++ b/kernel/sched_cpupri.c @@ -0,0 +1,174 @@ +/* + * kernel/sched_cpupri.c + * + * CPU priority management + * + * Copyright (C) 2007-2008 Novell + * + * Author: Gregory Haskins + * + * This code tracks the priority of each CPU so that global migration + * decisions are easy to calculate. Each CPU can be in a state as follows: + * + * (INVALID), IDLE, NORMAL, RT1, ... RT99 + * + * going from the lowest priority to the highest. CPUs in the INVALID state + * are not eligible for routing. The system maintains this state with + * a 2 dimensional bitmap (the first for priority class, the second for cpus + * in that class). Therefore a typical application without affinity + * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit + * searches). For tasks with affinity restrictions, the algorithm has a + * worst case complexity of O(min(102, nr_domcpus)), though the scenario that + * yields the worst case search is fairly contrived. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include "sched_cpupri.h" + +/* Convert between a 140 based task->prio, and our 102 based cpupri */ +static int convert_prio(int prio) +{ + int cpupri; + + if (prio == CPUPRI_INVALID) + cpupri = CPUPRI_INVALID; + else if (prio == MAX_PRIO) + cpupri = CPUPRI_IDLE; + else if (prio >= MAX_RT_PRIO) + cpupri = CPUPRI_NORMAL; + else + cpupri = MAX_RT_PRIO - prio + 1; + + return cpupri; +} + +#define for_each_cpupri_active(array, idx) \ + for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ + idx < CPUPRI_NR_PRIORITIES; \ + idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1)) + +/** + * cpupri_find - find the best (lowest-pri) CPU in the system + * @cp: The cpupri context + * @p: The task + * @lowest_mask: A mask to fill in with selected CPUs + * + * Note: This function returns the recommended CPUs as calculated during the + * current invokation. By the time the call returns, the CPUs may have in + * fact changed priorities any number of times. While not ideal, it is not + * an issue of correctness since the normal rebalancer logic will correct + * any discrepancies created by racing against the uncertainty of the current + * priority configuration. + * + * Returns: (int)bool - CPUs were found + */ +int cpupri_find(struct cpupri *cp, struct task_struct *p, + cpumask_t *lowest_mask) +{ + int idx = 0; + int task_pri = convert_prio(p->prio); + + for_each_cpupri_active(cp->pri_active, idx) { + struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; + cpumask_t mask; + + if (idx >= task_pri) + break; + + cpus_and(mask, p->cpus_allowed, vec->mask); + + if (cpus_empty(mask)) + continue; + + *lowest_mask = mask; + return 1; + } + + return 0; +} + +/** + * cpupri_set - update the cpu priority setting + * @cp: The cpupri context + * @cpu: The target cpu + * @pri: The priority (INVALID-RT99) to assign to this CPU + * + * Note: Assumes cpu_rq(cpu)->lock is locked + * + * Returns: (void) + */ +void cpupri_set(struct cpupri *cp, int cpu, int newpri) +{ + int *currpri = &cp->cpu_to_pri[cpu]; + int oldpri = *currpri; + unsigned long flags; + + newpri = convert_prio(newpri); + + BUG_ON(newpri >= CPUPRI_NR_PRIORITIES); + + if (newpri == oldpri) + return; + + /* + * If the cpu was currently mapped to a different value, we + * first need to unmap the old value + */ + if (likely(oldpri != CPUPRI_INVALID)) { + struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; + + spin_lock_irqsave(&vec->lock, flags); + + vec->count--; + if (!vec->count) + clear_bit(oldpri, cp->pri_active); + cpu_clear(cpu, vec->mask); + + spin_unlock_irqrestore(&vec->lock, flags); + } + + if (likely(newpri != CPUPRI_INVALID)) { + struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; + + spin_lock_irqsave(&vec->lock, flags); + + cpu_set(cpu, vec->mask); + vec->count++; + if (vec->count == 1) + set_bit(newpri, cp->pri_active); + + spin_unlock_irqrestore(&vec->lock, flags); + } + + *currpri = newpri; +} + +/** + * cpupri_init - initialize the cpupri structure + * @cp: The cpupri context + * + * Returns: (void) + */ +void cpupri_init(struct cpupri *cp) +{ + int i; + + memset(cp, 0, sizeof(*cp)); + + for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { + struct cpupri_vec *vec = &cp->pri_to_cpu[i]; + + spin_lock_init(&vec->lock); + vec->count = 0; + cpus_clear(vec->mask); + } + + for_each_possible_cpu(i) + cp->cpu_to_pri[i] = CPUPRI_INVALID; +} + + diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h new file mode 100644 index 00000000000..0b6a3d110fa --- /dev/null +++ b/kernel/sched_cpupri.h @@ -0,0 +1,36 @@ +#ifndef _LINUX_CPUPRI_H +#define _LINUX_CPUPRI_H + +#include + +#define CPUPRI_NR_PRIORITIES 2+MAX_RT_PRIO +#define CPUPRI_NR_PRI_WORDS CPUPRI_NR_PRIORITIES/BITS_PER_LONG + +#define CPUPRI_INVALID -1 +#define CPUPRI_IDLE 0 +#define CPUPRI_NORMAL 1 +/* values 2-101 are RT priorities 0-99 */ + +struct cpupri_vec { + spinlock_t lock; + int count; + cpumask_t mask; +}; + +struct cpupri { + struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; + long pri_active[CPUPRI_NR_PRI_WORDS]; + int cpu_to_pri[NR_CPUS]; +}; + +#ifdef CONFIG_SMP +int cpupri_find(struct cpupri *cp, + struct task_struct *p, cpumask_t *lowest_mask); +void cpupri_set(struct cpupri *cp, int cpu, int pri); +void cpupri_init(struct cpupri *cp); +#else +#define cpupri_set(cp, cpu, pri) do { } while (0) +#define cpupri_init() do { } while (0) +#endif + +#endif /* _LINUX_CPUPRI_H */ diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index fefed39fafd..44b06d75416 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -391,8 +391,11 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) WARN_ON(!rt_prio(rt_se_prio(rt_se))); rt_rq->rt_nr_running++; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - if (rt_se_prio(rt_se) < rt_rq->highest_prio) + if (rt_se_prio(rt_se) < rt_rq->highest_prio) { + struct rq *rq = rq_of_rt_rq(rt_rq); rt_rq->highest_prio = rt_se_prio(rt_se); + cpupri_set(&rq->rd->cpupri, rq->cpu, rt_se_prio(rt_se)); + } #endif #ifdef CONFIG_SMP if (rt_se->nr_cpus_allowed > 1) { @@ -416,6 +419,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static inline void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { +#ifdef CONFIG_SMP + int highest_prio = rt_rq->highest_prio; +#endif + WARN_ON(!rt_prio(rt_se_prio(rt_se))); WARN_ON(!rt_rq->rt_nr_running); rt_rq->rt_nr_running--; @@ -439,6 +446,11 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rq->rt.rt_nr_migratory--; } + if (rt_rq->highest_prio != highest_prio) { + struct rq *rq = rq_of_rt_rq(rt_rq); + cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio); + } + update_rt_migration(rq_of_rt_rq(rt_rq)); #endif /* CONFIG_SMP */ #ifdef CONFIG_RT_GROUP_SCHED @@ -763,73 +775,6 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); -static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask) -{ - int lowest_prio = -1; - int lowest_cpu = -1; - int count = 0; - int cpu; - - cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed); - - /* - * Scan each rq for the lowest prio. - */ - for_each_cpu_mask(cpu, *lowest_mask) { - struct rq *rq = cpu_rq(cpu); - - /* We look for lowest RT prio or non-rt CPU */ - if (rq->rt.highest_prio >= MAX_RT_PRIO) { - /* - * if we already found a low RT queue - * and now we found this non-rt queue - * clear the mask and set our bit. - * Otherwise just return the queue as is - * and the count==1 will cause the algorithm - * to use the first bit found. - */ - if (lowest_cpu != -1) { - cpus_clear(*lowest_mask); - cpu_set(rq->cpu, *lowest_mask); - } - return 1; - } - - /* no locking for now */ - if ((rq->rt.highest_prio > task->prio) - && (rq->rt.highest_prio >= lowest_prio)) { - if (rq->rt.highest_prio > lowest_prio) { - /* new low - clear old data */ - lowest_prio = rq->rt.highest_prio; - lowest_cpu = cpu; - count = 0; - } - count++; - } else - cpu_clear(cpu, *lowest_mask); - } - - /* - * Clear out all the set bits that represent - * runqueues that were of higher prio than - * the lowest_prio. - */ - if (lowest_cpu > 0) { - /* - * Perhaps we could add another cpumask op to - * zero out bits. Like cpu_zero_bits(cpumask, nrbits); - * Then that could be optimized to use memset and such. - */ - for_each_cpu_mask(cpu, *lowest_mask) { - if (cpu >= lowest_cpu) - break; - cpu_clear(cpu, *lowest_mask); - } - } - - return count; -} - static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) { int first; @@ -851,17 +796,12 @@ static int find_lowest_rq(struct task_struct *task) cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); - int count = find_lowest_cpus(task, lowest_mask); - if (!count) - return -1; /* No targets found */ + if (task->rt.nr_cpus_allowed == 1) + return -1; /* No other targets possible */ - /* - * There is no sense in performing an optimal search if only one - * target is found. - */ - if (count == 1) - return first_cpu(*lowest_mask); + if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) + return -1; /* No targets found */ /* * At this point we have built a mask of cpus representing the @@ -1218,6 +1158,8 @@ static void join_domain_rt(struct rq *rq) { if (rq->rt.overloaded) rt_set_overload(rq); + + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio); } /* Assumes rq->lock is held */ @@ -1225,6 +1167,8 @@ static void leave_domain_rt(struct rq *rq) { if (rq->rt.overloaded) rt_clear_overload(rq); + + cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); } /* -- cgit v1.2.3-70-g09d2 From 68f4f1ec08e3d95730a2693b99df8260aa0d06ae Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Thu, 29 May 2008 11:17:02 -0700 Subject: sched: Move cpu masks from kernel/sched.c into kernel/cpu.c kernel/cpu.c seems a more logical place for those maps since they do not really have much to do with the scheduler these days. kernel/cpu.c is now built for the UP kernel too, but it does not affect the size the kernel sections. $ size vmlinux before text data bss dec hex filename 3313797 307060 310352 3931209 3bfc49 vmlinux after text data bss dec hex filename 3313797 307060 310352 3931209 3bfc49 vmlinux Signed-off-by: Max Krasnyansky Cc: pj@sgi.com Cc: menage@google.com Cc: rostedt@goodmis.org Cc: mingo@elte.hu Acked-by: Peter Zijlstra Signed-off-by: Thomas Gleixner --- kernel/Makefile | 4 ++-- kernel/cpu.c | 24 ++++++++++++++++++++++++ kernel/sched.c | 18 ------------------ 3 files changed, 26 insertions(+), 20 deletions(-) (limited to 'kernel/Makefile') diff --git a/kernel/Makefile b/kernel/Makefile index ecdd2d33563..6c55301112e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -3,7 +3,7 @@ # obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ - exit.o itimer.o time.o softirq.o resource.o \ + cpu.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ @@ -27,7 +27,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o -obj-$(CONFIG_SMP) += cpu.o spinlock.o +obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o diff --git a/kernel/cpu.c b/kernel/cpu.c index c77bc3a1c72..b11f06dc149 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -15,6 +15,28 @@ #include #include +/* + * Represents all cpu's present in the system + * In systems capable of hotplug, this map could dynamically grow + * as new cpu's are detected in the system via any platform specific + * method, such as ACPI for e.g. + */ +cpumask_t cpu_present_map __read_mostly; +EXPORT_SYMBOL(cpu_present_map); + +#ifndef CONFIG_SMP + +/* + * Represents all cpu's that are currently online. + */ +cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; +EXPORT_SYMBOL(cpu_online_map); + +cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; +EXPORT_SYMBOL(cpu_possible_map); + +#else /* CONFIG_SMP */ + /* Serializes the updates to cpu_online_map, cpu_present_map */ static DEFINE_MUTEX(cpu_add_remove_lock); @@ -403,3 +425,5 @@ out: cpu_maps_update_done(); } #endif /* CONFIG_PM_SLEEP_SMP */ + +#endif /* CONFIG_SMP */ diff --git a/kernel/sched.c b/kernel/sched.c index 1ddb0a8c797..f36f549e574 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5080,24 +5080,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, return sched_setaffinity(pid, &new_mask); } -/* - * Represents all cpu's present in the system - * In systems capable of hotplug, this map could dynamically grow - * as new cpu's are detected in the system via any platform specific - * method, such as ACPI for e.g. - */ - -cpumask_t cpu_present_map __read_mostly; -EXPORT_SYMBOL(cpu_present_map); - -#ifndef CONFIG_SMP -cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; -EXPORT_SYMBOL(cpu_online_map); - -cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; -EXPORT_SYMBOL(cpu_possible_map); -#endif - long sched_getaffinity(pid_t pid, cpumask_t *mask) { struct task_struct *p; -- cgit v1.2.3-70-g09d2 From 3d4422332711ef48ef0f132f1fcbfcbd56c7f3d1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Jun 2008 11:21:34 +0200 Subject: Add generic helpers for arch IPI function calls This adds kernel/smp.c which contains helpers for IPI function calls. In addition to supporting the existing smp_call_function() in a more efficient manner, it also adds a more scalable variant called smp_call_function_single() for calling a given function on a single CPU only. The core of this is based on the x86-64 patch from Nick Piggin, lots of changes since then. "Alan D. Brunelle" has contributed lots of fixes and suggestions as well. Also thanks to Paul E. McKenney for reviewing RCU usage and getting rid of the data allocation fallback deadlock. Acked-by: Ingo Molnar Reviewed-by: Paul E. McKenney Signed-off-by: Jens Axboe --- arch/Kconfig | 3 + arch/sparc64/kernel/smp.c | 11 +- include/linux/smp.h | 35 ++++- init/main.c | 2 + kernel/Makefile | 1 + kernel/smp.c | 383 ++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 428 insertions(+), 7 deletions(-) create mode 100644 kernel/smp.c (limited to 'kernel/Makefile') diff --git a/arch/Kconfig b/arch/Kconfig index 3ea332b009e..ad89a33d8c6 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -39,3 +39,6 @@ config HAVE_KRETPROBES config HAVE_DMA_ATTRS def_bool n + +config USE_GENERIC_SMP_HELPERS + def_bool n diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index fa63c68a181..b82d017a174 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -816,8 +816,9 @@ extern unsigned long xcall_call_function; * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. */ -static int smp_call_function_mask(void (*func)(void *info), void *info, - int nonatomic, int wait, cpumask_t mask) +static int sparc64_smp_call_function_mask(void (*func)(void *info), void *info, + int nonatomic, int wait, + cpumask_t mask) { struct call_data_struct data; int cpus; @@ -855,8 +856,8 @@ out_unlock: int smp_call_function(void (*func)(void *info), void *info, int nonatomic, int wait) { - return smp_call_function_mask(func, info, nonatomic, wait, - cpu_online_map); + return sparc64_smp_call_function_mask(func, info, nonatomic, wait, + cpu_online_map); } void smp_call_function_client(int irq, struct pt_regs *regs) @@ -893,7 +894,7 @@ static void tsb_sync(void *info) void smp_tsb_sync(struct mm_struct *mm) { - smp_call_function_mask(tsb_sync, mm, 0, 1, mm->cpu_vm_mask); + sparc64_smp_call_function_mask(tsb_sync, mm, 0, 1, mm->cpu_vm_mask); } extern unsigned long xcall_flush_tlb_mm; diff --git a/include/linux/smp.h b/include/linux/smp.h index 55232ccf9cf..eac3e062250 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -7,9 +7,19 @@ */ #include +#include +#include +#include extern void cpu_idle(void); +struct call_single_data { + struct list_head list; + void (*func) (void *info); + void *info; + unsigned int flags; +}; + #ifdef CONFIG_SMP #include @@ -53,9 +63,28 @@ extern void smp_cpus_done(unsigned int max_cpus); * Call a function on all other processors */ int smp_call_function(void(*func)(void *info), void *info, int retry, int wait); - +int smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info, + int wait); int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, int retry, int wait); +void __smp_call_function_single(int cpuid, struct call_single_data *data); + +/* + * Generic and arch helpers + */ +#ifdef CONFIG_USE_GENERIC_SMP_HELPERS +void generic_smp_call_function_single_interrupt(void); +void generic_smp_call_function_interrupt(void); +void init_call_single_data(void); +void ipi_call_lock(void); +void ipi_call_unlock(void); +void ipi_call_lock_irq(void); +void ipi_call_unlock_irq(void); +#else +static inline void init_call_single_data(void) +{ +} +#endif /* * Call a function on all processors @@ -112,7 +141,9 @@ static inline void smp_send_reschedule(int cpu) { } }) #define smp_call_function_mask(mask, func, info, wait) \ (up_smp_call_function(func, info)) - +static inline void init_call_single_data(void) +{ +} #endif /* !SMP */ /* diff --git a/init/main.c b/init/main.c index f7fb20021d4..1efcccff1bd 100644 --- a/init/main.c +++ b/init/main.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -779,6 +780,7 @@ static void __init do_pre_smp_initcalls(void) { extern int spawn_ksoftirqd(void); + init_call_single_data(); migration_init(); spawn_ksoftirqd(); if (!nosoftlockup) diff --git a/kernel/Makefile b/kernel/Makefile index 1c9938addb9..9fa57976f25 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o +obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o diff --git a/kernel/smp.c b/kernel/smp.c new file mode 100644 index 00000000000..f77b75c027a --- /dev/null +++ b/kernel/smp.c @@ -0,0 +1,383 @@ +/* + * Generic helpers for smp ipi calls + * + * (C) Jens Axboe 2008 + * + */ +#include +#include +#include +#include +#include + +static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); +static LIST_HEAD(call_function_queue); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock); + +enum { + CSD_FLAG_WAIT = 0x01, + CSD_FLAG_ALLOC = 0x02, +}; + +struct call_function_data { + struct call_single_data csd; + spinlock_t lock; + unsigned int refs; + cpumask_t cpumask; + struct rcu_head rcu_head; +}; + +struct call_single_queue { + struct list_head list; + spinlock_t lock; +}; + +void __cpuinit init_call_single_data(void) +{ + int i; + + for_each_possible_cpu(i) { + struct call_single_queue *q = &per_cpu(call_single_queue, i); + + spin_lock_init(&q->lock); + INIT_LIST_HEAD(&q->list); + } +} + +static void csd_flag_wait(struct call_single_data *data) +{ + /* Wait for response */ + do { + /* + * We need to see the flags store in the IPI handler + */ + smp_mb(); + if (!(data->flags & CSD_FLAG_WAIT)) + break; + cpu_relax(); + } while (1); +} + +/* + * Insert a previously allocated call_single_data element for execution + * on the given CPU. data must already have ->func, ->info, and ->flags set. + */ +static void generic_exec_single(int cpu, struct call_single_data *data) +{ + struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); + int wait = data->flags & CSD_FLAG_WAIT, ipi; + unsigned long flags; + + spin_lock_irqsave(&dst->lock, flags); + ipi = list_empty(&dst->list); + list_add_tail(&data->list, &dst->list); + spin_unlock_irqrestore(&dst->lock, flags); + + if (ipi) + arch_send_call_function_single_ipi(cpu); + + if (wait) + csd_flag_wait(data); +} + +static void rcu_free_call_data(struct rcu_head *head) +{ + struct call_function_data *data; + + data = container_of(head, struct call_function_data, rcu_head); + + kfree(data); +} + +/* + * Invoked by arch to handle an IPI for call function. Must be called with + * interrupts disabled. + */ +void generic_smp_call_function_interrupt(void) +{ + struct call_function_data *data; + int cpu = get_cpu(); + + /* + * It's ok to use list_for_each_rcu() here even though we may delete + * 'pos', since list_del_rcu() doesn't clear ->next + */ + rcu_read_lock(); + list_for_each_entry_rcu(data, &call_function_queue, csd.list) { + int refs; + + if (!cpu_isset(cpu, data->cpumask)) + continue; + + data->csd.func(data->csd.info); + + spin_lock(&data->lock); + cpu_clear(cpu, data->cpumask); + WARN_ON(data->refs == 0); + data->refs--; + refs = data->refs; + spin_unlock(&data->lock); + + if (refs) + continue; + + spin_lock(&call_function_lock); + list_del_rcu(&data->csd.list); + spin_unlock(&call_function_lock); + + if (data->csd.flags & CSD_FLAG_WAIT) { + /* + * serialize stores to data with the flag clear + * and wakeup + */ + smp_wmb(); + data->csd.flags &= ~CSD_FLAG_WAIT; + } else + call_rcu(&data->rcu_head, rcu_free_call_data); + } + rcu_read_unlock(); + + put_cpu(); +} + +/* + * Invoked by arch to handle an IPI for call function single. Must be called + * from the arch with interrupts disabled. + */ +void generic_smp_call_function_single_interrupt(void) +{ + struct call_single_queue *q = &__get_cpu_var(call_single_queue); + LIST_HEAD(list); + + /* + * Need to see other stores to list head for checking whether + * list is empty without holding q->lock + */ + smp_mb(); + while (!list_empty(&q->list)) { + unsigned int data_flags; + + spin_lock(&q->lock); + list_replace_init(&q->list, &list); + spin_unlock(&q->lock); + + while (!list_empty(&list)) { + struct call_single_data *data; + + data = list_entry(list.next, struct call_single_data, + list); + list_del(&data->list); + + /* + * 'data' can be invalid after this call if + * flags == 0 (when called through + * generic_exec_single(), so save them away before + * making the call. + */ + data_flags = data->flags; + + data->func(data->info); + + if (data_flags & CSD_FLAG_WAIT) { + smp_wmb(); + data->flags &= ~CSD_FLAG_WAIT; + } else if (data_flags & CSD_FLAG_ALLOC) + kfree(data); + } + /* + * See comment on outer loop + */ + smp_mb(); + } +} + +/* + * smp_call_function_single - Run a function on a specific CPU + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @retry: Unused + * @wait: If true, wait until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. Note that @wait + * will be implicitly turned on in case of allocation failures, since + * we fall back to on-stack allocation. + */ +int smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int retry, int wait) +{ + struct call_single_data d; + unsigned long flags; + /* prevent preemption and reschedule on another processor */ + int me = get_cpu(); + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + if (cpu == me) { + local_irq_save(flags); + func(info); + local_irq_restore(flags); + } else { + struct call_single_data *data = NULL; + + if (!wait) { + data = kmalloc(sizeof(*data), GFP_ATOMIC); + if (data) + data->flags = CSD_FLAG_ALLOC; + } + if (!data) { + data = &d; + data->flags = CSD_FLAG_WAIT; + } + + data->func = func; + data->info = info; + generic_exec_single(cpu, data); + } + + put_cpu(); + return 0; +} +EXPORT_SYMBOL(smp_call_function_single); + +/** + * __smp_call_function_single(): Run a function on another CPU + * @cpu: The CPU to run on. + * @data: Pre-allocated and setup data structure + * + * Like smp_call_function_single(), but allow caller to pass in a pre-allocated + * data structure. Useful for embedding @data inside other structures, for + * instance. + * + */ +void __smp_call_function_single(int cpu, struct call_single_data *data) +{ + /* Can deadlock when called with interrupts disabled */ + WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled()); + + generic_exec_single(cpu, data); +} + +/** + * smp_call_function_mask(): Run a function on a set of other CPUs. + * @mask: The set of cpus to run on. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned. Note that @wait + * will be implicitly turned on in case of allocation failures, since + * we fall back to on-stack allocation. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. Preemption + * must be disabled when calling this function. + */ +int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, + int wait) +{ + struct call_function_data d; + struct call_function_data *data = NULL; + cpumask_t allbutself; + unsigned long flags; + int cpu, num_cpus; + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + cpu = smp_processor_id(); + allbutself = cpu_online_map; + cpu_clear(cpu, allbutself); + cpus_and(mask, mask, allbutself); + num_cpus = cpus_weight(mask); + + /* + * If zero CPUs, return. If just a single CPU, turn this request + * into a targetted single call instead since it's faster. + */ + if (!num_cpus) + return 0; + else if (num_cpus == 1) { + cpu = first_cpu(mask); + return smp_call_function_single(cpu, func, info, 0, wait); + } + + if (!wait) { + data = kmalloc(sizeof(*data), GFP_ATOMIC); + if (data) + data->csd.flags = CSD_FLAG_ALLOC; + } + if (!data) { + data = &d; + data->csd.flags = CSD_FLAG_WAIT; + } + + spin_lock_init(&data->lock); + data->csd.func = func; + data->csd.info = info; + data->refs = num_cpus; + data->cpumask = mask; + + spin_lock_irqsave(&call_function_lock, flags); + list_add_tail_rcu(&data->csd.list, &call_function_queue); + spin_unlock_irqrestore(&call_function_lock, flags); + + /* Send a message to all CPUs in the map */ + arch_send_call_function_ipi(mask); + + /* optionally wait for the CPUs to complete */ + if (wait) + csd_flag_wait(&data->csd); + + return 0; +} +EXPORT_SYMBOL(smp_call_function_mask); + +/** + * smp_call_function(): Run a function on all other CPUs. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @natomic: Unused + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. In case of allocation + * failure, @wait will be implicitly turned on. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. + */ +int smp_call_function(void (*func)(void *), void *info, int natomic, int wait) +{ + int ret; + + preempt_disable(); + ret = smp_call_function_mask(cpu_online_map, func, info, wait); + preempt_enable(); + return ret; +} +EXPORT_SYMBOL(smp_call_function); + +void ipi_call_lock(void) +{ + spin_lock(&call_function_lock); +} + +void ipi_call_unlock(void) +{ + spin_unlock(&call_function_lock); +} + +void ipi_call_lock_irq(void) +{ + spin_lock_irq(&call_function_lock); +} + +void ipi_call_unlock_irq(void) +{ + spin_unlock_irq(&call_function_lock); +} -- cgit v1.2.3-70-g09d2 From 1e16c0a081f6c93f04c6af784d6a160955269f91 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Jul 2008 20:58:11 -0400 Subject: ftrace: trace schedule After the sched_clock code has been removed from sched.c we can now trace the scheduler. The scheduler has a lot of functions that would be worth tracing. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/Makefile') diff --git a/kernel/Makefile b/kernel/Makefile index ca2433e8487..480976275d9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,7 +11,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o -CFLAGS_REMOVE_sched.o = -pg -mno-spe +CFLAGS_REMOVE_sched.o = -mno-spe ifdef CONFIG_FTRACE # Do not trace debug files and internal ftrace files -- cgit v1.2.3-70-g09d2 From c349e0a01c3e0f70913db6a5bb61ab204e0602de Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 15 Apr 2008 22:39:31 +0200 Subject: ftrace: do not trace scheduler functions do not trace scheduler functions - it's still a bit fragile and can lock up with: http://redhat.com/~mingo/misc/config-Thu_Jul_17_13_34_52_CEST_2008 Signed-off-by: Ingo Molnar --- kernel/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/Makefile') diff --git a/kernel/Makefile b/kernel/Makefile index 0a7ed838984..985ddb7da4d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,8 +11,6 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o -CFLAGS_REMOVE_sched.o = -mno-spe - ifdef CONFIG_FTRACE # Do not trace debug files and internal ftrace files CFLAGS_REMOVE_lockdep.o = -pg @@ -21,6 +19,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg +CFLAGS_REMOVE_sched.o = -mno-spe -pg endif obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o -- cgit v1.2.3-70-g09d2