summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/cgroup.c8
-rw-r--r--kernel/exit.c9
-rw-r--r--kernel/fork.c2
-rw-r--r--kernel/futex.c20
-rw-r--r--kernel/hw_breakpoint.c423
-rw-r--r--kernel/irq/spurious.c2
-rw-r--r--kernel/kallsyms.c1
-rw-r--r--kernel/kprobes.c68
-rw-r--r--kernel/kthread.c23
-rw-r--r--kernel/lockdep.c2
-rw-r--r--kernel/notifier.c2
-rw-r--r--kernel/params.c17
-rw-r--r--kernel/perf_event.c695
-rw-r--r--kernel/power/hibernate.c11
-rw-r--r--kernel/power/suspend_test.c5
-rw-r--r--kernel/power/swap.c43
-rw-r--r--kernel/rcutree.c60
-rw-r--r--kernel/rcutree.h17
-rw-r--r--kernel/rcutree_plugin.h46
-rw-r--r--kernel/sched.c65
-rw-r--r--kernel/sched_fair.c74
-rw-r--r--kernel/signal.c27
-rw-r--r--kernel/sys.c25
-rw-r--r--kernel/sysctl_check.c2
-rw-r--r--kernel/trace/Kconfig38
-rw-r--r--kernel/trace/Makefile2
-rw-r--r--kernel/trace/ftrace.c378
-rw-r--r--kernel/trace/ring_buffer.c14
-rw-r--r--kernel/trace/trace.c12
-rw-r--r--kernel/trace/trace.h80
-rw-r--r--kernel/trace/trace_entries.h16
-rw-r--r--kernel/trace/trace_event_profile.c43
-rw-r--r--kernel/trace/trace_events.c167
-rw-r--r--kernel/trace/trace_events_filter.c423
-rw-r--r--kernel/trace/trace_export.c14
-rw-r--r--kernel/trace/trace_kprobe.c1523
-rw-r--r--kernel/trace/trace_ksym.c550
-rw-r--r--kernel/trace/trace_output.c5
-rw-r--r--kernel/trace/trace_selftest.c55
-rw-r--r--kernel/trace/trace_syscalls.c209
-rw-r--r--kernel/user.c2
-rw-r--r--kernel/workqueue.c21
43 files changed, 4338 insertions, 863 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index b8d4cd8ac0b..6b7ce8173df 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,6 +21,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
CFLAGS_REMOVE_rtmutex-debug.o = -pg
CFLAGS_REMOVE_cgroup-debug.o = -pg
CFLAGS_REMOVE_sched_clock.o = -pg
+CFLAGS_REMOVE_perf_event.o = -pg
endif
obj-$(CONFIG_FREEZER) += freezer.o
@@ -95,6 +96,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
obj-$(CONFIG_SLOW_WORK) += slow-work.o
obj-$(CONFIG_PERF_EVENTS) += perf_event.o
+obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ca83b73fba1..0249f4be9b5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1710,14 +1710,13 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
return -EFAULT;
buffer[nbytes] = 0; /* nul-terminate */
- strstrip(buffer);
if (cft->write_u64) {
- u64 val = simple_strtoull(buffer, &end, 0);
+ u64 val = simple_strtoull(strstrip(buffer), &end, 0);
if (*end)
return -EINVAL;
retval = cft->write_u64(cgrp, cft, val);
} else {
- s64 val = simple_strtoll(buffer, &end, 0);
+ s64 val = simple_strtoll(strstrip(buffer), &end, 0);
if (*end)
return -EINVAL;
retval = cft->write_s64(cgrp, cft, val);
@@ -1753,8 +1752,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
}
buffer[nbytes] = 0; /* nul-terminate */
- strstrip(buffer);
- retval = cft->write_string(cgrp, cft, buffer);
+ retval = cft->write_string(cgrp, cft, strstrip(buffer));
if (!retval)
retval = nbytes;
out:
diff --git a/kernel/exit.c b/kernel/exit.c
index e61891f8012..3f45e3cf931 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -49,6 +49,7 @@
#include <linux/init_task.h>
#include <linux/perf_event.h>
#include <trace/events/sched.h>
+#include <linux/hw_breakpoint.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -359,10 +360,8 @@ void __set_special_pids(struct pid *pid)
{
struct task_struct *curr = current->group_leader;
- if (task_session(curr) != pid) {
+ if (task_session(curr) != pid)
change_pid(curr, PIDTYPE_SID, pid);
- proc_sid_connector(curr);
- }
if (task_pgrp(curr) != pid)
change_pid(curr, PIDTYPE_PGID, pid);
@@ -980,6 +979,10 @@ NORET_TYPE void do_exit(long code)
proc_exit_connector(tsk);
/*
+ * FIXME: do that only when needed, using sched_exit tracepoint
+ */
+ flush_ptrace_hw_breakpoint(tsk);
+ /*
* Flush inherited counters to the parent - before the parent
* gets woken up by child-exit notifications.
*/
diff --git a/kernel/fork.c b/kernel/fork.c
index 4c20fff8c13..166b8c49257 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -91,7 +91,7 @@ int nr_processes(void)
int cpu;
int total = 0;
- for_each_online_cpu(cpu)
+ for_each_possible_cpu(cpu)
total += per_cpu(process_counts, cpu);
return total;
diff --git a/kernel/futex.c b/kernel/futex.c
index 4949d336d88..fb65e822fc4 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -150,7 +150,8 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
*/
static inline int match_futex(union futex_key *key1, union futex_key *key2)
{
- return (key1->both.word == key2->both.word
+ return (key1 && key2
+ && key1->both.word == key2->both.word
&& key1->both.ptr == key2->both.ptr
&& key1->both.offset == key2->both.offset);
}
@@ -1028,7 +1029,6 @@ static inline
void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
struct futex_hash_bucket *hb)
{
- drop_futex_key_refs(&q->key);
get_futex_key_refs(key);
q->key = *key;
@@ -1226,6 +1226,7 @@ retry_private:
*/
if (ret == 1) {
WARN_ON(pi_state);
+ drop_count++;
task_count++;
ret = get_futex_value_locked(&curval2, uaddr2);
if (!ret)
@@ -1304,6 +1305,7 @@ retry_private:
if (ret == 1) {
/* We got the lock. */
requeue_pi_wake_futex(this, &key2, hb2);
+ drop_count++;
continue;
} else if (ret) {
/* -EDEADLK */
@@ -1791,6 +1793,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
current->timer_slack_ns);
}
+retry:
/* Prepare to wait on uaddr. */
ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
if (ret)
@@ -1808,9 +1811,14 @@ static int futex_wait(u32 __user *uaddr, int fshared,
goto out_put_key;
/*
- * We expect signal_pending(current), but another thread may
- * have handled it for us already.
+ * We expect signal_pending(current), but we might be the
+ * victim of a spurious wakeup as well.
*/
+ if (!signal_pending(current)) {
+ put_futex_key(fshared, &q.key);
+ goto retry;
+ }
+
ret = -ERESTARTSYS;
if (!abs_time)
goto out_put_key;
@@ -2118,9 +2126,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
*/
plist_del(&q->list, &q->list.plist);
+ /* Handle spurious wakeups gracefully */
+ ret = -EWOULDBLOCK;
if (timeout && !timeout->task)
ret = -ETIMEDOUT;
- else
+ else if (signal_pending(current))
ret = -ERESTARTNOINTR;
}
return ret;
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
new file mode 100644
index 00000000000..cf5ee162841
--- /dev/null
+++ b/kernel/hw_breakpoint.c
@@ -0,0 +1,423 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) IBM Corporation, 2009
+ * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Thanks to Ingo Molnar for his many suggestions.
+ *
+ * Authors: Alan Stern <stern@rowland.harvard.edu>
+ * K.Prasad <prasad@linux.vnet.ibm.com>
+ * Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ * This file contains the arch-independent routines.
+ */
+
+#include <linux/irqflags.h>
+#include <linux/kallsyms.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <linux/hw_breakpoint.h>
+
+/*
+ * Constraints data
+ */
+
+/* Number of pinned cpu breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
+
+/* Number of pinned task breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]);
+
+/* Number of non-pinned cpu/task breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
+
+/* Gather the number of total pinned and un-pinned bp in a cpuset */
+struct bp_busy_slots {
+ unsigned int pinned;
+ unsigned int flexible;
+};
+
+/* Serialize accesses to the above constraints */
+static DEFINE_MUTEX(nr_bp_mutex);
+
+/*
+ * Report the maximum number of pinned breakpoints a task
+ * have in this cpu
+ */
+static unsigned int max_task_bp_pinned(int cpu)
+{
+ int i;
+ unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu);
+
+ for (i = HBP_NUM -1; i >= 0; i--) {
+ if (tsk_pinned[i] > 0)
+ return i + 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Report the number of pinned/un-pinned breakpoints we have in
+ * a given cpu (cpu > -1) or in all of them (cpu = -1).
+ */
+static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
+{
+ if (cpu >= 0) {
+ slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
+ slots->pinned += max_task_bp_pinned(cpu);
+ slots->flexible = per_cpu(nr_bp_flexible, cpu);
+
+ return;
+ }
+
+ for_each_online_cpu(cpu) {
+ unsigned int nr;
+
+ nr = per_cpu(nr_cpu_bp_pinned, cpu);
+ nr += max_task_bp_pinned(cpu);
+
+ if (nr > slots->pinned)
+ slots->pinned = nr;
+
+ nr = per_cpu(nr_bp_flexible, cpu);
+
+ if (nr > slots->flexible)
+ slots->flexible = nr;
+ }
+}
+
+/*
+ * Add a pinned breakpoint for the given task in our constraint table
+ */
+static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
+{
+ int count = 0;
+ struct perf_event *bp;
+ struct perf_event_context *ctx = tsk->perf_event_ctxp;
+ unsigned int *tsk_pinned;
+ struct list_head *list;
+ unsigned long flags;
+
+ if (WARN_ONCE(!ctx, "No perf context for this task"))
+ return;
+
+ list = &ctx->event_list;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+
+ /*
+ * The current breakpoint counter is not included in the list
+ * at the open() callback time
+ */
+ list_for_each_entry(bp, list, event_entry) {
+ if (bp->attr.type == PERF_TYPE_BREAKPOINT)
+ count++;
+ }
+
+ spin_unlock_irqrestore(&ctx->lock, flags);
+
+ if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list"))
+ return;
+
+ tsk_pinned = per_cpu(task_bp_pinned, cpu);
+ if (enable) {
+ tsk_pinned[count]++;
+ if (count > 0)
+ tsk_pinned[count-1]--;
+ } else {
+ tsk_pinned[count]--;
+ if (count > 0)
+ tsk_pinned[count-1]++;
+ }
+}
+
+/*
+ * Add/remove the given breakpoint in our constraint table
+ */
+static void toggle_bp_slot(struct perf_event *bp, bool enable)
+{
+ int cpu = bp->cpu;
+ struct task_struct *tsk = bp->ctx->task;
+
+ /* Pinned counter task profiling */
+ if (tsk) {
+ if (cpu >= 0) {
+ toggle_bp_task_slot(tsk, cpu, enable);
+ return;
+ }
+
+ for_each_online_cpu(cpu)
+ toggle_bp_task_slot(tsk, cpu, enable);
+ return;
+ }
+
+ /* Pinned counter cpu profiling */
+ if (enable)
+ per_cpu(nr_cpu_bp_pinned, bp->cpu)++;
+ else
+ per_cpu(nr_cpu_bp_pinned, bp->cpu)--;
+}
+
+/*
+ * Contraints to check before allowing this new breakpoint counter:
+ *
+ * == Non-pinned counter == (Considered as pinned for now)
+ *
+ * - If attached to a single cpu, check:
+ *
+ * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
+ * + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM
+ *
+ * -> If there are already non-pinned counters in this cpu, it means
+ * there is already a free slot for them.
+ * Otherwise, we check that the maximum number of per task
+ * breakpoints (for this cpu) plus the number of per cpu breakpoint
+ * (for this cpu) doesn't cover every registers.
+ *
+ * - If attached to every cpus, check:
+ *
+ * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
+ * + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM
+ *
+ * -> This is roughly the same, except we check the number of per cpu
+ * bp for every cpu and we keep the max one. Same for the per tasks
+ * breakpoints.
+ *
+ *
+ * == Pinned counter ==
+ *
+ * - If attached to a single cpu, check:
+ *
+ * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
+ * + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM
+ *
+ * -> Same checks as before. But now the nr_bp_flexible, if any, must keep
+ * one register at least (or they will never be fed).
+ *
+ * - If attached to every cpus, check:
+ *
+ * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
+ * + max(per_cpu(task_bp_pinned, *))) < HBP_NUM
+ */
+int reserve_bp_slot(struct perf_event *bp)
+{
+ struct bp_busy_slots slots = {0};
+ int ret = 0;
+
+ mutex_lock(&nr_bp_mutex);
+
+ fetch_bp_busy_slots(&slots, bp->cpu);
+
+ /* Flexible counters need to keep at least one slot */
+ if (slots.pinned + (!!slots.flexible) == HBP_NUM) {
+ ret = -ENOSPC;
+ goto end;
+ }
+
+ toggle_bp_slot(bp, true);
+
+end:
+ mutex_unlock(&nr_bp_mutex);
+
+ return ret;
+}
+
+void release_bp_slot(struct perf_event *bp)
+{
+ mutex_lock(&nr_bp_mutex);
+
+ toggle_bp_slot(bp, false);
+
+ mutex_unlock(&nr_bp_mutex);
+}
+
+
+int __register_perf_hw_breakpoint(struct perf_event *bp)
+{
+ int ret;
+
+ ret = reserve_bp_slot(bp);
+ if (ret)
+ return ret;
+
+ /*
+ * Ptrace breakpoints can be temporary perf events only
+ * meant to reserve a slot. In this case, it is created disabled and
+ * we don't want to check the params right now (as we put a null addr)
+ * But perf tools create events as disabled and we want to check
+ * the params for them.
+ * This is a quick hack that will be removed soon, once we remove
+ * the tmp breakpoints from ptrace
+ */
+ if (!bp->attr.disabled || bp->callback == perf_bp_event)
+ ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
+
+ return ret;
+}
+
+int register_perf_hw_breakpoint(struct perf_event *bp)
+{
+ bp->callback = perf_bp_event;
+
+ return __register_perf_hw_breakpoint(bp);
+}
+
+/**
+ * register_user_hw_breakpoint - register a hardware breakpoint for user space
+ * @attr: breakpoint attributes
+ * @triggered: callback to trigger when we hit the breakpoint
+ * @tsk: pointer to 'task_struct' of the process to which the address belongs
+ */
+struct perf_event *
+register_user_hw_breakpoint(struct perf_event_attr *attr,
+ perf_callback_t triggered,
+ struct task_struct *tsk)
+{
+ return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
+}
+EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
+
+/**
+ * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
+ * @bp: the breakpoint structure to modify
+ * @attr: new breakpoint attributes
+ * @triggered: callback to trigger when we hit the breakpoint
+ * @tsk: pointer to 'task_struct' of the process to which the address belongs
+ */
+struct perf_event *
+modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr,
+ perf_callback_t triggered,
+ struct task_struct *tsk)
+{
+ /*
+ * FIXME: do it without unregistering
+ * - We don't want to lose our slot
+ * - If the new bp is incorrect, don't lose the older one
+ */
+ unregister_hw_breakpoint(bp);
+
+ return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
+}
+EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
+
+/**
+ * unregister_hw_breakpoint - unregister a user-space hardware breakpoint
+ * @bp: the breakpoint structure to unregister
+ */
+void unregister_hw_breakpoint(struct perf_event *bp)
+{
+ if (!bp)
+ return;
+ perf_event_release_kernel(bp);
+}
+EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
+
+/**
+ * register_wide_hw_breakpoint - register a wide breakpoint in the kernel
+ * @attr: breakpoint attributes
+ * @triggered: callback to trigger when we hit the breakpoint
+ *
+ * @return a set of per_cpu pointers to perf events
+ */
+struct perf_event **
+register_wide_hw_breakpoint(struct perf_event_attr *attr,
+ perf_callback_t triggered)
+{
+ struct perf_event **cpu_events, **pevent, *bp;
+ long err;
+ int cpu;
+
+ cpu_events = alloc_percpu(typeof(*cpu_events));
+ if (!cpu_events)
+ return ERR_PTR(-ENOMEM);
+
+ for_each_possible_cpu(cpu) {
+ pevent = per_cpu_ptr(cpu_events, cpu);
+ bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
+
+ *pevent = bp;
+
+ if (IS_ERR(bp)) {
+ err = PTR_ERR(bp);
+ goto fail;
+ }
+ }
+
+ return cpu_events;
+
+fail:
+ for_each_possible_cpu(cpu) {
+ pevent = per_cpu_ptr(cpu_events, cpu);
+ if (IS_ERR(*pevent))
+ break;
+ unregister_hw_breakpoint(*pevent);
+ }
+ free_percpu(cpu_events);
+ /* return the error if any */
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
+
+/**
+ * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
+ * @cpu_events: the per cpu set of events to unregister
+ */
+void unregister_wide_hw_breakpoint(struct perf_event **cpu_events)
+{
+ int cpu;
+ struct perf_event **pevent;
+
+ for_each_possible_cpu(cpu) {
+ pevent = per_cpu_ptr(cpu_events, cpu);
+ unregister_hw_breakpoint(*pevent);
+ }
+ free_percpu(cpu_events);
+}
+EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
+
+static struct notifier_block hw_breakpoint_exceptions_nb = {
+ .notifier_call = hw_breakpoint_exceptions_notify,
+ /* we need to be notified first */
+ .priority = 0x7fffffff
+};
+
+static int __init init_hw_breakpoint(void)
+{
+ return register_die_notifier(&hw_breakpoint_exceptions_nb);
+}
+core_initcall(init_hw_breakpoint);
+
+
+struct pmu perf_ops_bp = {
+ .enable = arch_install_hw_breakpoint,
+ .disable = arch_uninstall_hw_breakpoint,
+ .read = hw_breakpoint_pmu_read,
+ .unthrottle = hw_breakpoint_pmu_unthrottle
+};
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 114e704760f..bd7273e6282 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -121,7 +121,9 @@ static void poll_all_shared_irqs(void)
if (!(status & IRQ_SPURIOUS_DISABLED))
continue;
+ local_irq_disable();
try_one_irq(i, desc);
+ local_irq_enable();
}
}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 8b6b8b697c6..8e5288a8a35 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -181,6 +181,7 @@ unsigned long kallsyms_lookup_name(const char *name)
}
return module_kallsyms_lookup_name(name);
}
+EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
unsigned long),
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 5240d75f4c6..84495958e70 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -90,6 +90,9 @@ static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
*/
static struct kprobe_blackpoint kprobe_blacklist[] = {
{"preempt_schedule",},
+ {"native_get_debugreg",},
+ {"irq_entries_start",},
+ {"common_interrupt",},
{NULL} /* Terminator */
};
@@ -673,6 +676,40 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
return (kprobe_opcode_t *)(((char *)addr) + p->offset);
}
+/* Check passed kprobe is valid and return kprobe in kprobe_table. */
+static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
+{
+ struct kprobe *old_p, *list_p;
+
+ old_p = get_kprobe(p->addr);
+ if (unlikely(!old_p))
+ return NULL;
+
+ if (p != old_p) {
+ list_for_each_entry_rcu(list_p, &old_p->list, list)
+ if (list_p == p)
+ /* kprobe p is a valid probe */
+ goto valid;
+ return NULL;
+ }
+valid:
+ return old_p;
+}
+
+/* Return error if the kprobe is being re-registered */
+static inline int check_kprobe_rereg(struct kprobe *p)
+{
+ int ret = 0;
+ struct kprobe *old_p;
+
+ mutex_lock(&kprobe_mutex);
+ old_p = __get_valid_kprobe(p);
+ if (old_p)
+ ret = -EINVAL;
+ mutex_unlock(&kprobe_mutex);
+ return ret;
+}
+
int __kprobes register_kprobe(struct kprobe *p)
{
int ret = 0;
@@ -685,6 +722,10 @@ int __kprobes register_kprobe(struct kprobe *p)
return -EINVAL;
p->addr = addr;
+ ret = check_kprobe_rereg(p);
+ if (ret)
+ return ret;
+
preempt_disable();
if (!kernel_text_address((unsigned long) p->addr) ||
in_kprobes_functions((unsigned long) p->addr)) {
@@ -754,26 +795,6 @@ out:
}
EXPORT_SYMBOL_GPL(register_kprobe);
-/* Check passed kprobe is valid and return kprobe in kprobe_table. */
-static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
-{
- struct kprobe *old_p, *list_p;
-
- old_p = get_kprobe(p->addr);
- if (unlikely(!old_p))
- return NULL;
-
- if (p != old_p) {
- list_for_each_entry_rcu(list_p, &old_p->list, list)
- if (list_p == p)
- /* kprobe p is a valid probe */
- goto valid;
- return NULL;
- }
-valid:
- return old_p;
-}
-
/*
* Unregister a kprobe without a scheduler synchronization.
*/
@@ -1141,6 +1162,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
arch_remove_kprobe(p);
}
+void __kprobes dump_kprobe(struct kprobe *kp)
+{
+ printk(KERN_WARNING "Dumping kprobe:\n");
+ printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
+ kp->symbol_name, kp->addr, kp->offset);
+}
+
/* Module notifier call back, checking kprobes on the module */
static int __kprobes kprobes_module_callback(struct notifier_block *nb,
unsigned long val, void *data)
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 5fe709982ca..ab7ae57773e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -150,29 +150,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
EXPORT_SYMBOL(kthread_create);
/**
- * kthread_bind - bind a just-created kthread to a cpu.
- * @k: thread created by kthread_create().
- * @cpu: cpu (might not be online, must be possible) for @k to run on.
- *
- * Description: This function is equivalent to set_cpus_allowed(),
- * except that @cpu doesn't need to be online, and the thread must be
- * stopped (i.e., just returned from kthread_create()).
- */
-void kthread_bind(struct task_struct *k, unsigned int cpu)
-{
- /* Must have done schedule() in kthread() before we set_task_cpu */
- if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) {
- WARN_ON(1);
- return;
- }
- set_task_cpu(k, cpu);
- k->cpus_allowed = cpumask_of_cpu(cpu);
- k->rt.nr_cpus_allowed = 1;
- k->flags |= PF_THREAD_BOUND;
-}
-EXPORT_SYMBOL(kthread_bind);
-
-/**
* kthread_stop - stop a thread created by kthread_create().
* @k: thread created by kthread_create().
*
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 9af56723c09..f5dcd36d315 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -49,7 +49,7 @@
#include "lockdep_internals.h"
#define CREATE_TRACE_POINTS
-#include <trace/events/lockdep.h>
+#include <trace/events/lock.h>
#ifdef CONFIG_PROVE_LOCKING
int prove_locking = 1;
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 61d5aa5eced..acd24e7643e 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -558,7 +558,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
static ATOMIC_NOTIFIER_HEAD(die_chain);
-int notrace notify_die(enum die_val val, const char *str,
+int notrace __kprobes notify_die(enum die_val val, const char *str,
struct pt_regs *regs, long err, int trap, int sig)
{
struct die_args args = {
diff --git a/kernel/params.c b/kernel/params.c
index 9da58eabdcb..d656c276508 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -218,15 +218,11 @@ int param_set_charp(const char *val, struct kernel_param *kp)
return -ENOSPC;
}
- if (kp->flags & KPARAM_KMALLOCED)
- kfree(*(char **)kp->arg);
-
/* This is a hack. We can't need to strdup in early boot, and we
* don't need to; this mangled commandline is preserved. */
if (slab_is_available()) {
- kp->flags |= KPARAM_KMALLOCED;
*(char **)kp->arg = kstrdup(val, GFP_KERNEL);
- if (!kp->arg)
+ if (!*(char **)kp->arg)
return -ENOMEM;
} else
*(const char **)kp->arg = val;
@@ -304,6 +300,7 @@ static int param_array(const char *name,
unsigned int min, unsigned int max,
void *elem, int elemsize,
int (*set)(const char *, struct kernel_param *kp),
+ u16 flags,
unsigned int *num)
{
int ret;
@@ -313,6 +310,7 @@ static int param_array(const char *name,
/* Get the name right for errors. */
kp.name = name;
kp.arg = elem;
+ kp.flags = flags;
/* No equals sign? */
if (!val) {
@@ -358,7 +356,8 @@ int param_array_set(const char *val, struct kernel_param *kp)
unsigned int temp_num;
return param_array(kp->name, val, 1, arr->max, arr->elem,
- arr->elemsize, arr->set, arr->num ?: &temp_num);
+ arr->elemsize, arr->set, kp->flags,
+ arr->num ?: &temp_num);
}
int param_array_get(char *buffer, struct kernel_param *kp)
@@ -605,11 +604,7 @@ void module_param_sysfs_remove(struct module *mod)
void destroy_params(const struct kernel_param *params, unsigned num)
{
- unsigned int i;
-
- for (i = 0; i < num; i++)
- if (params[i].flags & KPARAM_KMALLOCED)
- kfree(*(char **)params[i].arg);
+ /* FIXME: This should free kmalloced charp parameters. It doesn't. */
}
static void __init kernel_add_sysfs_param(const char *name,
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 9d0b5c66588..6b7ddba1dd6 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -28,6 +28,8 @@
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
+#include <linux/ftrace_event.h>
+#include <linux/hw_breakpoint.h>
#include <asm/irq_regs.h>
@@ -244,6 +246,49 @@ static void perf_unpin_context(struct perf_event_context *ctx)
put_ctx(ctx);
}
+static inline u64 perf_clock(void)
+{
+ return cpu_clock(smp_processor_id());
+}
+
+/*
+ * Update the record of the current time in a context.
+ */
+static void update_context_time(struct perf_event_context *ctx)
+{
+ u64 now = perf_clock();
+
+ ctx->time += now - ctx->timestamp;
+ ctx->timestamp = now;
+}
+
+/*
+ * Update the total_time_enabled and total_time_running fields for a event.
+ */
+static void update_event_times(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ u64 run_end;
+
+ if (event->state < PERF_EVENT_STATE_INACTIVE ||
+ event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
+ return;
+
+ if (ctx->is_active)
+ run_end = ctx->time;
+ else
+ run_end = event->tstamp_stopped;
+
+ event->total_time_enabled = run_end - event->tstamp_enabled;
+
+ if (event->state == PERF_EVENT_STATE_INACTIVE)
+ run_end = event->tstamp_stopped;
+ else
+ run_end = ctx->time;
+
+ event->total_time_running = run_end - event->tstamp_running;
+}
+
/*
* Add a event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
@@ -292,6 +337,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
if (event->group_leader != event)
event->group_leader->nr_siblings--;
+ update_event_times(event);
+
+ /*
+ * If event was in error state, then keep it
+ * that way, otherwise bogus counts will be
+ * returned on read(). The only way to get out
+ * of error state is by explicit re-enabling
+ * of the event
+ */
+ if (event->state > PERF_EVENT_STATE_OFF)
+ event->state = PERF_EVENT_STATE_OFF;
+
/*
* If this was a group event with sibling events then
* upgrade the siblings to singleton events by adding them
@@ -445,50 +502,11 @@ retry:
* can remove the event safely, if the call above did not
* succeed.
*/
- if (!list_empty(&event->group_entry)) {
+ if (!list_empty(&event->group_entry))
list_del_event(event, ctx);
- }
spin_unlock_irq(&ctx->lock);
}
-static inline u64 perf_clock(void)
-{
- return cpu_clock(smp_processor_id());
-}
-
-/*
- * Update the record of the current time in a context.
- */
-static void update_context_time(struct perf_event_context *ctx)
-{
- u64 now = perf_clock();
-
- ctx->time += now - ctx->timestamp;
- ctx->timestamp = now;
-}
-
-/*
- * Update the total_time_enabled and total_time_running fields for a event.
- */
-static void update_event_times(struct perf_event *event)
-{
- struct perf_event_context *ctx = event->ctx;
- u64 run_end;
-
- if (event->state < PERF_EVENT_STATE_INACTIVE ||
- event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
- return;
-
- event->total_time_enabled = ctx->time - event->tstamp_enabled;
-
- if (event->state == PERF_EVENT_STATE_INACTIVE)
- run_end = event->tstamp_stopped;
- else
- run_end = ctx->time;
-
- event->total_time_running = run_end - event->tstamp_running;
-}
-
/*
* Update total_time_enabled and total_time_running for all events in a group.
*/
@@ -1031,10 +1049,10 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
update_context_time(ctx);
perf_disable();
- if (ctx->nr_active)
+ if (ctx->nr_active) {
list_for_each_entry(event, &ctx->group_list, group_entry)
group_sched_out(event, cpuctx, ctx);
-
+ }
perf_enable();
out:
spin_unlock(&ctx->lock);
@@ -1059,8 +1077,6 @@ static int context_equiv(struct perf_event_context *ctx1,
&& !ctx1->pin_count && !ctx2->pin_count;
}
-static void __perf_event_read(void *event);
-
static void __perf_event_sync_stat(struct perf_event *event,
struct perf_event *next_event)
{
@@ -1078,8 +1094,8 @@ static void __perf_event_sync_stat(struct perf_event *event,
*/
switch (event->state) {
case PERF_EVENT_STATE_ACTIVE:
- __perf_event_read(event);
- break;
+ event->pmu->read(event);
+ /* fall-through */
case PERF_EVENT_STATE_INACTIVE:
update_event_times(event);
@@ -1118,6 +1134,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
if (!ctx->nr_stat)
return;
+ update_context_time(ctx);
+
event = list_first_entry(&ctx->event_list,
struct perf_event, event_entry);
@@ -1161,8 +1179,6 @@ void perf_event_task_sched_out(struct task_struct *task,
if (likely(!ctx || !cpuctx->task_ctx))
return;
- update_context_time(ctx);
-
rcu_read_lock();
parent = rcu_dereference(ctx->parent_ctx);
next_ctx = next->perf_event_ctxp;
@@ -1355,7 +1371,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
u64 interrupts, freq;
spin_lock(&ctx->lock);
- list_for_each_entry(event, &ctx->group_list, group_entry) {
+ list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;
@@ -1515,7 +1531,6 @@ static void __perf_event_read(void *info)
struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
- unsigned long flags;
/*
* If this is a task context, we need to check whether it is
@@ -1527,12 +1542,12 @@ static void __perf_event_read(void *info)
if (ctx->task && cpuctx->task_ctx != ctx)
return;
- local_irq_save(flags);
- if (ctx->is_active)
- update_context_time(ctx);
- event->pmu->read(event);
+ spin_lock(&ctx->lock);
+ update_context_time(ctx);
update_event_times(event);
- local_irq_restore(flags);
+ spin_unlock(&ctx->lock);
+
+ event->pmu->read(event);
}
static u64 perf_event_read(struct perf_event *event)
@@ -1545,7 +1560,13 @@ static u64 perf_event_read(struct perf_event *event)
smp_call_function_single(event->oncpu,
__perf_event_read, event, 1);
} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
+ struct perf_event_context *ctx = event->ctx;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+ update_context_time(ctx);
update_event_times(event);
+ spin_unlock_irqrestore(&ctx->lock, flags);
}
return atomic64_read(&event->count);
@@ -1658,6 +1679,8 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
return ERR_PTR(err);
}
+static void perf_event_free_filter(struct perf_event *event);
+
static void free_event_rcu(struct rcu_head *head)
{
struct perf_event *event;
@@ -1665,6 +1688,7 @@ static void free_event_rcu(struct rcu_head *head)
event = container_of(head, struct perf_event, rcu_head);
if (event->ns)
put_pid_ns(event->ns);
+ perf_event_free_filter(event);
kfree(event);
}
@@ -1696,16 +1720,10 @@ static void free_event(struct perf_event *event)
call_rcu(&event->rcu_head, free_event_rcu);
}
-/*
- * Called when the last reference to the file is gone.
- */
-static int perf_release(struct inode *inode, struct file *file)
+int perf_event_release_kernel(struct perf_event *event)
{
- struct perf_event *event = file->private_data;
struct perf_event_context *ctx = event->ctx;
- file->private_data = NULL;
-
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
perf_event_remove_from_context(event);
@@ -1720,6 +1738,19 @@ static int perf_release(struct inode *inode, struct file *file)
return 0;
}
+EXPORT_SYMBOL_GPL(perf_event_release_kernel);
+
+/*
+ * Called when the last reference to the file is gone.
+ */
+static int perf_release(struct inode *inode, struct file *file)
+{
+ struct perf_event *event = file->private_data;
+
+ file->private_data = NULL;
+
+ return perf_event_release_kernel(event);
+}
static int perf_event_read_size(struct perf_event *event)
{
@@ -1746,91 +1777,94 @@ static int perf_event_read_size(struct perf_event *event)
return size;
}
-static u64 perf_event_read_value(struct perf_event *event)
+u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
struct perf_event *child;
u64 total = 0;
+ *enabled = 0;
+ *running = 0;
+
+ mutex_lock(&event->child_mutex);
total += perf_event_read(event);
- list_for_each_entry(child, &event->child_list, child_list)
+ *enabled += event->total_time_enabled +
+ atomic64_read(&event->child_total_time_enabled);
+ *running += event->total_time_running +
+ atomic64_read(&event->child_total_time_running);
+
+ list_for_each_entry(child, &event->child_list, child_list) {
total += perf_event_read(child);
+ *enabled += child->total_time_enabled;
+ *running += child->total_time_running;
+ }
+ mutex_unlock(&event->child_mutex);
return total;
}
-
-static int perf_event_read_entry(struct perf_event *event,
- u64 read_format, char __user *buf)
-{
- int n = 0, count = 0;
- u64 values[2];
-
- values[n++] = perf_event_read_value(event);
- if (read_format & PERF_FORMAT_ID)
- values[n++] = primary_event_id(event);
-
- count = n * sizeof(u64);
-
- if (copy_to_user(buf, values, count))
- return -EFAULT;
-
- return count;
-}
+EXPORT_SYMBOL_GPL(perf_event_read_value);
static int perf_event_read_group(struct perf_event *event,
u64 read_format, char __user *buf)
{
struct perf_event *leader = event->group_leader, *sub;
- int n = 0, size = 0, err = -EFAULT;
- u64 values[3];
+ int n = 0, size = 0, ret = -EFAULT;
+ struct perf_event_context *ctx = leader->ctx;
+ u64 values[5];
+ u64 count, enabled, running;
+
+ mutex_lock(&ctx->mutex);
+ count = perf_event_read_value(leader, &enabled, &running);
values[n++] = 1 + leader->nr_siblings;
- if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
- values[n++] = leader->total_time_enabled +
- atomic64_read(&leader->child_total_time_enabled);
- }
- if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
- values[n++] = leader->total_time_running +
- atomic64_read(&leader->child_total_time_running);
- }
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ values[n++] = enabled;
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ values[n++] = running;
+ values[n++] = count;
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_event_id(leader);
size = n * sizeof(u64);
if (copy_to_user(buf, values, size))
- return -EFAULT;
-
- err = perf_event_read_entry(leader, read_format, buf + size);
- if (err < 0)
- return err;
+ goto unlock;
- size += err;
+ ret = size;
list_for_each_entry(sub, &leader->sibling_list, group_entry) {
- err = perf_event_read_entry(sub, read_format,
- buf + size);
- if (err < 0)
- return err;
+ n = 0;
+
+ values[n++] = perf_event_read_value(sub, &enabled, &running);
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_event_id(sub);
+
+ size = n * sizeof(u64);
+
+ if (copy_to_user(buf + ret, values, size)) {
+ ret = -EFAULT;
+ goto unlock;
+ }
- size += err;
+ ret += size;
}
+unlock:
+ mutex_unlock(&ctx->mutex);
- return size;
+ return ret;
}
static int perf_event_read_one(struct perf_event *event,
u64 read_format, char __user *buf)
{
+ u64 enabled, running;
u64 values[4];
int n = 0;
- values[n++] = perf_event_read_value(event);
- if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
- values[n++] = event->total_time_enabled +
- atomic64_read(&event->child_total_time_enabled);
- }
- if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
- values[n++] = event->total_time_running +
- atomic64_read(&event->child_total_time_running);
- }
+ values[n++] = perf_event_read_value(event, &enabled, &running);
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ values[n++] = enabled;
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ values[n++] = running;
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
@@ -1861,12 +1895,10 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
return -ENOSPC;
WARN_ON_ONCE(event->ctx->parent_ctx);
- mutex_lock(&event->child_mutex);
if (read_format & PERF_FORMAT_GROUP)
ret = perf_event_read_group(event, read_format, buf);
else
ret = perf_event_read_one(event, read_format, buf);
- mutex_unlock(&event->child_mutex);
return ret;
}
@@ -1974,7 +2006,8 @@ unlock:
return ret;
}
-int perf_event_set_output(struct perf_event *event, int output_fd);
+static int perf_event_set_output(struct perf_event *event, int output_fd);
+static int perf_event_set_filter(struct perf_event *event, void __user *arg);
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
@@ -2002,6 +2035,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case PERF_EVENT_IOC_SET_OUTPUT:
return perf_event_set_output(event, arg);
+ case PERF_EVENT_IOC_SET_FILTER:
+ return perf_event_set_filter(event, (void __user *)arg);
+
default:
return -ENOTTY;
}
@@ -2174,6 +2210,7 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
perf_mmap_free_page((unsigned long)data->user_page);
for (i = 0; i < data->nr_pages; i++)
perf_mmap_free_page((unsigned long)data->data_pages[i]);
+ kfree(data);
}
#else
@@ -2214,6 +2251,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)
perf_mmap_unmark_page(base + (i * PAGE_SIZE));
vfree(base);
+ kfree(data);
}
static void perf_mmap_data_free(struct perf_mmap_data *data)
@@ -2307,7 +2345,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
}
if (!data->watermark)
- data->watermark = max_t(long, PAGE_SIZE, max_size / 2);
+ data->watermark = max_size / 2;
rcu_assign_pointer(event->data, data);
@@ -2319,7 +2357,6 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
perf_mmap_data_free(data);
- kfree(data);
}
static void perf_mmap_data_release(struct perf_event *event)
@@ -2666,20 +2703,21 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
static void perf_output_lock(struct perf_output_handle *handle)
{
struct perf_mmap_data *data = handle->data;
- int cpu;
+ int cur, cpu = get_cpu();
handle->locked = 0;
- local_irq_save(handle->flags);
- cpu = smp_processor_id();
-
- if (in_nmi() && atomic_read(&data->lock) == cpu)
- return;
+ for (;;) {
+ cur = atomic_cmpxchg(&data->lock, -1, cpu);
+ if (cur == -1) {
+ handle->locked = 1;
+ break;
+ }
+ if (cur == cpu)
+ break;
- while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
cpu_relax();
-
- handle->locked = 1;
+ }
}
static void perf_output_unlock(struct perf_output_handle *handle)
@@ -2725,7 +2763,7 @@ again:
if (atomic_xchg(&data->wakeup, 0))
perf_output_wakeup(handle);
out:
- local_irq_restore(handle->flags);
+ put_cpu();
}
void perf_output_copy(struct perf_output_handle *handle,
@@ -3236,15 +3274,10 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
{
struct perf_event *event;
- if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
- return;
-
- rcu_read_lock();
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
if (perf_event_task_match(event))
perf_event_task_output(event, task_event);
}
- rcu_read_unlock();
}
static void perf_event_task_event(struct perf_task_event *task_event)
@@ -3252,11 +3285,11 @@ static void perf_event_task_event(struct perf_task_event *task_event)
struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx = task_event->task_ctx;
+ rcu_read_lock();
cpuctx = &get_cpu_var(perf_cpu_context);
perf_event_task_ctx(&cpuctx->ctx, task_event);
put_cpu_var(perf_cpu_context);
- rcu_read_lock();
if (!ctx)
ctx = rcu_dereference(task_event->task->perf_event_ctxp);
if (ctx)
@@ -3348,15 +3381,10 @@ static void perf_event_comm_ctx(struct perf_event_context *ctx,
{
struct perf_event *event;
- if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
- return;
-
- rcu_read_lock();
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
if (perf_event_comm_match(event))
perf_event_comm_output(event, comm_event);
}
- rcu_read_unlock();
}
static void perf_event_comm_event(struct perf_comm_event *comm_event)
@@ -3367,7 +3395,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
char comm[TASK_COMM_LEN];
memset(comm, 0, sizeof(comm));
- strncpy(comm, comm_event->task->comm, sizeof(comm));
+ strlcpy(comm, comm_event->task->comm, sizeof(comm));
size = ALIGN(strlen(comm)+1, sizeof(u64));
comm_event->comm = comm;
@@ -3375,11 +3403,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
+ rcu_read_lock();
cpuctx = &get_cpu_var(perf_cpu_context);
perf_event_comm_ctx(&cpuctx->ctx, comm_event);
put_cpu_var(perf_cpu_context);
- rcu_read_lock();
/*
* doesn't really matter which of the child contexts the
* events ends up in.
@@ -3472,15 +3500,10 @@ static void perf_event_mmap_ctx(struct perf_event_context *ctx,
{
struct perf_event *event;
- if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
- return;
-
- rcu_read_lock();
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
if (perf_event_mmap_match(event, mmap_event))
perf_event_mmap_output(event, mmap_event);
}
- rcu_read_unlock();
}
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
@@ -3536,11 +3559,11 @@ got_name:
mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
+ rcu_read_lock();
cpuctx = &get_cpu_var(perf_cpu_context);
perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
put_cpu_var(perf_cpu_context);
- rcu_read_lock();
/*
* doesn't really matter which of the child contexts the
* events ends up in.
@@ -3679,7 +3702,11 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
perf_event_disable(event);
}
- perf_event_output(event, nmi, data, regs);
+ if (event->overflow_handler)
+ event->overflow_handler(event, nmi, data, regs);
+ else
+ perf_event_output(event, nmi, data, regs);
+
return ret;
}
@@ -3724,16 +3751,16 @@ again:
return nr;
}
-static void perf_swevent_overflow(struct perf_event *event,
+static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
int nmi, struct perf_sample_data *data,
struct pt_regs *regs)
{
struct hw_perf_event *hwc = &event->hw;
int throttle = 0;
- u64 overflow;
data->period = event->hw.last_period;
- overflow = perf_swevent_set_period(event);
+ if (!overflow)
+ overflow = perf_swevent_set_period(event);
if (hwc->interrupts == MAX_INTERRUPTS)
return;
@@ -3766,14 +3793,19 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
atomic64_add(nr, &event->count);
+ if (!regs)
+ return;
+
if (!hwc->sample_period)
return;
- if (!regs)
+ if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
+ return perf_swevent_overflow(event, 1, nmi, data, regs);
+
+ if (atomic64_add_negative(nr, &hwc->period_left))
return;
- if (!atomic64_add_negative(nr, &hwc->period_left))
- perf_swevent_overflow(event, nmi, data, regs);
+ perf_swevent_overflow(event, 0, nmi, data, regs);
}
static int perf_swevent_is_counting(struct perf_event *event)
@@ -3806,25 +3838,44 @@ static int perf_swevent_is_counting(struct perf_event *event)
return 1;
}
+static int perf_tp_event_match(struct perf_event *event,
+ struct perf_sample_data *data);
+
+static int perf_exclude_event(struct perf_event *event,
+ struct pt_regs *regs)
+{
+ if (regs) {
+ if (event->attr.exclude_user && user_mode(regs))
+ return 1;
+
+ if (event->attr.exclude_kernel && !user_mode(regs))
+ return 1;
+ }
+
+ return 0;
+}
+
static int perf_swevent_match(struct perf_event *event,
enum perf_type_id type,
- u32 event_id, struct pt_regs *regs)
+ u32 event_id,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
if (!perf_swevent_is_counting(event))
return 0;
if (event->attr.type != type)
return 0;
+
if (event->attr.config != event_id)
return 0;
- if (regs) {
- if (event->attr.exclude_user && user_mode(regs))
- return 0;
+ if (perf_exclude_event(event, regs))
+ return 0;
- if (event->attr.exclude_kernel && !user_mode(regs))
- return 0;
- }
+ if (event->attr.type == PERF_TYPE_TRACEPOINT &&
+ !perf_tp_event_match(event, data))
+ return 0;
return 1;
}
@@ -3837,49 +3888,59 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,
{
struct perf_event *event;
- if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
- return;
-
- rcu_read_lock();
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (perf_swevent_match(event, type, event_id, regs))
+ if (perf_swevent_match(event, type, event_id, data, regs))
perf_swevent_add(event, nr, nmi, data, regs);
}
- rcu_read_unlock();
}
-static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx)
+int perf_swevent_get_recursion_context(void)
{
+ struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+ int rctx;
+
if (in_nmi())
- return &cpuctx->recursion[3];
+ rctx = 3;
+ else if (in_irq())
+ rctx = 2;
+ else if (in_softirq())
+ rctx = 1;
+ else
+ rctx = 0;
- if (in_irq())
- return &cpuctx->recursion[2];
+ if (cpuctx->recursion[rctx]) {
+ put_cpu_var(perf_cpu_context);
+ return -1;
+ }
- if (in_softirq())
- return &cpuctx->recursion[1];
+ cpuctx->recursion[rctx]++;
+ barrier();
- return &cpuctx->recursion[0];
+ return rctx;
+}
+EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
+
+void perf_swevent_put_recursion_context(int rctx)
+{
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ barrier();
+ cpuctx->recursion[rctx]--;
+ put_cpu_var(perf_cpu_context);
}
+EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
u64 nr, int nmi,
struct perf_sample_data *data,
struct pt_regs *regs)
{
- struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
- int *recursion = perf_swevent_recursion_context(cpuctx);
+ struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
- if (*recursion)
- goto out;
-
- (*recursion)++;
- barrier();
-
+ cpuctx = &__get_cpu_var(perf_cpu_context);
+ rcu_read_lock();
perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
nr, nmi, data, regs);
- rcu_read_lock();
/*
* doesn't really matter which of the child contexts the
* events ends up in.
@@ -3888,23 +3949,24 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
if (ctx)
perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
rcu_read_unlock();
-
- barrier();
- (*recursion)--;
-
-out:
- put_cpu_var(perf_cpu_context);
}
void __perf_sw_event(u32 event_id, u64 nr, int nmi,
struct pt_regs *regs, u64 addr)
{
- struct perf_sample_data data = {
- .addr = addr,
- };
+ struct perf_sample_data data;
+ int rctx;
- do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi,
- &data, regs);
+ rctx = perf_swevent_get_recursion_context();
+ if (rctx < 0)
+ return;
+
+ data.addr = addr;
+ data.raw = NULL;
+
+ do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
+
+ perf_swevent_put_recursion_context(rctx);
}
static void perf_swevent_read(struct perf_event *event)
@@ -3949,6 +4011,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
event->pmu->read(event);
data.addr = 0;
+ data.period = event->hw.last_period;
regs = get_irq_regs();
/*
* In case we exclude kernel IPs or are somehow not in interrupt
@@ -3959,8 +4022,9 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
regs = task_pt_regs(current);
if (regs) {
- if (perf_event_overflow(event, 0, &data, regs))
- ret = HRTIMER_NORESTART;
+ if (!(event->attr.exclude_idle && current->pid == 0))
+ if (perf_event_overflow(event, 0, &data, regs))
+ ret = HRTIMER_NORESTART;
}
period = max_t(u64, 10000, event->hw.sample_period);
@@ -3969,6 +4033,42 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
return ret;
}
+static void perf_swevent_start_hrtimer(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hwc->hrtimer.function = perf_swevent_hrtimer;
+ if (hwc->sample_period) {
+ u64 period;
+
+ if (hwc->remaining) {
+ if (hwc->remaining < 0)
+ period = 10000;
+ else
+ period = hwc->remaining;
+ hwc->remaining = 0;
+ } else {
+ period = max_t(u64, 10000, hwc->sample_period);
+ }
+ __hrtimer_start_range_ns(&hwc->hrtimer,
+ ns_to_ktime(period), 0,
+ HRTIMER_MODE_REL, 0);
+ }
+}
+
+static void perf_swevent_cancel_hrtimer(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (hwc->sample_period) {
+ ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
+ hwc->remaining = ktime_to_ns(remaining);
+
+ hrtimer_cancel(&hwc->hrtimer);
+ }
+}
+
/*
* Software event: cpu wall time clock
*/
@@ -3991,22 +4091,14 @@ static int cpu_clock_perf_event_enable(struct perf_event *event)
int cpu = raw_smp_processor_id();
atomic64_set(&hwc->prev_count, cpu_clock(cpu));
- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hwc->hrtimer.function = perf_swevent_hrtimer;
- if (hwc->sample_period) {
- u64 period = max_t(u64, 10000, hwc->sample_period);
- __hrtimer_start_range_ns(&hwc->hrtimer,
- ns_to_ktime(period), 0,
- HRTIMER_MODE_REL, 0);
- }
+ perf_swevent_start_hrtimer(event);
return 0;
}
static void cpu_clock_perf_event_disable(struct perf_event *event)
{
- if (event->hw.sample_period)
- hrtimer_cancel(&event->hw.hrtimer);
+ perf_swevent_cancel_hrtimer(event);
cpu_clock_perf_event_update(event);
}
@@ -4043,22 +4135,15 @@ static int task_clock_perf_event_enable(struct perf_event *event)
now = event->ctx->time;
atomic64_set(&hwc->prev_count, now);
- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hwc->hrtimer.function = perf_swevent_hrtimer;
- if (hwc->sample_period) {
- u64 period = max_t(u64, 10000, hwc->sample_period);
- __hrtimer_start_range_ns(&hwc->hrtimer,
- ns_to_ktime(period), 0,
- HRTIMER_MODE_REL, 0);
- }
+
+ perf_swevent_start_hrtimer(event);
return 0;
}
static void task_clock_perf_event_disable(struct perf_event *event)
{
- if (event->hw.sample_period)
- hrtimer_cancel(&event->hw.hrtimer);
+ perf_swevent_cancel_hrtimer(event);
task_clock_perf_event_update(event, event->ctx->time);
}
@@ -4086,6 +4171,7 @@ static const struct pmu perf_ops_task_clock = {
};
#ifdef CONFIG_EVENT_PROFILE
+
void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
int entry_size)
{
@@ -4104,13 +4190,21 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
if (!regs)
regs = task_pt_regs(current);
+ /* Trace events already protected against recursion */
do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
&data, regs);
}
EXPORT_SYMBOL_GPL(perf_tp_event);
-extern int ftrace_profile_enable(int);
-extern void ftrace_profile_disable(int);
+static int perf_tp_event_match(struct perf_event *event,
+ struct perf_sample_data *data)
+{
+ void *record = data->raw->data;
+
+ if (likely(!event->filter) || filter_match_preds(event->filter, record))
+ return 1;
+ return 0;
+}
static void tp_perf_event_destroy(struct perf_event *event)
{
@@ -4135,11 +4229,99 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
return &perf_ops_generic;
}
+
+static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+ char *filter_str;
+ int ret;
+
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -EINVAL;
+
+ filter_str = strndup_user(arg, PAGE_SIZE);
+ if (IS_ERR(filter_str))
+ return PTR_ERR(filter_str);
+
+ ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+
+ kfree(filter_str);
+ return ret;
+}
+
+static void perf_event_free_filter(struct perf_event *event)
+{
+ ftrace_profile_free_filter(event);
+}
+
#else
+
+static int perf_tp_event_match(struct perf_event *event,
+ struct perf_sample_data *data)
+{
+ return 1;
+}
+
static const struct pmu *tp_perf_event_init(struct perf_event *event)
{
return NULL;
}
+
+static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+ return -ENOENT;
+}
+
+static void perf_event_free_filter(struct perf_event *event)
+{
+}
+
+#endif /* CONFIG_EVENT_PROFILE */
+
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+static void bp_perf_event_destroy(struct perf_event *event)
+{
+ release_bp_slot(event);
+}
+
+static const struct pmu *bp_perf_event_init(struct perf_event *bp)
+{
+ int err;
+ /*
+ * The breakpoint is already filled if we haven't created the counter
+ * through perf syscall
+ * FIXME: manage to get trigerred to NULL if it comes from syscalls
+ */
+ if (!bp->callback)
+ err = register_perf_hw_breakpoint(bp);
+ else
+ err = __register_perf_hw_breakpoint(bp);
+ if (err)
+ return ERR_PTR(err);
+
+ bp->destroy = bp_perf_event_destroy;
+
+ return &perf_ops_bp;
+}
+
+void perf_bp_event(struct perf_event *bp, void *data)
+{
+ struct perf_sample_data sample;
+ struct pt_regs *regs = data;
+
+ sample.addr = bp->attr.bp_addr;
+
+ if (!perf_exclude_event(bp, regs))
+ perf_swevent_add(bp, 1, 1, &sample, regs);
+}
+#else
+static const struct pmu *bp_perf_event_init(struct perf_event *bp)
+{
+ return NULL;
+}
+
+void perf_bp_event(struct perf_event *bp, void *regs)
+{
+}
#endif
atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
@@ -4186,6 +4368,8 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
case PERF_COUNT_SW_CONTEXT_SWITCHES:
case PERF_COUNT_SW_CPU_MIGRATIONS:
+ case PERF_COUNT_SW_ALIGNMENT_FAULTS:
+ case PERF_COUNT_SW_EMULATION_FAULTS:
if (!event->parent) {
atomic_inc(&perf_swevent_enabled[event_id]);
event->destroy = sw_perf_event_destroy;
@@ -4206,6 +4390,7 @@ perf_event_alloc(struct perf_event_attr *attr,
struct perf_event_context *ctx,
struct perf_event *group_leader,
struct perf_event *parent_event,
+ perf_callback_t callback,
gfp_t gfpflags)
{
const struct pmu *pmu;
@@ -4248,6 +4433,11 @@ perf_event_alloc(struct perf_event_attr *attr,
event->state = PERF_EVENT_STATE_INACTIVE;
+ if (!callback && parent_event)
+ callback = parent_event->callback;
+
+ event->callback = callback;
+
if (attr->disabled)
event->state = PERF_EVENT_STATE_OFF;
@@ -4282,6 +4472,11 @@ perf_event_alloc(struct perf_event_attr *attr,
pmu = tp_perf_event_init(event);
break;
+ case PERF_TYPE_BREAKPOINT:
+ pmu = bp_perf_event_init(event);
+ break;
+
+
default:
break;
}
@@ -4394,7 +4589,7 @@ err_size:
goto out;
}
-int perf_event_set_output(struct perf_event *event, int output_fd)
+static int perf_event_set_output(struct perf_event *event, int output_fd)
{
struct perf_event *output_event = NULL;
struct file *output_file = NULL;
@@ -4524,7 +4719,7 @@ SYSCALL_DEFINE5(perf_event_open,
}
event = perf_event_alloc(&attr, cpu, ctx, group_leader,
- NULL, GFP_KERNEL);
+ NULL, NULL, GFP_KERNEL);
err = PTR_ERR(event);
if (IS_ERR(event))
goto err_put_context;
@@ -4572,6 +4767,60 @@ err_put_context:
return err;
}
+/**
+ * perf_event_create_kernel_counter
+ *
+ * @attr: attributes of the counter to create
+ * @cpu: cpu in which the counter is bound
+ * @pid: task to profile
+ */
+struct perf_event *
+perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
+ pid_t pid, perf_callback_t callback)
+{
+ struct perf_event *event;
+ struct perf_event_context *ctx;
+ int err;
+
+ /*
+ * Get the target context (task or percpu):
+ */
+
+ ctx = find_get_context(pid, cpu);
+ if (IS_ERR(ctx)) {
+ err = PTR_ERR(ctx);
+ goto err_exit;
+ }
+
+ event = perf_event_alloc(attr, cpu, ctx, NULL,
+ NULL, callback, GFP_KERNEL);
+ if (IS_ERR(event)) {
+ err = PTR_ERR(event);
+ goto err_put_context;
+ }
+
+ event->filp = NULL;
+ WARN_ON_ONCE(ctx->parent_ctx);
+ mutex_lock(&ctx->mutex);
+ perf_install_in_context(ctx, event, cpu);
+ ++ctx->generation;
+ mutex_unlock(&ctx->mutex);
+
+ event->owner = current;
+ get_task_struct(current);
+ mutex_lock(&current->perf_event_mutex);
+ list_add_tail(&event->owner_entry, &current->perf_event_list);
+ mutex_unlock(&current->perf_event_mutex);
+
+ return event;
+
+ err_put_context:
+ put_ctx(ctx);
+ err_exit:
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
+
/*
* inherit a event from parent task to child task:
*/
@@ -4597,7 +4846,7 @@ inherit_event(struct perf_event *parent_event,
child_event = perf_event_alloc(&parent_event->attr,
parent_event->cpu, child_ctx,
group_leader, parent_event,
- GFP_KERNEL);
+ NULL, GFP_KERNEL);
if (IS_ERR(child_event))
return child_event;
get_ctx(child_ctx);
@@ -4615,6 +4864,8 @@ inherit_event(struct perf_event *parent_event,
if (parent_event->attr.freq)
child_event->hw.sample_period = parent_event->hw.sample_period;
+ child_event->overflow_handler = parent_event->overflow_handler;
+
/*
* Link it up in the child's context:
*/
@@ -4704,7 +4955,6 @@ __perf_event_exit_task(struct perf_event *child_event,
{
struct perf_event *parent_event;
- update_event_times(child_event);
perf_event_remove_from_context(child_event);
parent_event = child_event->parent;
@@ -4756,6 +5006,7 @@ void perf_event_exit_task(struct task_struct *child)
* the events from it.
*/
unclone_ctx(child_ctx);
+ update_context_time(child_ctx);
spin_unlock_irqrestore(&child_ctx->lock, flags);
/*
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 04b3a83d686..04a9e90d248 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -693,21 +693,22 @@ static int software_resume(void)
/* The snapshot device should not be opened while we're running */
if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
error = -EBUSY;
+ swsusp_close(FMODE_READ);
goto Unlock;
}
pm_prepare_console();
error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
if (error)
- goto Finish;
+ goto close_finish;
error = usermodehelper_disable();
if (error)
- goto Finish;
+ goto close_finish;
error = create_basic_memory_bitmaps();
if (error)
- goto Finish;
+ goto close_finish;
pr_debug("PM: Preparing processes for restore.\n");
error = prepare_processes();
@@ -719,6 +720,7 @@ static int software_resume(void)
pr_debug("PM: Reading hibernation image.\n");
error = swsusp_read(&flags);
+ swsusp_close(FMODE_READ);
if (!error)
hibernation_restore(flags & SF_PLATFORM_MODE);
@@ -737,6 +739,9 @@ static int software_resume(void)
mutex_unlock(&pm_mutex);
pr_debug("PM: Resume from disk failed.\n");
return error;
+close_finish:
+ swsusp_close(FMODE_READ);
+ goto Finish;
}
late_initcall(software_resume);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 17d8bb1acf9..25596e450ac 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -19,7 +19,7 @@
* The time it takes is system-specific though, so when we test this
* during system bootup we allow a LOT of time.
*/
-#define TEST_SUSPEND_SECONDS 5
+#define TEST_SUSPEND_SECONDS 10
static unsigned long suspend_test_start_time;
@@ -49,7 +49,8 @@ void suspend_test_finish(const char *label)
* has some performance issues. The stack dump of a WARN_ON
* is more likely to get the right attention than a printk...
*/
- WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
+ WARN(msec > (TEST_SUSPEND_SECONDS * 1000),
+ "Component: %s, time: %u\n", label, msec);
}
/*
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b101cdc4df3..890f6b11b1d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -314,7 +314,6 @@ static int save_image(struct swap_map_handle *handle,
{
unsigned int m;
int ret;
- int error = 0;
int nr_pages;
int err2;
struct bio *bio;
@@ -329,26 +328,27 @@ static int save_image(struct swap_map_handle *handle,
nr_pages = 0;
bio = NULL;
do_gettimeofday(&start);
- do {
+ while (1) {
ret = snapshot_read_next(snapshot, PAGE_SIZE);
- if (ret > 0) {
- error = swap_write_page(handle, data_of(*snapshot),
- &bio);
- if (error)
- break;
- if (!(nr_pages % m))
- printk("\b\b\b\b%3d%%", nr_pages / m);
- nr_pages++;
- }
- } while (ret > 0);
+ if (ret <= 0)
+ break;
+ ret = swap_write_page(handle, data_of(*snapshot), &bio);
+ if (ret)
+ break;
+ if (!(nr_pages % m))
+ printk("\b\b\b\b%3d%%", nr_pages / m);
+ nr_pages++;
+ }
err2 = wait_on_bio_chain(&bio);
do_gettimeofday(&stop);
- if (!error)
- error = err2;
- if (!error)
+ if (!ret)
+ ret = err2;
+ if (!ret)
printk("\b\b\b\bdone\n");
+ else
+ printk("\n");
swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
- return error;
+ return ret;
}
/**
@@ -536,7 +536,8 @@ static int load_image(struct swap_map_handle *handle,
snapshot_write_finalize(snapshot);
if (!snapshot_image_loaded(snapshot))
error = -ENODATA;
- }
+ } else
+ printk("\n");
swsusp_show_speed(&start, &stop, nr_to_read, "Read");
return error;
}
@@ -572,8 +573,6 @@ int swsusp_read(unsigned int *flags_p)
error = load_image(&handle, &snapshot, header->pages - 1);
release_swap_reader(&handle);
- blkdev_put(resume_bdev, FMODE_READ);
-
if (!error)
pr_debug("PM: Image successfully loaded\n");
else
@@ -596,7 +595,7 @@ int swsusp_check(void)
error = bio_read_page(swsusp_resume_block,
swsusp_header, NULL);
if (error)
- return error;
+ goto put;
if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
@@ -604,8 +603,10 @@ int swsusp_check(void)
error = bio_write_page(swsusp_resume_block,
swsusp_header, NULL);
} else {
- return -EINVAL;
+ error = -EINVAL;
}
+
+put:
if (error)
blkdev_put(resume_bdev, FMODE_READ);
else
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 705f02ac743..f3077c0ab18 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -59,7 +59,7 @@
NUM_RCU_LVL_2, \
NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \
}, \
- .signaled = RCU_SIGNAL_INIT, \
+ .signaled = RCU_GP_IDLE, \
.gpnum = -300, \
.completed = -300, \
.onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
@@ -657,14 +657,17 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
* irqs disabled.
*/
rcu_for_each_node_breadth_first(rsp, rnp) {
- spin_lock(&rnp->lock); /* irqs already disabled. */
+ spin_lock(&rnp->lock); /* irqs already disabled. */
rcu_preempt_check_blocked_tasks(rnp);
rnp->qsmask = rnp->qsmaskinit;
rnp->gpnum = rsp->gpnum;
- spin_unlock(&rnp->lock); /* irqs already disabled. */
+ spin_unlock(&rnp->lock); /* irqs remain disabled. */
}
+ rnp = rcu_get_root(rsp);
+ spin_lock(&rnp->lock); /* irqs already disabled. */
rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
+ spin_unlock(&rnp->lock); /* irqs remain disabled. */
spin_unlock_irqrestore(&rsp->onofflock, flags);
}
@@ -706,6 +709,7 @@ static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
{
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
rsp->completed = rsp->gpnum;
+ rsp->signaled = RCU_GP_IDLE;
rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
}
@@ -913,7 +917,20 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
spin_unlock(&rnp->lock); /* irqs remain disabled. */
break;
}
- rcu_preempt_offline_tasks(rsp, rnp, rdp);
+
+ /*
+ * If there was a task blocking the current grace period,
+ * and if all CPUs have checked in, we need to propagate
+ * the quiescent state up the rcu_node hierarchy. But that
+ * is inconvenient at the moment due to deadlock issues if
+ * this should end the current grace period. So set the
+ * offlined CPU's bit in ->qsmask in order to force the
+ * next force_quiescent_state() invocation to clean up this
+ * mess in a deadlock-free manner.
+ */
+ if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask)
+ rnp->qsmask |= mask;
+
mask = rnp->grpmask;
spin_unlock(&rnp->lock); /* irqs remain disabled. */
rnp = rnp->parent;
@@ -958,7 +975,7 @@ static void rcu_offline_cpu(int cpu)
* Invoke any RCU callbacks that have made it to the end of their grace
* period. Thottle as specified by rdp->blimit.
*/
-static void rcu_do_batch(struct rcu_data *rdp)
+static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
{
unsigned long flags;
struct rcu_head *next, *list, **tail;
@@ -1011,6 +1028,13 @@ static void rcu_do_batch(struct rcu_data *rdp)
if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
rdp->blimit = blimit;
+ /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
+ if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) {
+ rdp->qlen_last_fqs_check = 0;
+ rdp->n_force_qs_snap = rsp->n_force_qs;
+ } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
+ rdp->qlen_last_fqs_check = rdp->qlen;
+
local_irq_restore(flags);
/* Re-raise the RCU softirq if there are callbacks remaining. */
@@ -1142,9 +1166,10 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
}
spin_unlock(&rnp->lock);
switch (signaled) {
+ case RCU_GP_IDLE:
case RCU_GP_INIT:
- break; /* grace period still initializing, ignore. */
+ break; /* grace period idle or initializing, ignore. */
case RCU_SAVE_DYNTICK:
@@ -1158,7 +1183,8 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
/* Update state, record completion counter. */
spin_lock(&rnp->lock);
- if (lastcomp == rsp->completed) {
+ if (lastcomp == rsp->completed &&
+ rsp->signaled == RCU_SAVE_DYNTICK) {
rsp->signaled = RCU_FORCE_QS;
dyntick_record_completed(rsp, lastcomp);
}
@@ -1224,7 +1250,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
}
/* If there are callbacks ready, invoke them. */
- rcu_do_batch(rdp);
+ rcu_do_batch(rsp, rdp);
}
/*
@@ -1288,10 +1314,20 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
}
- /* Force the grace period if too many callbacks or too long waiting. */
- if (unlikely(++rdp->qlen > qhimark)) {
+ /*
+ * Force the grace period if too many callbacks or too long waiting.
+ * Enforce hysteresis, and don't invoke force_quiescent_state()
+ * if some other CPU has recently done so. Also, don't bother
+ * invoking force_quiescent_state() if the newly enqueued callback
+ * is the only one waiting for a grace period to complete.
+ */
+ if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
rdp->blimit = LONG_MAX;
- force_quiescent_state(rsp, 0);
+ if (rsp->n_force_qs == rdp->n_force_qs_snap &&
+ *rdp->nxttail[RCU_DONE_TAIL] != head)
+ force_quiescent_state(rsp, 0);
+ rdp->n_force_qs_snap = rsp->n_force_qs;
+ rdp->qlen_last_fqs_check = rdp->qlen;
} else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
force_quiescent_state(rsp, 1);
local_irq_restore(flags);
@@ -1523,6 +1559,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
rdp->beenonline = 1; /* We have now been online. */
rdp->preemptable = preemptable;
rdp->passed_quiesc_completed = lastcomp - 1;
+ rdp->qlen_last_fqs_check = 0;
+ rdp->n_force_qs_snap = rsp->n_force_qs;
rdp->blimit = blimit;
spin_unlock(&rnp->lock); /* irqs remain disabled. */
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index b40ac570604..1899023b096 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -167,6 +167,10 @@ struct rcu_data {
struct rcu_head *nxtlist;
struct rcu_head **nxttail[RCU_NEXT_SIZE];
long qlen; /* # of queued callbacks */
+ long qlen_last_fqs_check;
+ /* qlen at last check for QS forcing */
+ unsigned long n_force_qs_snap;
+ /* did other CPU force QS recently? */
long blimit; /* Upper limit on a processed batch */
#ifdef CONFIG_NO_HZ
@@ -197,9 +201,10 @@ struct rcu_data {
};
/* Values for signaled field in struct rcu_state. */
-#define RCU_GP_INIT 0 /* Grace period being initialized. */
-#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */
-#define RCU_FORCE_QS 2 /* Need to force quiescent state. */
+#define RCU_GP_IDLE 0 /* No grace period in progress. */
+#define RCU_GP_INIT 1 /* Grace period being initialized. */
+#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
+#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
#ifdef CONFIG_NO_HZ
#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
#else /* #ifdef CONFIG_NO_HZ */
@@ -302,9 +307,9 @@ static void rcu_print_task_stall(struct rcu_node *rnp);
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
-static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
- struct rcu_node *rnp,
- struct rcu_data *rdp);
+static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
+ struct rcu_node *rnp,
+ struct rcu_data *rdp);
static void rcu_preempt_offline_cpu(int cpu);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_preempt_check_callbacks(int cpu);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c0cb783aa16..ef2a58c2b9d 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -304,21 +304,25 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
* parent is to remove the need for rcu_read_unlock_special() to
* make more than two attempts to acquire the target rcu_node's lock.
*
+ * Returns 1 if there was previously a task blocking the current grace
+ * period on the specified rcu_node structure.
+ *
* The caller must hold rnp->lock with irqs disabled.
*/
-static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
- struct rcu_node *rnp,
- struct rcu_data *rdp)
+static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
+ struct rcu_node *rnp,
+ struct rcu_data *rdp)
{
int i;
struct list_head *lp;
struct list_head *lp_root;
+ int retval = rcu_preempted_readers(rnp);
struct rcu_node *rnp_root = rcu_get_root(rsp);
struct task_struct *tp;
if (rnp == rnp_root) {
WARN_ONCE(1, "Last CPU thought to be offlined?");
- return; /* Shouldn't happen: at least one CPU online. */
+ return 0; /* Shouldn't happen: at least one CPU online. */
}
WARN_ON_ONCE(rnp != rdp->mynode &&
(!list_empty(&rnp->blocked_tasks[0]) ||
@@ -342,6 +346,8 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
spin_unlock(&rnp_root->lock); /* irqs remain disabled */
}
}
+
+ return retval;
}
/*
@@ -393,6 +399,17 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
EXPORT_SYMBOL_GPL(call_rcu);
/*
+ * Wait for an rcu-preempt grace period. We are supposed to expedite the
+ * grace period, but this is the crude slow compatability hack, so just
+ * invoke synchronize_rcu().
+ */
+void synchronize_rcu_expedited(void)
+{
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+/*
* Check to see if there is any immediate preemptable-RCU-related work
* to be done.
*/
@@ -521,12 +538,15 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
/*
* Because preemptable RCU does not exist, it never needs to migrate
- * tasks that were blocked within RCU read-side critical sections.
+ * tasks that were blocked within RCU read-side critical sections, and
+ * such non-existent tasks cannot possibly have been blocking the current
+ * grace period.
*/
-static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
- struct rcu_node *rnp,
- struct rcu_data *rdp)
+static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
+ struct rcu_node *rnp,
+ struct rcu_data *rdp)
{
+ return 0;
}
/*
@@ -565,6 +585,16 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
EXPORT_SYMBOL_GPL(call_rcu);
/*
+ * Wait for an rcu-preempt grace period, but make it happen quickly.
+ * But because preemptable RCU does not exist, map to rcu-sched.
+ */
+void synchronize_rcu_expedited(void)
+{
+ synchronize_sched_expedited();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+/*
* Because preemptable RCU does not exist, it never has any work to do.
*/
static int rcu_preempt_pending(int cpu)
diff --git a/kernel/sched.c b/kernel/sched.c
index e88689522e6..3c11ae0a948 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -309,6 +309,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
*/
static DEFINE_SPINLOCK(task_group_lock);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
#ifdef CONFIG_SMP
static int root_task_group_empty(void)
{
@@ -316,7 +318,6 @@ static int root_task_group_empty(void)
}
#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_USER_SCHED
# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
#else /* !CONFIG_USER_SCHED */
@@ -1564,11 +1565,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
#ifdef CONFIG_FAIR_GROUP_SCHED
-struct update_shares_data {
- unsigned long rq_weight[NR_CPUS];
-};
-
-static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
+static __read_mostly unsigned long *update_shares_data;
static void __set_se_shares(struct sched_entity *se, unsigned long shares);
@@ -1578,12 +1575,12 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
static void update_group_shares_cpu(struct task_group *tg, int cpu,
unsigned long sd_shares,
unsigned long sd_rq_weight,
- struct update_shares_data *usd)
+ unsigned long *usd_rq_weight)
{
unsigned long shares, rq_weight;
int boost = 0;
- rq_weight = usd->rq_weight[cpu];
+ rq_weight = usd_rq_weight[cpu];
if (!rq_weight) {
boost = 1;
rq_weight = NICE_0_LOAD;
@@ -1618,7 +1615,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
static int tg_shares_up(struct task_group *tg, void *data)
{
unsigned long weight, rq_weight = 0, shares = 0;
- struct update_shares_data *usd;
+ unsigned long *usd_rq_weight;
struct sched_domain *sd = data;
unsigned long flags;
int i;
@@ -1627,11 +1624,11 @@ static int tg_shares_up(struct task_group *tg, void *data)
return 0;
local_irq_save(flags);
- usd = &__get_cpu_var(update_shares_data);
+ usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
for_each_cpu(i, sched_domain_span(sd)) {
weight = tg->cfs_rq[i]->load.weight;
- usd->rq_weight[i] = weight;
+ usd_rq_weight[i] = weight;
/*
* If there are currently no tasks on the cpu pretend there
@@ -1652,7 +1649,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
shares = tg->shares;
for_each_cpu(i, sched_domain_span(sd))
- update_group_shares_cpu(tg, i, shares, rq_weight, usd);
+ update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
local_irq_restore(flags);
@@ -1996,6 +1993,38 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
p->sched_class->prio_changed(rq, p, oldprio, running);
}
+/**
+ * kthread_bind - bind a just-created kthread to a cpu.
+ * @p: thread created by kthread_create().
+ * @cpu: cpu (might not be online, must be possible) for @k to run on.
+ *
+ * Description: This function is equivalent to set_cpus_allowed(),
+ * except that @cpu doesn't need to be online, and the thread must be
+ * stopped (i.e., just returned from kthread_create()).
+ *
+ * Function lives here instead of kthread.c because it messes with
+ * scheduler internals which require locking.
+ */
+void kthread_bind(struct task_struct *p, unsigned int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ /* Must have done schedule() in kthread() before we set_task_cpu */
+ if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
+ WARN_ON(1);
+ return;
+ }
+
+ spin_lock_irqsave(&rq->lock, flags);
+ set_task_cpu(p, cpu);
+ p->cpus_allowed = cpumask_of_cpu(cpu);
+ p->rt.nr_cpus_allowed = 1;
+ p->flags |= PF_THREAD_BOUND;
+ spin_unlock_irqrestore(&rq->lock, flags);
+}
+EXPORT_SYMBOL(kthread_bind);
+
#ifdef CONFIG_SMP
/*
* Is this task likely cache-hot:
@@ -2008,7 +2037,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
/*
* Buddy candidates are cache hot:
*/
- if (sched_feat(CACHE_HOT_BUDDY) &&
+ if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
(&p->se == cfs_rq_of(&p->se)->next ||
&p->se == cfs_rq_of(&p->se)->last))
return 1;
@@ -9407,6 +9436,10 @@ void __init sched_init(void)
#endif /* CONFIG_USER_SCHED */
#endif /* CONFIG_GROUP_SCHED */
+#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
+ update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
+ __alignof__(unsigned long));
+#endif
for_each_possible_cpu(i) {
struct rq *rq;
@@ -9532,13 +9565,13 @@ void __init sched_init(void)
current->sched_class = &fair_sched_class;
/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
- alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
+ zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ
- alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
+ zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
#endif
- alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
+ zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
#endif /* SMP */
perf_event_init();
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 4e777b47eed..37087a7fac2 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -822,6 +822,26 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
* re-elected due to buddy favours.
*/
clear_buddies(cfs_rq, curr);
+ return;
+ }
+
+ /*
+ * Ensure that a task that missed wakeup preemption by a
+ * narrow margin doesn't have to wait for a full slice.
+ * This also mitigates buddy induced latencies under load.
+ */
+ if (!sched_feat(WAKEUP_PREEMPT))
+ return;
+
+ if (delta_exec < sysctl_sched_min_granularity)
+ return;
+
+ if (cfs_rq->nr_running > 1) {
+ struct sched_entity *se = __pick_next_entity(cfs_rq);
+ s64 delta = curr->vruntime - se->vruntime;
+
+ if (delta > ideal_runtime)
+ resched_task(rq_of(cfs_rq)->curr);
}
}
@@ -861,12 +881,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
struct sched_entity *se = __pick_next_entity(cfs_rq);
+ struct sched_entity *left = se;
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
- return cfs_rq->next;
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+ se = cfs_rq->next;
- if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
- return cfs_rq->last;
+ /*
+ * Prefer last buddy, try to return the CPU to a preempted task.
+ */
+ if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
+ se = cfs_rq->last;
+
+ clear_buddies(cfs_rq, se);
return se;
}
@@ -1568,6 +1594,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
struct sched_entity *se = &curr->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
int sync = wake_flags & WF_SYNC;
+ int scale = cfs_rq->nr_running >= sched_nr_latency;
update_curr(cfs_rq);
@@ -1582,18 +1609,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
if (unlikely(se == pse))
return;
- /*
- * Only set the backward buddy when the current task is still on the
- * rq. This can happen when a wakeup gets interleaved with schedule on
- * the ->pre_schedule() or idle_balance() point, either of which can
- * drop the rq lock.
- *
- * Also, during early boot the idle thread is in the fair class, for
- * obvious reasons its a bad idea to schedule back to the idle thread.
- */
- if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
- set_last_buddy(se);
- if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
+ if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
set_next_buddy(pse);
/*
@@ -1639,8 +1655,22 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
BUG_ON(!pse);
- if (wakeup_preempt_entity(se, pse) == 1)
+ if (wakeup_preempt_entity(se, pse) == 1) {
resched_task(curr);
+ /*
+ * Only set the backward buddy when the current task is still
+ * on the rq. This can happen when a wakeup gets interleaved
+ * with schedule on the ->pre_schedule() or idle_balance()
+ * point, either of which can * drop the rq lock.
+ *
+ * Also, during early boot the idle thread is in the fair class,
+ * for obvious reasons its a bad idea to schedule back to it.
+ */
+ if (unlikely(!se->on_rq || curr == rq->idle))
+ return;
+ if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
+ set_last_buddy(se);
+ }
}
static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1654,16 +1684,6 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
do {
se = pick_next_entity(cfs_rq);
- /*
- * If se was a buddy, clear it so that it will have to earn
- * the favour again.
- *
- * If se was not a buddy, clear the buddies because neither
- * was elegible to run, let them earn it again.
- *
- * IOW. unconditionally clear buddies.
- */
- __clear_buddies(cfs_rq, NULL);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
diff --git a/kernel/signal.c b/kernel/signal.c
index 6705320784f..93e72e5feae 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -27,7 +27,8 @@
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <linux/nsproxy.h>
-#include <trace/events/sched.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/signal.h>
#include <asm/param.h>
#include <asm/uaccess.h>
@@ -834,7 +835,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
struct sigqueue *q;
int override_rlimit;
- trace_sched_signal_send(sig, t);
+ trace_signal_generate(sig, info, t);
assert_spin_locked(&t->sighand->siglock);
@@ -896,12 +897,21 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
break;
}
} else if (!is_si_special(info)) {
- if (sig >= SIGRTMIN && info->si_code != SI_USER)
- /*
- * Queue overflow, abort. We may abort if the signal was rt
- * and sent by user using something other than kill().
- */
+ if (sig >= SIGRTMIN && info->si_code != SI_USER) {
+ /*
+ * Queue overflow, abort. We may abort if the
+ * signal was rt and sent by user using something
+ * other than kill().
+ */
+ trace_signal_overflow_fail(sig, group, info);
return -EAGAIN;
+ } else {
+ /*
+ * This is a silent loss of information. We still
+ * send the signal, but the *info bits are lost.
+ */
+ trace_signal_lose_info(sig, group, info);
+ }
}
out_set:
@@ -1839,6 +1849,9 @@ relock:
ka = &sighand->action[signr-1];
}
+ /* Trace actually delivered signals. */
+ trace_signal_deliver(signr, info, ka);
+
if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
continue;
if (ka->sa.sa_handler != SIG_DFL) {
diff --git a/kernel/sys.c b/kernel/sys.c
index 255475d163e..ce17760d9c5 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1110,6 +1110,8 @@ SYSCALL_DEFINE0(setsid)
err = session;
out:
write_unlock_irq(&tasklist_lock);
+ if (err > 0)
+ proc_sid_connector(group_leader);
return err;
}
@@ -1546,24 +1548,37 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
if (arg4 | arg5)
return -EINVAL;
switch (arg2) {
- case 0:
+ case PR_MCE_KILL_CLEAR:
if (arg3 != 0)
return -EINVAL;
current->flags &= ~PF_MCE_PROCESS;
break;
- case 1:
+ case PR_MCE_KILL_SET:
current->flags |= PF_MCE_PROCESS;
- if (arg3 != 0)
+ if (arg3 == PR_MCE_KILL_EARLY)
current->flags |= PF_MCE_EARLY;
- else
+ else if (arg3 == PR_MCE_KILL_LATE)
current->flags &= ~PF_MCE_EARLY;
+ else if (arg3 == PR_MCE_KILL_DEFAULT)
+ current->flags &=
+ ~(PF_MCE_EARLY|PF_MCE_PROCESS);
+ else
+ return -EINVAL;
break;
default:
return -EINVAL;
}
error = 0;
break;
-
+ case PR_MCE_KILL_GET:
+ if (arg2 | arg3 | arg4 | arg5)
+ return -EINVAL;
+ if (current->flags & PF_MCE_PROCESS)
+ error = (current->flags & PF_MCE_EARLY) ?
+ PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
+ else
+ error = PR_MCE_KILL_DEFAULT;
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index b38423ca711..b6e7aaea460 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1521,7 +1521,7 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
if (!table->ctl_name && table->strategy)
set_fail(&fail, table, "Strategy without ctl_name");
#endif
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
if (table->procname && !table->proc_handler)
set_fail(&fail, table, "No proc_handler");
#endif
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index b416512ad17..d006554888d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -339,6 +339,27 @@ config POWER_TRACER
power management decisions, specifically the C-state and P-state
behavior.
+config KSYM_TRACER
+ bool "Trace read and write access on kernel memory locations"
+ depends on HAVE_HW_BREAKPOINT
+ select TRACING
+ help
+ This tracer helps find read and write operations on any given kernel
+ symbol i.e. /proc/kallsyms.
+
+config PROFILE_KSYM_TRACER
+ bool "Profile all kernel memory accesses on 'watched' variables"
+ depends on KSYM_TRACER
+ help
+ This tracer profiles kernel accesses on variables watched through the
+ ksym tracer ftrace plugin. Depending upon the hardware, all read
+ and write operations on kernel variables can be monitored for
+ accesses.
+
+ The results will be displayed in:
+ /debugfs/tracing/profile_ksym
+
+ Say N if unsure.
config STACK_TRACER
bool "Trace max stack"
@@ -428,6 +449,23 @@ config BLK_DEV_IO_TRACE
If unsure, say N.
+config KPROBE_EVENT
+ depends on KPROBES
+ depends on X86
+ bool "Enable kprobes-based dynamic events"
+ select TRACING
+ default y
+ help
+ This allows the user to add tracing events (similar to tracepoints) on the fly
+ via the ftrace interface. See Documentation/trace/kprobetrace.txt
+ for more details.
+
+ Those events can be inserted wherever kprobes can probe, and record
+ various register and memory values.
+
+ This option is also required by perf-probe subcommand of perf tools. If
+ you want to use perf tools, this option is strongly recommended.
+
config DYNAMIC_FTRACE
bool "enable/disable ftrace tracepoints dynamically"
depends on FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 26f03ac07c2..cd9ecd89ec7 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,8 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o
obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
+obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
+obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
obj-$(CONFIG_EVENT_TRACING) += power-traces.o
libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 37ba67e3326..7cb6f192259 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -60,6 +60,13 @@ static int last_ftrace_enabled;
/* Quick disabling of function tracer. */
int function_trace_stop;
+/* List for set_ftrace_pid's pids. */
+LIST_HEAD(ftrace_pids);
+struct ftrace_pid {
+ struct list_head list;
+ struct pid *pid;
+};
+
/*
* ftrace_disabled is set when an anomaly is discovered.
* ftrace_disabled is much stronger than ftrace_enabled.
@@ -78,6 +85,10 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
+#endif
+
static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
{
struct ftrace_ops *op = ftrace_list;
@@ -155,7 +166,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
else
func = ftrace_list_func;
- if (ftrace_pid_trace) {
+ if (!list_empty(&ftrace_pids)) {
set_ftrace_pid_function(func);
func = ftrace_pid_func;
}
@@ -203,7 +214,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
if (ftrace_list->next == &ftrace_list_end) {
ftrace_func_t func = ftrace_list->func;
- if (ftrace_pid_trace) {
+ if (!list_empty(&ftrace_pids)) {
set_ftrace_pid_function(func);
func = ftrace_pid_func;
}
@@ -231,7 +242,7 @@ static void ftrace_update_pid_func(void)
func = __ftrace_trace_function;
#endif
- if (ftrace_pid_trace) {
+ if (!list_empty(&ftrace_pids)) {
set_ftrace_pid_function(func);
func = ftrace_pid_func;
} else {
@@ -740,7 +751,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
out:
mutex_unlock(&ftrace_profile_lock);
- filp->f_pos += cnt;
+ *ppos += cnt;
return cnt;
}
@@ -821,8 +832,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
}
#endif /* CONFIG_FUNCTION_PROFILER */
-/* set when tracing only a pid */
-struct pid *ftrace_pid_trace;
static struct pid * const ftrace_swapper_pid = &init_struct_pid;
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -1261,12 +1270,34 @@ static int ftrace_update_code(struct module *mod)
ftrace_new_addrs = p->newlist;
p->flags = 0L;
- /* convert record (i.e, patch mcount-call with NOP) */
- if (ftrace_code_disable(mod, p)) {
- p->flags |= FTRACE_FL_CONVERTED;
- ftrace_update_cnt++;
- } else
+ /*
+ * Do the initial record convertion from mcount jump
+ * to the NOP instructions.
+ */
+ if (!ftrace_code_disable(mod, p)) {
ftrace_free_rec(p);
+ continue;
+ }
+
+ p->flags |= FTRACE_FL_CONVERTED;
+ ftrace_update_cnt++;
+
+ /*
+ * If the tracing is enabled, go ahead and enable the record.
+ *
+ * The reason not to enable the record immediatelly is the
+ * inherent check of ftrace_make_nop/ftrace_make_call for
+ * correct previous instructions. Making first the NOP
+ * conversion puts the module to the correct state, thus
+ * passing the ftrace_make_call check.
+ */
+ if (ftrace_start_up) {
+ int failed = __ftrace_replace_code(p, 1);
+ if (failed) {
+ ftrace_bug(failed, p->ip);
+ ftrace_free_rec(p);
+ }
+ }
}
stop = ftrace_now(raw_smp_processor_id());
@@ -1656,60 +1687,6 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
return ret;
}
-enum {
- MATCH_FULL,
- MATCH_FRONT_ONLY,
- MATCH_MIDDLE_ONLY,
- MATCH_END_ONLY,
-};
-
-/*
- * (static function - no need for kernel doc)
- *
- * Pass in a buffer containing a glob and this function will
- * set search to point to the search part of the buffer and
- * return the type of search it is (see enum above).
- * This does modify buff.
- *
- * Returns enum type.
- * search returns the pointer to use for comparison.
- * not returns 1 if buff started with a '!'
- * 0 otherwise.
- */
-static int
-ftrace_setup_glob(char *buff, int len, char **search, int *not)
-{
- int type = MATCH_FULL;
- int i;
-
- if (buff[0] == '!') {
- *not = 1;
- buff++;
- len--;
- } else
- *not = 0;
-
- *search = buff;
-
- for (i = 0; i < len; i++) {
- if (buff[i] == '*') {
- if (!i) {
- *search = buff + 1;
- type = MATCH_END_ONLY;
- } else {
- if (type == MATCH_END_ONLY)
- type = MATCH_MIDDLE_ONLY;
- else
- type = MATCH_FRONT_ONLY;
- buff[i] = 0;
- break;
- }
- }
- }
-
- return type;
-}
-
static int ftrace_match(char *str, char *regex, int len, int type)
{
int matched = 0;
@@ -1758,7 +1735,7 @@ static void ftrace_match_records(char *buff, int len, int enable)
int not;
flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
- type = ftrace_setup_glob(buff, len, &search, &not);
+ type = filter_parse_regex(buff, len, &search, &not);
search_len = strlen(search);
@@ -1826,7 +1803,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
}
if (strlen(buff)) {
- type = ftrace_setup_glob(buff, strlen(buff), &search, &not);
+ type = filter_parse_regex(buff, strlen(buff), &search, &not);
search_len = strlen(search);
}
@@ -1991,7 +1968,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
int count = 0;
char *search;
- type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
+ type = filter_parse_regex(glob, strlen(glob), &search, &not);
len = strlen(search);
/* we do not support '!' for function probes */
@@ -2068,7 +2045,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
else if (glob) {
int not;
- type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
+ type = filter_parse_regex(glob, strlen(glob), &search, &not);
len = strlen(search);
/* we do not support '!' for function probes */
@@ -2222,15 +2199,15 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
ret = ftrace_process_regex(parser->buffer,
parser->idx, enable);
if (ret)
- goto out;
+ goto out_unlock;
trace_parser_clear(parser);
}
ret = read;
-
+out_unlock:
mutex_unlock(&ftrace_regex_lock);
-out:
+
return ret;
}
@@ -2297,6 +2274,7 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset)
#define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE
static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
+static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
static int __init set_ftrace_notrace(char *str)
{
@@ -2312,6 +2290,31 @@ static int __init set_ftrace_filter(char *str)
}
__setup("ftrace_filter=", set_ftrace_filter);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int __init set_graph_function(char *str)
+{
+ strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
+ return 1;
+}
+__setup("ftrace_graph_filter=", set_graph_function);
+
+static void __init set_ftrace_early_graph(char *buf)
+{
+ int ret;
+ char *func;
+
+ while (buf) {
+ func = strsep(&buf, ",");
+ /* we allow only one expression at a time */
+ ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
+ func);
+ if (ret)
+ printk(KERN_DEBUG "ftrace: function %s not "
+ "traceable\n", func);
+ }
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
static void __init set_ftrace_early_filter(char *buf, int enable)
{
char *func;
@@ -2328,6 +2331,10 @@ static void __init set_ftrace_early_filters(void)
set_ftrace_early_filter(ftrace_filter_buf, 1);
if (ftrace_notrace_buf[0])
set_ftrace_early_filter(ftrace_notrace_buf, 0);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ if (ftrace_graph_buf[0])
+ set_ftrace_early_graph(ftrace_graph_buf);
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
}
static int
@@ -2513,7 +2520,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
return -ENODEV;
/* decode regex */
- type = ftrace_setup_glob(buffer, strlen(buffer), &search, &not);
+ type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
if (not)
return -EINVAL;
@@ -2624,7 +2631,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
return 0;
}
-static int ftrace_convert_nops(struct module *mod,
+static int ftrace_process_locs(struct module *mod,
unsigned long *start,
unsigned long *end)
{
@@ -2684,7 +2691,7 @@ static void ftrace_init_module(struct module *mod,
{
if (ftrace_disabled || start == end)
return;
- ftrace_convert_nops(mod, start, end);
+ ftrace_process_locs(mod, start, end);
}
static int ftrace_module_notify(struct notifier_block *self,
@@ -2745,7 +2752,7 @@ void __init ftrace_init(void)
last_ftrace_enabled = ftrace_enabled = 1;
- ret = ftrace_convert_nops(NULL,
+ ret = ftrace_process_locs(NULL,
__start_mcount_loc,
__stop_mcount_loc);
@@ -2778,23 +2785,6 @@ static inline void ftrace_startup_enable(int command) { }
# define ftrace_shutdown_sysctl() do { } while (0)
#endif /* CONFIG_DYNAMIC_FTRACE */
-static ssize_t
-ftrace_pid_read(struct file *file, char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- char buf[64];
- int r;
-
- if (ftrace_pid_trace == ftrace_swapper_pid)
- r = sprintf(buf, "swapper tasks\n");
- else if (ftrace_pid_trace)
- r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace));
- else
- r = sprintf(buf, "no pid\n");
-
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-}
-
static void clear_ftrace_swapper(void)
{
struct task_struct *p;
@@ -2845,14 +2835,12 @@ static void set_ftrace_pid(struct pid *pid)
rcu_read_unlock();
}
-static void clear_ftrace_pid_task(struct pid **pid)
+static void clear_ftrace_pid_task(struct pid *pid)
{
- if (*pid == ftrace_swapper_pid)
+ if (pid == ftrace_swapper_pid)
clear_ftrace_swapper();
else
- clear_ftrace_pid(*pid);
-
- *pid = NULL;
+ clear_ftrace_pid(pid);
}
static void set_ftrace_pid_task(struct pid *pid)
@@ -2863,11 +2851,140 @@ static void set_ftrace_pid_task(struct pid *pid)
set_ftrace_pid(pid);
}
+static int ftrace_pid_add(int p)
+{
+ struct pid *pid;
+ struct ftrace_pid *fpid;
+ int ret = -EINVAL;
+
+ mutex_lock(&ftrace_lock);
+
+ if (!p)
+ pid = ftrace_swapper_pid;
+ else
+ pid = find_get_pid(p);
+
+ if (!pid)
+ goto out;
+
+ ret = 0;
+
+ list_for_each_entry(fpid, &ftrace_pids, list)
+ if (fpid->pid == pid)
+ goto out_put;
+
+ ret = -ENOMEM;
+
+ fpid = kmalloc(sizeof(*fpid), GFP_KERNEL);
+ if (!fpid)
+ goto out_put;
+
+ list_add(&fpid->list, &ftrace_pids);
+ fpid->pid = pid;
+
+ set_ftrace_pid_task(pid);
+
+ ftrace_update_pid_func();
+ ftrace_startup_enable(0);
+
+ mutex_unlock(&ftrace_lock);
+ return 0;
+
+out_put:
+ if (pid != ftrace_swapper_pid)
+ put_pid(pid);
+
+out:
+ mutex_unlock(&ftrace_lock);
+ return ret;
+}
+
+static void ftrace_pid_reset(void)
+{
+ struct ftrace_pid *fpid, *safe;
+
+ mutex_lock(&ftrace_lock);
+ list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) {
+ struct pid *pid = fpid->pid;
+
+ clear_ftrace_pid_task(pid);
+
+ list_del(&fpid->list);
+ kfree(fpid);
+ }
+
+ ftrace_update_pid_func();
+ ftrace_startup_enable(0);
+
+ mutex_unlock(&ftrace_lock);
+}
+
+static void *fpid_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&ftrace_lock);
+
+ if (list_empty(&ftrace_pids) && (!*pos))
+ return (void *) 1;
+
+ return seq_list_start(&ftrace_pids, *pos);
+}
+
+static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ if (v == (void *)1)
+ return NULL;
+
+ return seq_list_next(v, &ftrace_pids, pos);
+}
+
+static void fpid_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&ftrace_lock);
+}
+
+static int fpid_show(struct seq_file *m, void *v)
+{
+ const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
+
+ if (v == (void *)1) {
+ seq_printf(m, "no pid\n");
+ return 0;
+ }
+
+ if (fpid->pid == ftrace_swapper_pid)
+ seq_printf(m, "swapper tasks\n");
+ else
+ seq_printf(m, "%u\n", pid_vnr(fpid->pid));
+
+ return 0;
+}
+
+static const struct seq_operations ftrace_pid_sops = {
+ .start = fpid_start,
+ .next = fpid_next,
+ .stop = fpid_stop,
+ .show = fpid_show,
+};
+
+static int
+ftrace_pid_open(struct inode *inode, struct file *file)
+{
+ int ret = 0;
+
+ if ((file->f_mode & FMODE_WRITE) &&
+ (file->f_flags & O_TRUNC))
+ ftrace_pid_reset();
+
+ if (file->f_mode & FMODE_READ)
+ ret = seq_open(file, &ftrace_pid_sops);
+
+ return ret;
+}
+
static ssize_t
ftrace_pid_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- struct pid *pid;
char buf[64];
long val;
int ret;
@@ -2880,57 +2997,38 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
buf[cnt] = 0;
+ /*
+ * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid"
+ * to clean the filter quietly.
+ */
+ strstrip(buf);
+ if (strlen(buf) == 0)
+ return 1;
+
ret = strict_strtol(buf, 10, &val);
if (ret < 0)
return ret;
- mutex_lock(&ftrace_lock);
- if (val < 0) {
- /* disable pid tracing */
- if (!ftrace_pid_trace)
- goto out;
-
- clear_ftrace_pid_task(&ftrace_pid_trace);
-
- } else {
- /* swapper task is special */
- if (!val) {
- pid = ftrace_swapper_pid;
- if (pid == ftrace_pid_trace)
- goto out;
- } else {
- pid = find_get_pid(val);
-
- if (pid == ftrace_pid_trace) {
- put_pid(pid);
- goto out;
- }
- }
-
- if (ftrace_pid_trace)
- clear_ftrace_pid_task(&ftrace_pid_trace);
+ ret = ftrace_pid_add(val);
- if (!pid)
- goto out;
-
- ftrace_pid_trace = pid;
-
- set_ftrace_pid_task(ftrace_pid_trace);
- }
-
- /* update the function call */
- ftrace_update_pid_func();
- ftrace_startup_enable(0);
+ return ret ? ret : cnt;
+}
- out:
- mutex_unlock(&ftrace_lock);
+static int
+ftrace_pid_release(struct inode *inode, struct file *file)
+{
+ if (file->f_mode & FMODE_READ)
+ seq_release(inode, file);
- return cnt;
+ return 0;
}
static const struct file_operations ftrace_pid_fops = {
- .read = ftrace_pid_read,
- .write = ftrace_pid_write,
+ .open = ftrace_pid_open,
+ .write = ftrace_pid_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = ftrace_pid_release,
};
static __init int ftrace_init_debugfs(void)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e43c928356e..db223fe8887 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -486,7 +486,7 @@ struct ring_buffer_iter {
/* Up this if you want to test the TIME_EXTENTS and normalization */
#define DEBUG_SHIFT 0
-static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu)
+static inline u64 rb_time_stamp(struct ring_buffer *buffer)
{
/* shift to debug/test normalization and TIME_EXTENTS */
return buffer->clock() << DEBUG_SHIFT;
@@ -497,7 +497,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
u64 time;
preempt_disable_notrace();
- time = rb_time_stamp(buffer, cpu);
+ time = rb_time_stamp(buffer);
preempt_enable_no_resched_notrace();
return time;
@@ -602,7 +602,7 @@ static struct list_head *rb_list_head(struct list_head *list)
}
/*
- * rb_is_head_page - test if the give page is the head page
+ * rb_is_head_page - test if the given page is the head page
*
* Because the reader may move the head_page pointer, we can
* not trust what the head page is (it may be pointing to
@@ -1196,6 +1196,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
atomic_inc(&cpu_buffer->record_disabled);
synchronize_sched();
+ spin_lock_irq(&cpu_buffer->reader_lock);
rb_head_page_deactivate(cpu_buffer);
for (i = 0; i < nr_pages; i++) {
@@ -1210,6 +1211,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
return;
rb_reset_cpu(cpu_buffer);
+ spin_unlock_irq(&cpu_buffer->reader_lock);
rb_check_pages(cpu_buffer);
@@ -1871,7 +1873,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
* Nested commits always have zero deltas, so
* just reread the time stamp
*/
- *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
+ *ts = rb_time_stamp(buffer);
next_page->page->time_stamp = *ts;
}
@@ -2114,7 +2116,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
goto out_fail;
- ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
+ ts = rb_time_stamp(cpu_buffer->buffer);
/*
* Only the first commit can update the timestamp.
@@ -2684,7 +2686,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
EXPORT_SYMBOL_GPL(ring_buffer_entries);
/**
- * ring_buffer_overrun_cpu - get the number of overruns in buffer
+ * ring_buffer_overruns - get the number of overruns in buffer
* @buffer: The ring buffer
*
* Returns the total number of overruns in the ring buffer
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c820b0310a1..9d3067a62d4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -129,7 +129,7 @@ static int tracing_set_tracer(const char *buf);
static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
static char *default_bootup_tracer;
-static int __init set_ftrace(char *str)
+static int __init set_cmdline_ftrace(char *str)
{
strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
default_bootup_tracer = bootup_tracer_buf;
@@ -137,7 +137,7 @@ static int __init set_ftrace(char *str)
ring_buffer_expanded = 1;
return 1;
}
-__setup("ftrace=", set_ftrace);
+__setup("ftrace=", set_cmdline_ftrace);
static int __init set_ftrace_dump_on_oops(char *str)
{
@@ -2440,7 +2440,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
return ret;
}
- filp->f_pos += cnt;
+ *ppos += cnt;
return cnt;
}
@@ -2582,7 +2582,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
}
mutex_unlock(&trace_types_lock);
- filp->f_pos += cnt;
+ *ppos += cnt;
return cnt;
}
@@ -2764,7 +2764,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
if (err)
return err;
- filp->f_pos += ret;
+ *ppos += ret;
return ret;
}
@@ -3299,7 +3299,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
}
}
- filp->f_pos += cnt;
+ *ppos += cnt;
/* If check pages failed, return ENOMEM */
if (tracing_disabled)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 405cb850b75..1d7f4830a80 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,6 +11,7 @@
#include <linux/ftrace.h>
#include <trace/boot.h>
#include <linux/kmemtrace.h>
+#include <linux/hw_breakpoint.h>
#include <linux/trace_seq.h>
#include <linux/ftrace_event.h>
@@ -37,6 +38,7 @@ enum trace_type {
TRACE_KMEM_ALLOC,
TRACE_KMEM_FREE,
TRACE_BLK,
+ TRACE_KSYM,
__TRACE_LAST_TYPE,
};
@@ -98,9 +100,32 @@ struct syscall_trace_enter {
struct syscall_trace_exit {
struct trace_entry ent;
int nr;
- unsigned long ret;
+ long ret;
};
+struct kprobe_trace_entry {
+ struct trace_entry ent;
+ unsigned long ip;
+ int nargs;
+ unsigned long args[];
+};
+
+#define SIZEOF_KPROBE_TRACE_ENTRY(n) \
+ (offsetof(struct kprobe_trace_entry, args) + \
+ (sizeof(unsigned long) * (n)))
+
+struct kretprobe_trace_entry {
+ struct trace_entry ent;
+ unsigned long func;
+ unsigned long ret_ip;
+ int nargs;
+ unsigned long args[];
+};
+
+#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \
+ (offsetof(struct kretprobe_trace_entry, args) + \
+ (sizeof(unsigned long) * (n)))
+
/*
* trace_flag_type is an enumeration that holds different
* states when a trace occurs. These are:
@@ -209,6 +234,7 @@ extern void __ftrace_bad_type(void);
TRACE_KMEM_ALLOC); \
IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
TRACE_KMEM_FREE); \
+ IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
__ftrace_bad_type(); \
} while (0)
@@ -364,6 +390,8 @@ int register_tracer(struct tracer *type);
void unregister_tracer(struct tracer *type);
int is_tracing_stopped(void);
+extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
+
extern unsigned long nsecs_to_usecs(unsigned long nsecs);
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -438,6 +466,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
struct trace_array *tr);
extern int trace_selftest_startup_hw_branches(struct tracer *trace,
struct trace_array *tr);
+extern int trace_selftest_startup_ksym(struct tracer *trace,
+ struct trace_array *tr);
#endif /* CONFIG_FTRACE_STARTUP_TEST */
extern void *head_page(struct trace_array_cpu *data);
@@ -483,10 +513,6 @@ static inline int ftrace_graph_addr(unsigned long addr)
return 0;
}
#else
-static inline int ftrace_trace_addr(unsigned long addr)
-{
- return 1;
-}
static inline int ftrace_graph_addr(unsigned long addr)
{
return 1;
@@ -500,12 +526,12 @@ print_graph_function(struct trace_iterator *iter)
}
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-extern struct pid *ftrace_pid_trace;
+extern struct list_head ftrace_pids;
#ifdef CONFIG_FUNCTION_TRACER
static inline int ftrace_trace_task(struct task_struct *task)
{
- if (!ftrace_pid_trace)
+ if (list_empty(&ftrace_pids))
return 1;
return test_tsk_trace_trace(task);
@@ -687,7 +713,6 @@ struct event_filter {
int n_preds;
struct filter_pred **preds;
char *filter_string;
- bool no_reset;
};
struct event_subsystem {
@@ -699,22 +724,40 @@ struct event_subsystem {
};
struct filter_pred;
+struct regex;
typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
int val1, int val2);
+typedef int (*regex_match_func)(char *str, struct regex *r, int len);
+
+enum regex_type {
+ MATCH_FULL = 0,
+ MATCH_FRONT_ONLY,
+ MATCH_MIDDLE_ONLY,
+ MATCH_END_ONLY,
+};
+
+struct regex {
+ char pattern[MAX_FILTER_STR_VAL];
+ int len;
+ int field_len;
+ regex_match_func match;
+};
+
struct filter_pred {
- filter_pred_fn_t fn;
- u64 val;
- char str_val[MAX_FILTER_STR_VAL];
- int str_len;
- char *field_name;
- int offset;
- int not;
- int op;
- int pop_n;
+ filter_pred_fn_t fn;
+ u64 val;
+ struct regex regex;
+ char *field_name;
+ int offset;
+ int not;
+ int op;
+ int pop_n;
};
+extern enum regex_type
+filter_parse_regex(char *buff, int len, char **search, int *not);
extern void print_event_filter(struct ftrace_event_call *call,
struct trace_seq *s);
extern int apply_event_filter(struct ftrace_event_call *call,
@@ -730,7 +773,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
struct ring_buffer *buffer,
struct ring_buffer_event *event)
{
- if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) {
+ if (unlikely(call->filter_active) &&
+ !filter_match_preds(call->filter, rec)) {
ring_buffer_discard_commit(buffer, event);
return 1;
}
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index ead3d724599..c16a08f399d 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -364,3 +364,19 @@ FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
F_printk("type:%u call_site:%lx ptr:%p",
__entry->type_id, __entry->call_site, __entry->ptr)
);
+
+FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
+
+ TRACE_KSYM,
+
+ F_STRUCT(
+ __field( unsigned long, ip )
+ __field( unsigned char, type )
+ __array( char , cmd, TASK_COMM_LEN )
+ __field( unsigned long, addr )
+ ),
+
+ F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
+ (void *)__entry->ip, (unsigned int)__entry->type,
+ (void *)__entry->addr, __entry->cmd)
+);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 8d5c171cc99..d9c60f80aa0 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -8,17 +8,14 @@
#include <linux/module.h>
#include "trace.h"
-/*
- * We can't use a size but a type in alloc_percpu()
- * So let's create a dummy type that matches the desired size
- */
-typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
-char *trace_profile_buf;
-EXPORT_SYMBOL_GPL(trace_profile_buf);
+char *perf_trace_buf;
+EXPORT_SYMBOL_GPL(perf_trace_buf);
+
+char *perf_trace_buf_nmi;
+EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
-char *trace_profile_buf_nmi;
-EXPORT_SYMBOL_GPL(trace_profile_buf_nmi);
+typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
/* Count the events in use (per event id, not per instance) */
static int total_profile_count;
@@ -32,20 +29,20 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
return 0;
if (!total_profile_count) {
- buf = (char *)alloc_percpu(profile_buf_t);
+ buf = (char *)alloc_percpu(perf_trace_t);
if (!buf)
goto fail_buf;
- rcu_assign_pointer(trace_profile_buf, buf);
+ rcu_assign_pointer(perf_trace_buf, buf);
- buf = (char *)alloc_percpu(profile_buf_t);
+ buf = (char *)alloc_percpu(perf_trace_t);
if (!buf)
goto fail_buf_nmi;
- rcu_assign_pointer(trace_profile_buf_nmi, buf);
+ rcu_assign_pointer(perf_trace_buf_nmi, buf);
}
- ret = event->profile_enable();
+ ret = event->profile_enable(event);
if (!ret) {
total_profile_count++;
return 0;
@@ -53,10 +50,10 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
fail_buf_nmi:
if (!total_profile_count) {
- free_percpu(trace_profile_buf_nmi);
- free_percpu(trace_profile_buf);
- trace_profile_buf_nmi = NULL;
- trace_profile_buf = NULL;
+ free_percpu(perf_trace_buf_nmi);
+ free_percpu(perf_trace_buf);
+ perf_trace_buf_nmi = NULL;
+ perf_trace_buf = NULL;
}
fail_buf:
atomic_dec(&event->profile_count);
@@ -89,14 +86,14 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event)
if (!atomic_add_negative(-1, &event->profile_count))
return;
- event->profile_disable();
+ event->profile_disable(event);
if (!--total_profile_count) {
- buf = trace_profile_buf;
- rcu_assign_pointer(trace_profile_buf, NULL);
+ buf = perf_trace_buf;
+ rcu_assign_pointer(perf_trace_buf, NULL);
- nmi_buf = trace_profile_buf_nmi;
- rcu_assign_pointer(trace_profile_buf_nmi, NULL);
+ nmi_buf = perf_trace_buf_nmi;
+ rcu_assign_pointer(perf_trace_buf_nmi, NULL);
/*
* Ensure every events in profiling have finished before
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index cf3cabf6ce1..1d18315dc83 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -93,9 +93,7 @@ int trace_define_common_fields(struct ftrace_event_call *call)
}
EXPORT_SYMBOL_GPL(trace_define_common_fields);
-#ifdef CONFIG_MODULES
-
-static void trace_destroy_fields(struct ftrace_event_call *call)
+void trace_destroy_fields(struct ftrace_event_call *call)
{
struct ftrace_event_field *field, *next;
@@ -107,8 +105,6 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
}
}
-#endif /* CONFIG_MODULES */
-
static void ftrace_event_enable_disable(struct ftrace_event_call *call,
int enable)
{
@@ -117,14 +113,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
if (call->enabled) {
call->enabled = 0;
tracing_stop_cmdline_record();
- call->unregfunc(call->data);
+ call->unregfunc(call);
}
break;
case 1:
if (!call->enabled) {
call->enabled = 1;
tracing_start_cmdline_record();
- call->regfunc(call->data);
+ call->regfunc(call);
}
break;
}
@@ -878,9 +874,9 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
"'%s/filter' entry\n", name);
}
- entry = trace_create_file("enable", 0644, system->entry,
- (void *)system->name,
- &ftrace_system_enable_fops);
+ trace_create_file("enable", 0644, system->entry,
+ (void *)system->name,
+ &ftrace_system_enable_fops);
return system->entry;
}
@@ -892,7 +888,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
const struct file_operations *filter,
const struct file_operations *format)
{
- struct dentry *entry;
int ret;
/*
@@ -910,12 +905,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
}
if (call->regfunc)
- entry = trace_create_file("enable", 0644, call->dir, call,
- enable);
+ trace_create_file("enable", 0644, call->dir, call,
+ enable);
if (call->id && call->profile_enable)
- entry = trace_create_file("id", 0444, call->dir, call,
- id);
+ trace_create_file("id", 0444, call->dir, call,
+ id);
if (call->define_fields) {
ret = call->define_fields(call);
@@ -924,41 +919,60 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
" events/%s\n", call->name);
return ret;
}
- entry = trace_create_file("filter", 0644, call->dir, call,
- filter);
+ trace_create_file("filter", 0644, call->dir, call,
+ filter);
}
/* A trace may not want to export its format */
if (!call->show_format)
return 0;
- entry = trace_create_file("format", 0444, call->dir, call,
- format);
+ trace_create_file("format", 0444, call->dir, call,
+ format);
return 0;
}
-#define for_each_event(event, start, end) \
- for (event = start; \
- (unsigned long)event < (unsigned long)end; \
- event++)
+static int __trace_add_event_call(struct ftrace_event_call *call)
+{
+ struct dentry *d_events;
+ int ret;
-#ifdef CONFIG_MODULES
+ if (!call->name)
+ return -EINVAL;
-static LIST_HEAD(ftrace_module_file_list);
+ if (call->raw_init) {
+ ret = call->raw_init(call);
+ if (ret < 0) {
+ if (ret != -ENOSYS)
+ pr_warning("Could not initialize trace "
+ "events/%s\n", call->name);
+ return ret;
+ }
+ }
-/*
- * Modules must own their file_operations to keep up with
- * reference counting.
- */
-struct ftrace_module_file_ops {
- struct list_head list;
- struct module *mod;
- struct file_operations id;
- struct file_operations enable;
- struct file_operations format;
- struct file_operations filter;
-};
+ d_events = event_trace_events_dir();
+ if (!d_events)
+ return -ENOENT;
+
+ ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
+ &ftrace_enable_fops, &ftrace_event_filter_fops,
+ &ftrace_event_format_fops);
+ if (!ret)
+ list_add(&call->list, &ftrace_events);
+
+ return ret;
+}
+
+/* Add an additional event_call dynamically */
+int trace_add_event_call(struct ftrace_event_call *call)
+{
+ int ret;
+ mutex_lock(&event_mutex);
+ ret = __trace_add_event_call(call);
+ mutex_unlock(&event_mutex);
+ return ret;
+}
static void remove_subsystem_dir(const char *name)
{
@@ -986,6 +1000,53 @@ static void remove_subsystem_dir(const char *name)
}
}
+/*
+ * Must be called under locking both of event_mutex and trace_event_mutex.
+ */
+static void __trace_remove_event_call(struct ftrace_event_call *call)
+{
+ ftrace_event_enable_disable(call, 0);
+ if (call->event)
+ __unregister_ftrace_event(call->event);
+ debugfs_remove_recursive(call->dir);
+ list_del(&call->list);
+ trace_destroy_fields(call);
+ destroy_preds(call);
+ remove_subsystem_dir(call->system);
+}
+
+/* Remove an event_call */
+void trace_remove_event_call(struct ftrace_event_call *call)
+{
+ mutex_lock(&event_mutex);
+ down_write(&trace_event_mutex);
+ __trace_remove_event_call(call);
+ up_write(&trace_event_mutex);
+ mutex_unlock(&event_mutex);
+}
+
+#define for_each_event(event, start, end) \
+ for (event = start; \
+ (unsigned long)event < (unsigned long)end; \
+ event++)
+
+#ifdef CONFIG_MODULES
+
+static LIST_HEAD(ftrace_module_file_list);
+
+/*
+ * Modules must own their file_operations to keep up with
+ * reference counting.
+ */
+struct ftrace_module_file_ops {
+ struct list_head list;
+ struct module *mod;
+ struct file_operations id;
+ struct file_operations enable;
+ struct file_operations format;
+ struct file_operations filter;
+};
+
static struct ftrace_module_file_ops *
trace_create_file_ops(struct module *mod)
{
@@ -1043,7 +1104,7 @@ static void trace_module_add_events(struct module *mod)
if (!call->name)
continue;
if (call->raw_init) {
- ret = call->raw_init();
+ ret = call->raw_init(call);
if (ret < 0) {
if (ret != -ENOSYS)
pr_warning("Could not initialize trace "
@@ -1061,10 +1122,11 @@ static void trace_module_add_events(struct module *mod)
return;
}
call->mod = mod;
- list_add(&call->list, &ftrace_events);
- event_create_dir(call, d_events,
- &file_ops->id, &file_ops->enable,
- &file_ops->filter, &file_ops->format);
+ ret = event_create_dir(call, d_events,
+ &file_ops->id, &file_ops->enable,
+ &file_ops->filter, &file_ops->format);
+ if (!ret)
+ list_add(&call->list, &ftrace_events);
}
}
@@ -1078,14 +1140,7 @@ static void trace_module_remove_events(struct module *mod)
list_for_each_entry_safe(call, p, &ftrace_events, list) {
if (call->mod == mod) {
found = true;
- ftrace_event_enable_disable(call, 0);
- if (call->event)
- __unregister_ftrace_event(call->event);
- debugfs_remove_recursive(call->dir);
- list_del(&call->list);
- trace_destroy_fields(call);
- destroy_preds(call);
- remove_subsystem_dir(call->system);
+ __trace_remove_event_call(call);
}
}
@@ -1203,7 +1258,7 @@ static __init int event_trace_init(void)
if (!call->name)
continue;
if (call->raw_init) {
- ret = call->raw_init();
+ ret = call->raw_init(call);
if (ret < 0) {
if (ret != -ENOSYS)
pr_warning("Could not initialize trace "
@@ -1211,10 +1266,12 @@ static __init int event_trace_init(void)
continue;
}
}
- list_add(&call->list, &ftrace_events);
- event_create_dir(call, d_events, &ftrace_event_id_fops,
- &ftrace_enable_fops, &ftrace_event_filter_fops,
- &ftrace_event_format_fops);
+ ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
+ &ftrace_enable_fops,
+ &ftrace_event_filter_fops,
+ &ftrace_event_format_fops);
+ if (!ret)
+ list_add(&call->list, &ftrace_events);
}
while (true) {
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 98a6cc5c64e..50504cb228d 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -18,11 +18,10 @@
* Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
*/
-#include <linux/debugfs.h>
-#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
+#include <linux/perf_event.h>
#include "trace.h"
#include "trace_output.h"
@@ -31,6 +30,7 @@ enum filter_op_ids
{
OP_OR,
OP_AND,
+ OP_GLOB,
OP_NE,
OP_EQ,
OP_LT,
@@ -48,16 +48,17 @@ struct filter_op {
};
static struct filter_op filter_ops[] = {
- { OP_OR, "||", 1 },
- { OP_AND, "&&", 2 },
- { OP_NE, "!=", 4 },
- { OP_EQ, "==", 4 },
- { OP_LT, "<", 5 },
- { OP_LE, "<=", 5 },
- { OP_GT, ">", 5 },
- { OP_GE, ">=", 5 },
- { OP_NONE, "OP_NONE", 0 },
- { OP_OPEN_PAREN, "(", 0 },
+ { OP_OR, "||", 1 },
+ { OP_AND, "&&", 2 },
+ { OP_GLOB, "~", 4 },
+ { OP_NE, "!=", 4 },
+ { OP_EQ, "==", 4 },
+ { OP_LT, "<", 5 },
+ { OP_LE, "<=", 5 },
+ { OP_GT, ">", 5 },
+ { OP_GE, ">=", 5 },
+ { OP_NONE, "OP_NONE", 0 },
+ { OP_OPEN_PAREN, "(", 0 },
};
enum {
@@ -197,9 +198,9 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
char *addr = (char *)(event + pred->offset);
int cmp, match;
- cmp = strncmp(addr, pred->str_val, pred->str_len);
+ cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len);
- match = (!cmp) ^ pred->not;
+ match = cmp ^ pred->not;
return match;
}
@@ -211,9 +212,9 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
char **addr = (char **)(event + pred->offset);
int cmp, match;
- cmp = strncmp(*addr, pred->str_val, pred->str_len);
+ cmp = pred->regex.match(*addr, &pred->regex, pred->regex.field_len);
- match = (!cmp) ^ pred->not;
+ match = cmp ^ pred->not;
return match;
}
@@ -237,9 +238,9 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,
char *addr = (char *)(event + str_loc);
int cmp, match;
- cmp = strncmp(addr, pred->str_val, str_len);
+ cmp = pred->regex.match(addr, &pred->regex, str_len);
- match = (!cmp) ^ pred->not;
+ match = cmp ^ pred->not;
return match;
}
@@ -250,10 +251,121 @@ static int filter_pred_none(struct filter_pred *pred, void *event,
return 0;
}
+/* Basic regex callbacks */
+static int regex_match_full(char *str, struct regex *r, int len)
+{
+ if (strncmp(str, r->pattern, len) == 0)
+ return 1;
+ return 0;
+}
+
+static int regex_match_front(char *str, struct regex *r, int len)
+{
+ if (strncmp(str, r->pattern, len) == 0)
+ return 1;
+ return 0;
+}
+
+static int regex_match_middle(char *str, struct regex *r, int len)
+{
+ if (strstr(str, r->pattern))
+ return 1;
+ return 0;
+}
+
+static int regex_match_end(char *str, struct regex *r, int len)
+{
+ char *ptr = strstr(str, r->pattern);
+
+ if (ptr && (ptr[r->len] == 0))
+ return 1;
+ return 0;
+}
+
+/**
+ * filter_parse_regex - parse a basic regex
+ * @buff: the raw regex
+ * @len: length of the regex
+ * @search: will point to the beginning of the string to compare
+ * @not: tell whether the match will have to be inverted
+ *
+ * This passes in a buffer containing a regex and this function will
+ * set search to point to the search part of the buffer and
+ * return the type of search it is (see enum above).
+ * This does modify buff.
+ *
+ * Returns enum type.
+ * search returns the pointer to use for comparison.
+ * not returns 1 if buff started with a '!'
+ * 0 otherwise.
+ */
+enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
+{
+ int type = MATCH_FULL;
+ int i;
+
+ if (buff[0] == '!') {
+ *not = 1;
+ buff++;
+ len--;
+ } else
+ *not = 0;
+
+ *search = buff;
+
+ for (i = 0; i < len; i++) {
+ if (buff[i] == '*') {
+ if (!i) {
+ *search = buff + 1;
+ type = MATCH_END_ONLY;
+ } else {
+ if (type == MATCH_END_ONLY)
+ type = MATCH_MIDDLE_ONLY;
+ else
+ type = MATCH_FRONT_ONLY;
+ buff[i] = 0;
+ break;
+ }
+ }
+ }
+
+ return type;
+}
+
+static void filter_build_regex(struct filter_pred *pred)
+{
+ struct regex *r = &pred->regex;
+ char *search;
+ enum regex_type type = MATCH_FULL;
+ int not = 0;
+
+ if (pred->op == OP_GLOB) {
+ type = filter_parse_regex(r->pattern, r->len, &search, &not);
+ r->len = strlen(search);
+ memmove(r->pattern, search, r->len+1);
+ }
+
+ switch (type) {
+ case MATCH_FULL:
+ r->match = regex_match_full;
+ break;
+ case MATCH_FRONT_ONLY:
+ r->match = regex_match_front;
+ break;
+ case MATCH_MIDDLE_ONLY:
+ r->match = regex_match_middle;
+ break;
+ case MATCH_END_ONLY:
+ r->match = regex_match_end;
+ break;
+ }
+
+ pred->not ^= not;
+}
+
/* return 1 if event matches, 0 otherwise (discard) */
-int filter_match_preds(struct ftrace_event_call *call, void *rec)
+int filter_match_preds(struct event_filter *filter, void *rec)
{
- struct event_filter *filter = call->filter;
int match, top = 0, val1 = 0, val2 = 0;
int stack[MAX_FILTER_PRED];
struct filter_pred *pred;
@@ -396,7 +508,7 @@ static void filter_clear_pred(struct filter_pred *pred)
{
kfree(pred->field_name);
pred->field_name = NULL;
- pred->str_len = 0;
+ pred->regex.len = 0;
}
static int filter_set_pred(struct filter_pred *dest,
@@ -426,9 +538,8 @@ static void filter_disable_preds(struct ftrace_event_call *call)
filter->preds[i]->fn = filter_pred_none;
}
-void destroy_preds(struct ftrace_event_call *call)
+static void __free_preds(struct event_filter *filter)
{
- struct event_filter *filter = call->filter;
int i;
if (!filter)
@@ -441,21 +552,24 @@ void destroy_preds(struct ftrace_event_call *call)
kfree(filter->preds);
kfree(filter->filter_string);
kfree(filter);
+}
+
+void destroy_preds(struct ftrace_event_call *call)
+{
+ __free_preds(call->filter);
call->filter = NULL;
+ call->filter_active = 0;
}
-static int init_preds(struct ftrace_event_call *call)
+static struct event_filter *__alloc_preds(void)
{
struct event_filter *filter;
struct filter_pred *pred;
int i;
- if (call->filter)
- return 0;
-
- filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
- if (!call->filter)
- return -ENOMEM;
+ filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+ if (!filter)
+ return ERR_PTR(-ENOMEM);
filter->n_preds = 0;
@@ -471,12 +585,24 @@ static int init_preds(struct ftrace_event_call *call)
filter->preds[i] = pred;
}
- return 0;
+ return filter;
oom:
- destroy_preds(call);
+ __free_preds(filter);
+ return ERR_PTR(-ENOMEM);
+}
- return -ENOMEM;
+static int init_preds(struct ftrace_event_call *call)
+{
+ if (call->filter)
+ return 0;
+
+ call->filter_active = 0;
+ call->filter = __alloc_preds();
+ if (IS_ERR(call->filter))
+ return PTR_ERR(call->filter);
+
+ return 0;
}
static int init_subsystem_preds(struct event_subsystem *system)
@@ -499,14 +625,7 @@ static int init_subsystem_preds(struct event_subsystem *system)
return 0;
}
-enum {
- FILTER_DISABLE_ALL,
- FILTER_INIT_NO_RESET,
- FILTER_SKIP_NO_RESET,
-};
-
-static void filter_free_subsystem_preds(struct event_subsystem *system,
- int flag)
+static void filter_free_subsystem_preds(struct event_subsystem *system)
{
struct ftrace_event_call *call;
@@ -517,14 +636,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
if (strcmp(call->system, system->name) != 0)
continue;
- if (flag == FILTER_INIT_NO_RESET) {
- call->filter->no_reset = false;
- continue;
- }
-
- if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
- continue;
-
filter_disable_preds(call);
remove_filter_string(call->filter);
}
@@ -532,10 +643,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
static int filter_add_pred_fn(struct filter_parse_state *ps,
struct ftrace_event_call *call,
+ struct event_filter *filter,
struct filter_pred *pred,
filter_pred_fn_t fn)
{
- struct event_filter *filter = call->filter;
int idx, err;
if (filter->n_preds == MAX_FILTER_PRED) {
@@ -550,7 +661,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
return err;
filter->n_preds++;
- call->filter_active = 1;
return 0;
}
@@ -575,7 +685,10 @@ static bool is_string_field(struct ftrace_event_field *field)
static int is_legal_op(struct ftrace_event_field *field, int op)
{
- if (is_string_field(field) && (op != OP_EQ && op != OP_NE))
+ if (is_string_field(field) &&
+ (op != OP_EQ && op != OP_NE && op != OP_GLOB))
+ return 0;
+ if (!is_string_field(field) && op == OP_GLOB)
return 0;
return 1;
@@ -626,6 +739,7 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
static int filter_add_pred(struct filter_parse_state *ps,
struct ftrace_event_call *call,
+ struct event_filter *filter,
struct filter_pred *pred,
bool dry_run)
{
@@ -660,21 +774,22 @@ static int filter_add_pred(struct filter_parse_state *ps,
}
if (is_string_field(field)) {
- pred->str_len = field->size;
+ filter_build_regex(pred);
- if (field->filter_type == FILTER_STATIC_STRING)
+ if (field->filter_type == FILTER_STATIC_STRING) {
fn = filter_pred_string;
- else if (field->filter_type == FILTER_DYN_STRING)
+ pred->regex.field_len = field->size;
+ } else if (field->filter_type == FILTER_DYN_STRING)
fn = filter_pred_strloc;
else {
fn = filter_pred_pchar;
- pred->str_len = strlen(pred->str_val);
+ pred->regex.field_len = strlen(pred->regex.pattern);
}
} else {
if (field->is_signed)
- ret = strict_strtoll(pred->str_val, 0, &val);
+ ret = strict_strtoll(pred->regex.pattern, 0, &val);
else
- ret = strict_strtoull(pred->str_val, 0, &val);
+ ret = strict_strtoull(pred->regex.pattern, 0, &val);
if (ret) {
parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
return -EINVAL;
@@ -694,45 +809,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
add_pred_fn:
if (!dry_run)
- return filter_add_pred_fn(ps, call, pred, fn);
- return 0;
-}
-
-static int filter_add_subsystem_pred(struct filter_parse_state *ps,
- struct event_subsystem *system,
- struct filter_pred *pred,
- char *filter_string,
- bool dry_run)
-{
- struct ftrace_event_call *call;
- int err = 0;
- bool fail = true;
-
- list_for_each_entry(call, &ftrace_events, list) {
-
- if (!call->define_fields)
- continue;
-
- if (strcmp(call->system, system->name))
- continue;
-
- if (call->filter->no_reset)
- continue;
-
- err = filter_add_pred(ps, call, pred, dry_run);
- if (err)
- call->filter->no_reset = true;
- else
- fail = false;
-
- if (!dry_run)
- replace_filter_string(call->filter, filter_string);
- }
-
- if (fail) {
- parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
- return err;
- }
+ return filter_add_pred_fn(ps, call, filter, pred, fn);
return 0;
}
@@ -1045,8 +1122,8 @@ static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
return NULL;
}
- strcpy(pred->str_val, operand2);
- pred->str_len = strlen(operand2);
+ strcpy(pred->regex.pattern, operand2);
+ pred->regex.len = strlen(pred->regex.pattern);
pred->op = op;
@@ -1090,8 +1167,8 @@ static int check_preds(struct filter_parse_state *ps)
return 0;
}
-static int replace_preds(struct event_subsystem *system,
- struct ftrace_event_call *call,
+static int replace_preds(struct ftrace_event_call *call,
+ struct event_filter *filter,
struct filter_parse_state *ps,
char *filter_string,
bool dry_run)
@@ -1138,11 +1215,7 @@ static int replace_preds(struct event_subsystem *system,
add_pred:
if (!pred)
return -ENOMEM;
- if (call)
- err = filter_add_pred(ps, call, pred, false);
- else
- err = filter_add_subsystem_pred(ps, system, pred,
- filter_string, dry_run);
+ err = filter_add_pred(ps, call, filter, pred, dry_run);
filter_free_pred(pred);
if (err)
return err;
@@ -1153,10 +1226,50 @@ add_pred:
return 0;
}
-int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
+static int replace_system_preds(struct event_subsystem *system,
+ struct filter_parse_state *ps,
+ char *filter_string)
{
+ struct ftrace_event_call *call;
+ bool fail = true;
int err;
+ list_for_each_entry(call, &ftrace_events, list) {
+ struct event_filter *filter = call->filter;
+
+ if (!call->define_fields)
+ continue;
+
+ if (strcmp(call->system, system->name) != 0)
+ continue;
+
+ /* try to see if the filter can be applied */
+ err = replace_preds(call, filter, ps, filter_string, true);
+ if (err)
+ continue;
+
+ /* really apply the filter */
+ filter_disable_preds(call);
+ err = replace_preds(call, filter, ps, filter_string, false);
+ if (err)
+ filter_disable_preds(call);
+ else {
+ call->filter_active = 1;
+ replace_filter_string(filter, filter_string);
+ }
+ fail = false;
+ }
+
+ if (fail) {
+ parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
+{
+ int err;
struct filter_parse_state *ps;
mutex_lock(&event_mutex);
@@ -1168,8 +1281,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
if (!strcmp(strstrip(filter_string), "0")) {
filter_disable_preds(call);
remove_filter_string(call->filter);
- mutex_unlock(&event_mutex);
- return 0;
+ goto out_unlock;
}
err = -ENOMEM;
@@ -1187,10 +1299,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
goto out;
}
- err = replace_preds(NULL, call, ps, filter_string, false);
+ err = replace_preds(call, call->filter, ps, filter_string, false);
if (err)
append_filter_err(ps, call->filter);
-
+ else
+ call->filter_active = 1;
out:
filter_opstack_clear(ps);
postfix_clear(ps);
@@ -1205,7 +1318,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
char *filter_string)
{
int err;
-
struct filter_parse_state *ps;
mutex_lock(&event_mutex);
@@ -1215,10 +1327,9 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
goto out_unlock;
if (!strcmp(strstrip(filter_string), "0")) {
- filter_free_subsystem_preds(system, FILTER_DISABLE_ALL);
+ filter_free_subsystem_preds(system);
remove_filter_string(system->filter);
- mutex_unlock(&event_mutex);
- return 0;
+ goto out_unlock;
}
err = -ENOMEM;
@@ -1235,31 +1346,87 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
goto out;
}
- filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET);
-
- /* try to see the filter can be applied to which events */
- err = replace_preds(system, NULL, ps, filter_string, true);
- if (err) {
+ err = replace_system_preds(system, ps, filter_string);
+ if (err)
append_filter_err(ps, system->filter);
- goto out;
+
+out:
+ filter_opstack_clear(ps);
+ postfix_clear(ps);
+ kfree(ps);
+out_unlock:
+ mutex_unlock(&event_mutex);
+
+ return err;
+}
+
+#ifdef CONFIG_EVENT_PROFILE
+
+void ftrace_profile_free_filter(struct perf_event *event)
+{
+ struct event_filter *filter = event->filter;
+
+ event->filter = NULL;
+ __free_preds(filter);
+}
+
+int ftrace_profile_set_filter(struct perf_event *event, int event_id,
+ char *filter_str)
+{
+ int err;
+ struct event_filter *filter;
+ struct filter_parse_state *ps;
+ struct ftrace_event_call *call = NULL;
+
+ mutex_lock(&event_mutex);
+
+ list_for_each_entry(call, &ftrace_events, list) {
+ if (call->id == event_id)
+ break;
}
- filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET);
+ err = -EINVAL;
+ if (!call)
+ goto out_unlock;
- /* really apply the filter to the events */
- err = replace_preds(system, NULL, ps, filter_string, false);
- if (err) {
- append_filter_err(ps, system->filter);
- filter_free_subsystem_preds(system, 2);
+ err = -EEXIST;
+ if (event->filter)
+ goto out_unlock;
+
+ filter = __alloc_preds();
+ if (IS_ERR(filter)) {
+ err = PTR_ERR(filter);
+ goto out_unlock;
}
-out:
+ err = -ENOMEM;
+ ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+ if (!ps)
+ goto free_preds;
+
+ parse_init(ps, filter_ops, filter_str);
+ err = filter_parse(ps);
+ if (err)
+ goto free_ps;
+
+ err = replace_preds(call, filter, ps, filter_str, false);
+ if (!err)
+ event->filter = filter;
+
+free_ps:
filter_opstack_clear(ps);
postfix_clear(ps);
kfree(ps);
+
+free_preds:
+ if (err)
+ __free_preds(filter);
+
out_unlock:
mutex_unlock(&event_mutex);
return err;
}
+#endif /* CONFIG_EVENT_PROFILE */
+
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 31da218ee10..934d81fb4ca 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -134,7 +134,6 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
#include "trace_entries.h"
-
#undef __field
#define __field(type, item) \
ret = trace_define_field(event_call, #type, #item, \
@@ -196,6 +195,11 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
#include "trace_entries.h"
+static int ftrace_raw_init_event(struct ftrace_event_call *call)
+{
+ INIT_LIST_HEAD(&call->fields);
+ return 0;
+}
#undef __field
#define __field(type, item)
@@ -214,7 +218,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
#undef FTRACE_ENTRY
#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
-static int ftrace_raw_init_event_##call(void); \
\
struct ftrace_event_call __used \
__attribute__((__aligned__(4))) \
@@ -222,14 +225,9 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
.name = #call, \
.id = type, \
.system = __stringify(TRACE_SYSTEM), \
- .raw_init = ftrace_raw_init_event_##call, \
+ .raw_init = ftrace_raw_init_event, \
.show_format = ftrace_format_##call, \
.define_fields = ftrace_define_fields_##call, \
}; \
-static int ftrace_raw_init_event_##call(void) \
-{ \
- INIT_LIST_HEAD(&event_##call.fields); \
- return 0; \
-} \
#include "trace_entries.h"
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
new file mode 100644
index 00000000000..aff5f80b59b
--- /dev/null
+++ b/kernel/trace/trace_kprobe.c
@@ -0,0 +1,1523 @@
+/*
+ * Kprobes-based tracing events
+ *
+ * Created by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/kprobes.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/debugfs.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/ptrace.h>
+#include <linux/perf_event.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+#define MAX_TRACE_ARGS 128
+#define MAX_ARGSTR_LEN 63
+#define MAX_EVENT_NAME_LEN 64
+#define KPROBE_EVENT_SYSTEM "kprobes"
+
+/* Reserved field names */
+#define FIELD_STRING_IP "__probe_ip"
+#define FIELD_STRING_NARGS "__probe_nargs"
+#define FIELD_STRING_RETIP "__probe_ret_ip"
+#define FIELD_STRING_FUNC "__probe_func"
+
+const char *reserved_field_names[] = {
+ "common_type",
+ "common_flags",
+ "common_preempt_count",
+ "common_pid",
+ "common_tgid",
+ "common_lock_depth",
+ FIELD_STRING_IP,
+ FIELD_STRING_NARGS,
+ FIELD_STRING_RETIP,
+ FIELD_STRING_FUNC,
+};
+
+struct fetch_func {
+ unsigned long (*func)(struct pt_regs *, void *);
+ void *data;
+};
+
+static __kprobes unsigned long call_fetch(struct fetch_func *f,
+ struct pt_regs *regs)
+{
+ return f->func(regs, f->data);
+}
+
+/* fetch handlers */
+static __kprobes unsigned long fetch_register(struct pt_regs *regs,
+ void *offset)
+{
+ return regs_get_register(regs, (unsigned int)((unsigned long)offset));
+}
+
+static __kprobes unsigned long fetch_stack(struct pt_regs *regs,
+ void *num)
+{
+ return regs_get_kernel_stack_nth(regs,
+ (unsigned int)((unsigned long)num));
+}
+
+static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
+{
+ unsigned long retval;
+
+ if (probe_kernel_address(addr, retval))
+ return 0;
+ return retval;
+}
+
+static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num)
+{
+ return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num));
+}
+
+static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
+ void *dummy)
+{
+ return regs_return_value(regs);
+}
+
+static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs,
+ void *dummy)
+{
+ return kernel_stack_pointer(regs);
+}
+
+/* Memory fetching by symbol */
+struct symbol_cache {
+ char *symbol;
+ long offset;
+ unsigned long addr;
+};
+
+static unsigned long update_symbol_cache(struct symbol_cache *sc)
+{
+ sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
+ if (sc->addr)
+ sc->addr += sc->offset;
+ return sc->addr;
+}
+
+static void free_symbol_cache(struct symbol_cache *sc)
+{
+ kfree(sc->symbol);
+ kfree(sc);
+}
+
+static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
+{
+ struct symbol_cache *sc;
+
+ if (!sym || strlen(sym) == 0)
+ return NULL;
+ sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
+ if (!sc)
+ return NULL;
+
+ sc->symbol = kstrdup(sym, GFP_KERNEL);
+ if (!sc->symbol) {
+ kfree(sc);
+ return NULL;
+ }
+ sc->offset = offset;
+
+ update_symbol_cache(sc);
+ return sc;
+}
+
+static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data)
+{
+ struct symbol_cache *sc = data;
+
+ if (sc->addr)
+ return fetch_memory(regs, (void *)sc->addr);
+ else
+ return 0;
+}
+
+/* Special indirect memory access interface */
+struct indirect_fetch_data {
+ struct fetch_func orig;
+ long offset;
+};
+
+static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data)
+{
+ struct indirect_fetch_data *ind = data;
+ unsigned long addr;
+
+ addr = call_fetch(&ind->orig, regs);
+ if (addr) {
+ addr += ind->offset;
+ return fetch_memory(regs, (void *)addr);
+ } else
+ return 0;
+}
+
+static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data)
+{
+ if (data->orig.func == fetch_indirect)
+ free_indirect_fetch_data(data->orig.data);
+ else if (data->orig.func == fetch_symbol)
+ free_symbol_cache(data->orig.data);
+ kfree(data);
+}
+
+/**
+ * Kprobe event core functions
+ */
+
+struct probe_arg {
+ struct fetch_func fetch;
+ const char *name;
+};
+
+/* Flags for trace_probe */
+#define TP_FLAG_TRACE 1
+#define TP_FLAG_PROFILE 2
+
+struct trace_probe {
+ struct list_head list;
+ struct kretprobe rp; /* Use rp.kp for kprobe use */
+ unsigned long nhit;
+ unsigned int flags; /* For TP_FLAG_* */
+ const char *symbol; /* symbol name */
+ struct ftrace_event_call call;
+ struct trace_event event;
+ unsigned int nr_args;
+ struct probe_arg args[];
+};
+
+#define SIZEOF_TRACE_PROBE(n) \
+ (offsetof(struct trace_probe, args) + \
+ (sizeof(struct probe_arg) * (n)))
+
+static __kprobes int probe_is_return(struct trace_probe *tp)
+{
+ return tp->rp.handler != NULL;
+}
+
+static __kprobes const char *probe_symbol(struct trace_probe *tp)
+{
+ return tp->symbol ? tp->symbol : "unknown";
+}
+
+static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
+{
+ int ret = -EINVAL;
+
+ if (ff->func == fetch_argument)
+ ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data);
+ else if (ff->func == fetch_register) {
+ const char *name;
+ name = regs_query_register_name((unsigned int)((long)ff->data));
+ ret = snprintf(buf, n, "%%%s", name);
+ } else if (ff->func == fetch_stack)
+ ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
+ else if (ff->func == fetch_memory)
+ ret = snprintf(buf, n, "@0x%p", ff->data);
+ else if (ff->func == fetch_symbol) {
+ struct symbol_cache *sc = ff->data;
+ if (sc->offset)
+ ret = snprintf(buf, n, "@%s%+ld", sc->symbol,
+ sc->offset);
+ else
+ ret = snprintf(buf, n, "@%s", sc->symbol);
+ } else if (ff->func == fetch_retvalue)
+ ret = snprintf(buf, n, "$retval");
+ else if (ff->func == fetch_stack_address)
+ ret = snprintf(buf, n, "$stack");
+ else if (ff->func == fetch_indirect) {
+ struct indirect_fetch_data *id = ff->data;
+ size_t l = 0;
+ ret = snprintf(buf, n, "%+ld(", id->offset);
+ if (ret >= n)
+ goto end;
+ l += ret;
+ ret = probe_arg_string(buf + l, n - l, &id->orig);
+ if (ret < 0)
+ goto end;
+ l += ret;
+ ret = snprintf(buf + l, n - l, ")");
+ ret += l;
+ }
+end:
+ if (ret >= n)
+ return -ENOSPC;
+ return ret;
+}
+
+static int register_probe_event(struct trace_probe *tp);
+static void unregister_probe_event(struct trace_probe *tp);
+
+static DEFINE_MUTEX(probe_lock);
+static LIST_HEAD(probe_list);
+
+static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
+static int kretprobe_dispatcher(struct kretprobe_instance *ri,
+ struct pt_regs *regs);
+
+/*
+ * Allocate new trace_probe and initialize it (including kprobes).
+ */
+static struct trace_probe *alloc_trace_probe(const char *group,
+ const char *event,
+ void *addr,
+ const char *symbol,
+ unsigned long offs,
+ int nargs, int is_return)
+{
+ struct trace_probe *tp;
+
+ tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL);
+ if (!tp)
+ return ERR_PTR(-ENOMEM);
+
+ if (symbol) {
+ tp->symbol = kstrdup(symbol, GFP_KERNEL);
+ if (!tp->symbol)
+ goto error;
+ tp->rp.kp.symbol_name = tp->symbol;
+ tp->rp.kp.offset = offs;
+ } else
+ tp->rp.kp.addr = addr;
+
+ if (is_return)
+ tp->rp.handler = kretprobe_dispatcher;
+ else
+ tp->rp.kp.pre_handler = kprobe_dispatcher;
+
+ if (!event)
+ goto error;
+ tp->call.name = kstrdup(event, GFP_KERNEL);
+ if (!tp->call.name)
+ goto error;
+
+ if (!group)
+ goto error;
+ tp->call.system = kstrdup(group, GFP_KERNEL);
+ if (!tp->call.system)
+ goto error;
+
+ INIT_LIST_HEAD(&tp->list);
+ return tp;
+error:
+ kfree(tp->call.name);
+ kfree(tp->symbol);
+ kfree(tp);
+ return ERR_PTR(-ENOMEM);
+}
+
+static void free_probe_arg(struct probe_arg *arg)
+{
+ if (arg->fetch.func == fetch_symbol)
+ free_symbol_cache(arg->fetch.data);
+ else if (arg->fetch.func == fetch_indirect)
+ free_indirect_fetch_data(arg->fetch.data);
+ kfree(arg->name);
+}
+
+static void free_trace_probe(struct trace_probe *tp)
+{
+ int i;
+
+ for (i = 0; i < tp->nr_args; i++)
+ free_probe_arg(&tp->args[i]);
+
+ kfree(tp->call.system);
+ kfree(tp->call.name);
+ kfree(tp->symbol);
+ kfree(tp);
+}
+
+static struct trace_probe *find_probe_event(const char *event,
+ const char *group)
+{
+ struct trace_probe *tp;
+
+ list_for_each_entry(tp, &probe_list, list)
+ if (strcmp(tp->call.name, event) == 0 &&
+ strcmp(tp->call.system, group) == 0)
+ return tp;
+ return NULL;
+}
+
+/* Unregister a trace_probe and probe_event: call with locking probe_lock */
+static void unregister_trace_probe(struct trace_probe *tp)
+{
+ if (probe_is_return(tp))
+ unregister_kretprobe(&tp->rp);
+ else
+ unregister_kprobe(&tp->rp.kp);
+ list_del(&tp->list);
+ unregister_probe_event(tp);
+}
+
+/* Register a trace_probe and probe_event */
+static int register_trace_probe(struct trace_probe *tp)
+{
+ struct trace_probe *old_tp;
+ int ret;
+
+ mutex_lock(&probe_lock);
+
+ /* register as an event */
+ old_tp = find_probe_event(tp->call.name, tp->call.system);
+ if (old_tp) {
+ /* delete old event */
+ unregister_trace_probe(old_tp);
+ free_trace_probe(old_tp);
+ }
+ ret = register_probe_event(tp);
+ if (ret) {
+ pr_warning("Faild to register probe event(%d)\n", ret);
+ goto end;
+ }
+
+ tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;
+ if (probe_is_return(tp))
+ ret = register_kretprobe(&tp->rp);
+ else
+ ret = register_kprobe(&tp->rp.kp);
+
+ if (ret) {
+ pr_warning("Could not insert probe(%d)\n", ret);
+ if (ret == -EILSEQ) {
+ pr_warning("Probing address(0x%p) is not an "
+ "instruction boundary.\n",
+ tp->rp.kp.addr);
+ ret = -EINVAL;
+ }
+ unregister_probe_event(tp);
+ } else
+ list_add_tail(&tp->list, &probe_list);
+end:
+ mutex_unlock(&probe_lock);
+ return ret;
+}
+
+/* Split symbol and offset. */
+static int split_symbol_offset(char *symbol, unsigned long *offset)
+{
+ char *tmp;
+ int ret;
+
+ if (!offset)
+ return -EINVAL;
+
+ tmp = strchr(symbol, '+');
+ if (tmp) {
+ /* skip sign because strict_strtol doesn't accept '+' */
+ ret = strict_strtoul(tmp + 1, 0, offset);
+ if (ret)
+ return ret;
+ *tmp = '\0';
+ } else
+ *offset = 0;
+ return 0;
+}
+
+#define PARAM_MAX_ARGS 16
+#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
+
+static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
+{
+ int ret = 0;
+ unsigned long param;
+
+ if (strcmp(arg, "retval") == 0) {
+ if (is_return) {
+ ff->func = fetch_retvalue;
+ ff->data = NULL;
+ } else
+ ret = -EINVAL;
+ } else if (strncmp(arg, "stack", 5) == 0) {
+ if (arg[5] == '\0') {
+ ff->func = fetch_stack_address;
+ ff->data = NULL;
+ } else if (isdigit(arg[5])) {
+ ret = strict_strtoul(arg + 5, 10, &param);
+ if (ret || param > PARAM_MAX_STACK)
+ ret = -EINVAL;
+ else {
+ ff->func = fetch_stack;
+ ff->data = (void *)param;
+ }
+ } else
+ ret = -EINVAL;
+ } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) {
+ ret = strict_strtoul(arg + 3, 10, &param);
+ if (ret || param > PARAM_MAX_ARGS)
+ ret = -EINVAL;
+ else {
+ ff->func = fetch_argument;
+ ff->data = (void *)param;
+ }
+ } else
+ ret = -EINVAL;
+ return ret;
+}
+
+/* Recursive argument parser */
+static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
+{
+ int ret = 0;
+ unsigned long param;
+ long offset;
+ char *tmp;
+
+ switch (arg[0]) {
+ case '$':
+ ret = parse_probe_vars(arg + 1, ff, is_return);
+ break;
+ case '%': /* named register */
+ ret = regs_query_register_offset(arg + 1);
+ if (ret >= 0) {
+ ff->func = fetch_register;
+ ff->data = (void *)(unsigned long)ret;
+ ret = 0;
+ }
+ break;
+ case '@': /* memory or symbol */
+ if (isdigit(arg[1])) {
+ ret = strict_strtoul(arg + 1, 0, &param);
+ if (ret)
+ break;
+ ff->func = fetch_memory;
+ ff->data = (void *)param;
+ } else {
+ ret = split_symbol_offset(arg + 1, &offset);
+ if (ret)
+ break;
+ ff->data = alloc_symbol_cache(arg + 1, offset);
+ if (ff->data)
+ ff->func = fetch_symbol;
+ else
+ ret = -EINVAL;
+ }
+ break;
+ case '+': /* indirect memory */
+ case '-':
+ tmp = strchr(arg, '(');
+ if (!tmp) {
+ ret = -EINVAL;
+ break;
+ }
+ *tmp = '\0';
+ ret = strict_strtol(arg + 1, 0, &offset);
+ if (ret)
+ break;
+ if (arg[0] == '-')
+ offset = -offset;
+ arg = tmp + 1;
+ tmp = strrchr(arg, ')');
+ if (tmp) {
+ struct indirect_fetch_data *id;
+ *tmp = '\0';
+ id = kzalloc(sizeof(struct indirect_fetch_data),
+ GFP_KERNEL);
+ if (!id)
+ return -ENOMEM;
+ id->offset = offset;
+ ret = __parse_probe_arg(arg, &id->orig, is_return);
+ if (ret)
+ kfree(id);
+ else {
+ ff->func = fetch_indirect;
+ ff->data = (void *)id;
+ }
+ } else
+ ret = -EINVAL;
+ break;
+ default:
+ /* TODO: support custom handler */
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+/* String length checking wrapper */
+static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
+{
+ if (strlen(arg) > MAX_ARGSTR_LEN) {
+ pr_info("Argument is too long.: %s\n", arg);
+ return -ENOSPC;
+ }
+ return __parse_probe_arg(arg, ff, is_return);
+}
+
+/* Return 1 if name is reserved or already used by another argument */
+static int conflict_field_name(const char *name,
+ struct probe_arg *args, int narg)
+{
+ int i;
+ for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
+ if (strcmp(reserved_field_names[i], name) == 0)
+ return 1;
+ for (i = 0; i < narg; i++)
+ if (strcmp(args[i].name, name) == 0)
+ return 1;
+ return 0;
+}
+
+static int create_trace_probe(int argc, char **argv)
+{
+ /*
+ * Argument syntax:
+ * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
+ * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
+ * Fetch args:
+ * $argN : fetch Nth of function argument. (N:0-)
+ * $retval : fetch return value
+ * $stack : fetch stack address
+ * $stackN : fetch Nth of stack (N:0-)
+ * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
+ * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
+ * %REG : fetch register REG
+ * Indirect memory fetch:
+ * +|-offs(ARG) : fetch memory at ARG +|- offs address.
+ * Alias name of args:
+ * NAME=FETCHARG : set NAME as alias of FETCHARG.
+ */
+ struct trace_probe *tp;
+ int i, ret = 0;
+ int is_return = 0;
+ char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL;
+ unsigned long offset = 0;
+ void *addr = NULL;
+ char buf[MAX_EVENT_NAME_LEN];
+
+ if (argc < 2) {
+ pr_info("Probe point is not specified.\n");
+ return -EINVAL;
+ }
+
+ if (argv[0][0] == 'p')
+ is_return = 0;
+ else if (argv[0][0] == 'r')
+ is_return = 1;
+ else {
+ pr_info("Probe definition must be started with 'p' or 'r'.\n");
+ return -EINVAL;
+ }
+
+ if (argv[0][1] == ':') {
+ event = &argv[0][2];
+ if (strchr(event, '/')) {
+ group = event;
+ event = strchr(group, '/') + 1;
+ event[-1] = '\0';
+ if (strlen(group) == 0) {
+ pr_info("Group name is not specifiled\n");
+ return -EINVAL;
+ }
+ }
+ if (strlen(event) == 0) {
+ pr_info("Event name is not specifiled\n");
+ return -EINVAL;
+ }
+ }
+
+ if (isdigit(argv[1][0])) {
+ if (is_return) {
+ pr_info("Return probe point must be a symbol.\n");
+ return -EINVAL;
+ }
+ /* an address specified */
+ ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr);
+ if (ret) {
+ pr_info("Failed to parse address.\n");
+ return ret;
+ }
+ } else {
+ /* a symbol specified */
+ symbol = argv[1];
+ /* TODO: support .init module functions */
+ ret = split_symbol_offset(symbol, &offset);
+ if (ret) {
+ pr_info("Failed to parse symbol.\n");
+ return ret;
+ }
+ if (offset && is_return) {
+ pr_info("Return probe must be used without offset.\n");
+ return -EINVAL;
+ }
+ }
+ argc -= 2; argv += 2;
+
+ /* setup a probe */
+ if (!group)
+ group = KPROBE_EVENT_SYSTEM;
+ if (!event) {
+ /* Make a new event name */
+ if (symbol)
+ snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld",
+ is_return ? 'r' : 'p', symbol, offset);
+ else
+ snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p",
+ is_return ? 'r' : 'p', addr);
+ event = buf;
+ }
+ tp = alloc_trace_probe(group, event, addr, symbol, offset, argc,
+ is_return);
+ if (IS_ERR(tp)) {
+ pr_info("Failed to allocate trace_probe.(%d)\n",
+ (int)PTR_ERR(tp));
+ return PTR_ERR(tp);
+ }
+
+ /* parse arguments */
+ ret = 0;
+ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ /* Parse argument name */
+ arg = strchr(argv[i], '=');
+ if (arg)
+ *arg++ = '\0';
+ else
+ arg = argv[i];
+
+ if (conflict_field_name(argv[i], tp->args, i)) {
+ pr_info("Argument%d name '%s' conflicts with "
+ "another field.\n", i, argv[i]);
+ ret = -EINVAL;
+ goto error;
+ }
+
+ tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
+ if (!tp->args[i].name) {
+ pr_info("Failed to allocate argument%d name '%s'.\n",
+ i, argv[i]);
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ /* Parse fetch argument */
+ ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return);
+ if (ret) {
+ pr_info("Parse error at argument%d. (%d)\n", i, ret);
+ kfree(tp->args[i].name);
+ goto error;
+ }
+
+ tp->nr_args++;
+ }
+
+ ret = register_trace_probe(tp);
+ if (ret)
+ goto error;
+ return 0;
+
+error:
+ free_trace_probe(tp);
+ return ret;
+}
+
+static void cleanup_all_probes(void)
+{
+ struct trace_probe *tp;
+
+ mutex_lock(&probe_lock);
+ /* TODO: Use batch unregistration */
+ while (!list_empty(&probe_list)) {
+ tp = list_entry(probe_list.next, struct trace_probe, list);
+ unregister_trace_probe(tp);
+ free_trace_probe(tp);
+ }
+ mutex_unlock(&probe_lock);
+}
+
+
+/* Probes listing interfaces */
+static void *probes_seq_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&probe_lock);
+ return seq_list_start(&probe_list, *pos);
+}
+
+static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &probe_list, pos);
+}
+
+static void probes_seq_stop(struct seq_file *m, void *v)
+{
+ mutex_unlock(&probe_lock);
+}
+
+static int probes_seq_show(struct seq_file *m, void *v)
+{
+ struct trace_probe *tp = v;
+ int i, ret;
+ char buf[MAX_ARGSTR_LEN + 1];
+
+ seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
+ seq_printf(m, ":%s/%s", tp->call.system, tp->call.name);
+
+ if (!tp->symbol)
+ seq_printf(m, " 0x%p", tp->rp.kp.addr);
+ else if (tp->rp.kp.offset)
+ seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset);
+ else
+ seq_printf(m, " %s", probe_symbol(tp));
+
+ for (i = 0; i < tp->nr_args; i++) {
+ ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch);
+ if (ret < 0) {
+ pr_warning("Argument%d decoding error(%d).\n", i, ret);
+ return ret;
+ }
+ seq_printf(m, " %s=%s", tp->args[i].name, buf);
+ }
+ seq_printf(m, "\n");
+ return 0;
+}
+
+static const struct seq_operations probes_seq_op = {
+ .start = probes_seq_start,
+ .next = probes_seq_next,
+ .stop = probes_seq_stop,
+ .show = probes_seq_show
+};
+
+static int probes_open(struct inode *inode, struct file *file)
+{
+ if ((file->f_mode & FMODE_WRITE) &&
+ (file->f_flags & O_TRUNC))
+ cleanup_all_probes();
+
+ return seq_open(file, &probes_seq_op);
+}
+
+static int command_trace_probe(const char *buf)
+{
+ char **argv;
+ int argc = 0, ret = 0;
+
+ argv = argv_split(GFP_KERNEL, buf, &argc);
+ if (!argv)
+ return -ENOMEM;
+
+ if (argc)
+ ret = create_trace_probe(argc, argv);
+
+ argv_free(argv);
+ return ret;
+}
+
+#define WRITE_BUFSIZE 128
+
+static ssize_t probes_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ char *kbuf, *tmp;
+ int ret;
+ size_t done;
+ size_t size;
+
+ kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
+ if (!kbuf)
+ return -ENOMEM;
+
+ ret = done = 0;
+ while (done < count) {
+ size = count - done;
+ if (size >= WRITE_BUFSIZE)
+ size = WRITE_BUFSIZE - 1;
+ if (copy_from_user(kbuf, buffer + done, size)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ kbuf[size] = '\0';
+ tmp = strchr(kbuf, '\n');
+ if (tmp) {
+ *tmp = '\0';
+ size = tmp - kbuf + 1;
+ } else if (done + size < count) {
+ pr_warning("Line length is too long: "
+ "Should be less than %d.", WRITE_BUFSIZE);
+ ret = -EINVAL;
+ goto out;
+ }
+ done += size;
+ /* Remove comments */
+ tmp = strchr(kbuf, '#');
+ if (tmp)
+ *tmp = '\0';
+
+ ret = command_trace_probe(kbuf);
+ if (ret)
+ goto out;
+ }
+ ret = done;
+out:
+ kfree(kbuf);
+ return ret;
+}
+
+static const struct file_operations kprobe_events_ops = {
+ .owner = THIS_MODULE,
+ .open = probes_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .write = probes_write,
+};
+
+/* Probes profiling interfaces */
+static int probes_profile_seq_show(struct seq_file *m, void *v)
+{
+ struct trace_probe *tp = v;
+
+ seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit,
+ tp->rp.kp.nmissed);
+
+ return 0;
+}
+
+static const struct seq_operations profile_seq_op = {
+ .start = probes_seq_start,
+ .next = probes_seq_next,
+ .stop = probes_seq_stop,
+ .show = probes_profile_seq_show
+};
+
+static int profile_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &profile_seq_op);
+}
+
+static const struct file_operations kprobe_profile_ops = {
+ .owner = THIS_MODULE,
+ .open = profile_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/* Kprobe handler */
+static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
+{
+ struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
+ struct kprobe_trace_entry *entry;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ int size, i, pc;
+ unsigned long irq_flags;
+ struct ftrace_event_call *call = &tp->call;
+
+ tp->nhit++;
+
+ local_save_flags(irq_flags);
+ pc = preempt_count();
+
+ size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
+
+ event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
+ irq_flags, pc);
+ if (!event)
+ return 0;
+
+ entry = ring_buffer_event_data(event);
+ entry->nargs = tp->nr_args;
+ entry->ip = (unsigned long)kp->addr;
+ for (i = 0; i < tp->nr_args; i++)
+ entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+
+ if (!filter_current_check_discard(buffer, call, entry, event))
+ trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
+ return 0;
+}
+
+/* Kretprobe handler */
+static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
+ struct pt_regs *regs)
+{
+ struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
+ struct kretprobe_trace_entry *entry;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ int size, i, pc;
+ unsigned long irq_flags;
+ struct ftrace_event_call *call = &tp->call;
+
+ local_save_flags(irq_flags);
+ pc = preempt_count();
+
+ size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
+
+ event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
+ irq_flags, pc);
+ if (!event)
+ return 0;
+
+ entry = ring_buffer_event_data(event);
+ entry->nargs = tp->nr_args;
+ entry->func = (unsigned long)tp->rp.kp.addr;
+ entry->ret_ip = (unsigned long)ri->ret_addr;
+ for (i = 0; i < tp->nr_args; i++)
+ entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+
+ if (!filter_current_check_discard(buffer, call, entry, event))
+ trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
+
+ return 0;
+}
+
+/* Event entry printers */
+enum print_line_t
+print_kprobe_event(struct trace_iterator *iter, int flags)
+{
+ struct kprobe_trace_entry *field;
+ struct trace_seq *s = &iter->seq;
+ struct trace_event *event;
+ struct trace_probe *tp;
+ int i;
+
+ field = (struct kprobe_trace_entry *)iter->ent;
+ event = ftrace_find_event(field->ent.type);
+ tp = container_of(event, struct trace_probe, event);
+
+ if (!trace_seq_printf(s, "%s: (", tp->call.name))
+ goto partial;
+
+ if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
+ goto partial;
+
+ if (!trace_seq_puts(s, ")"))
+ goto partial;
+
+ for (i = 0; i < field->nargs; i++)
+ if (!trace_seq_printf(s, " %s=%lx",
+ tp->args[i].name, field->args[i]))
+ goto partial;
+
+ if (!trace_seq_puts(s, "\n"))
+ goto partial;
+
+ return TRACE_TYPE_HANDLED;
+partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+enum print_line_t
+print_kretprobe_event(struct trace_iterator *iter, int flags)
+{
+ struct kretprobe_trace_entry *field;
+ struct trace_seq *s = &iter->seq;
+ struct trace_event *event;
+ struct trace_probe *tp;
+ int i;
+
+ field = (struct kretprobe_trace_entry *)iter->ent;
+ event = ftrace_find_event(field->ent.type);
+ tp = container_of(event, struct trace_probe, event);
+
+ if (!trace_seq_printf(s, "%s: (", tp->call.name))
+ goto partial;
+
+ if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
+ goto partial;
+
+ if (!trace_seq_puts(s, " <- "))
+ goto partial;
+
+ if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
+ goto partial;
+
+ if (!trace_seq_puts(s, ")"))
+ goto partial;
+
+ for (i = 0; i < field->nargs; i++)
+ if (!trace_seq_printf(s, " %s=%lx",
+ tp->args[i].name, field->args[i]))
+ goto partial;
+
+ if (!trace_seq_puts(s, "\n"))
+ goto partial;
+
+ return TRACE_TYPE_HANDLED;
+partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static int probe_event_enable(struct ftrace_event_call *call)
+{
+ struct trace_probe *tp = (struct trace_probe *)call->data;
+
+ tp->flags |= TP_FLAG_TRACE;
+ if (probe_is_return(tp))
+ return enable_kretprobe(&tp->rp);
+ else
+ return enable_kprobe(&tp->rp.kp);
+}
+
+static void probe_event_disable(struct ftrace_event_call *call)
+{
+ struct trace_probe *tp = (struct trace_probe *)call->data;
+
+ tp->flags &= ~TP_FLAG_TRACE;
+ if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) {
+ if (probe_is_return(tp))
+ disable_kretprobe(&tp->rp);
+ else
+ disable_kprobe(&tp->rp.kp);
+ }
+}
+
+static int probe_event_raw_init(struct ftrace_event_call *event_call)
+{
+ INIT_LIST_HEAD(&event_call->fields);
+
+ return 0;
+}
+
+#undef DEFINE_FIELD
+#define DEFINE_FIELD(type, item, name, is_signed) \
+ do { \
+ ret = trace_define_field(event_call, #type, name, \
+ offsetof(typeof(field), item), \
+ sizeof(field.item), is_signed, \
+ FILTER_OTHER); \
+ if (ret) \
+ return ret; \
+ } while (0)
+
+static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
+{
+ int ret, i;
+ struct kprobe_trace_entry field;
+ struct trace_probe *tp = (struct trace_probe *)event_call->data;
+
+ ret = trace_define_common_fields(event_call);
+ if (!ret)
+ return ret;
+
+ DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
+ DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
+ /* Set argument names as fields */
+ for (i = 0; i < tp->nr_args; i++)
+ DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
+ return 0;
+}
+
+static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
+{
+ int ret, i;
+ struct kretprobe_trace_entry field;
+ struct trace_probe *tp = (struct trace_probe *)event_call->data;
+
+ ret = trace_define_common_fields(event_call);
+ if (!ret)
+ return ret;
+
+ DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
+ DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
+ DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
+ /* Set argument names as fields */
+ for (i = 0; i < tp->nr_args; i++)
+ DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
+ return 0;
+}
+
+static int __probe_event_show_format(struct trace_seq *s,
+ struct trace_probe *tp, const char *fmt,
+ const char *arg)
+{
+ int i;
+
+ /* Show format */
+ if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
+ return 0;
+
+ for (i = 0; i < tp->nr_args; i++)
+ if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name))
+ return 0;
+
+ if (!trace_seq_printf(s, "\", %s", arg))
+ return 0;
+
+ for (i = 0; i < tp->nr_args; i++)
+ if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name))
+ return 0;
+
+ return trace_seq_puts(s, "\n");
+}
+
+#undef SHOW_FIELD
+#define SHOW_FIELD(type, item, name) \
+ do { \
+ ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \
+ "offset:%u;\tsize:%u;\n", name, \
+ (unsigned int)offsetof(typeof(field), item),\
+ (unsigned int)sizeof(type)); \
+ if (!ret) \
+ return 0; \
+ } while (0)
+
+static int kprobe_event_show_format(struct ftrace_event_call *call,
+ struct trace_seq *s)
+{
+ struct kprobe_trace_entry field __attribute__((unused));
+ int ret, i;
+ struct trace_probe *tp = (struct trace_probe *)call->data;
+
+ SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP);
+ SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
+
+ /* Show fields */
+ for (i = 0; i < tp->nr_args; i++)
+ SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
+ trace_seq_puts(s, "\n");
+
+ return __probe_event_show_format(s, tp, "(%lx)",
+ "REC->" FIELD_STRING_IP);
+}
+
+static int kretprobe_event_show_format(struct ftrace_event_call *call,
+ struct trace_seq *s)
+{
+ struct kretprobe_trace_entry field __attribute__((unused));
+ int ret, i;
+ struct trace_probe *tp = (struct trace_probe *)call->data;
+
+ SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC);
+ SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP);
+ SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
+
+ /* Show fields */
+ for (i = 0; i < tp->nr_args; i++)
+ SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
+ trace_seq_puts(s, "\n");
+
+ return __probe_event_show_format(s, tp, "(%lx <- %lx)",
+ "REC->" FIELD_STRING_FUNC
+ ", REC->" FIELD_STRING_RETIP);
+}
+
+#ifdef CONFIG_EVENT_PROFILE
+
+/* Kprobe profile handler */
+static __kprobes int kprobe_profile_func(struct kprobe *kp,
+ struct pt_regs *regs)
+{
+ struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
+ struct ftrace_event_call *call = &tp->call;
+ struct kprobe_trace_entry *entry;
+ struct trace_entry *ent;
+ int size, __size, i, pc, __cpu;
+ unsigned long irq_flags;
+ char *trace_buf;
+ char *raw_data;
+ int rctx;
+
+ pc = preempt_count();
+ __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
+ size = ALIGN(__size + sizeof(u32), sizeof(u64));
+ size -= sizeof(u32);
+ if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+ "profile buffer not large enough"))
+ return 0;
+
+ /*
+ * Protect the non nmi buffer
+ * This also protects the rcu read side
+ */
+ local_irq_save(irq_flags);
+
+ rctx = perf_swevent_get_recursion_context();
+ if (rctx < 0)
+ goto end_recursion;
+
+ __cpu = smp_processor_id();
+
+ if (in_nmi())
+ trace_buf = rcu_dereference(perf_trace_buf_nmi);
+ else
+ trace_buf = rcu_dereference(perf_trace_buf);
+
+ if (!trace_buf)
+ goto end;
+
+ raw_data = per_cpu_ptr(trace_buf, __cpu);
+
+ /* Zero dead bytes from alignment to avoid buffer leak to userspace */
+ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+ entry = (struct kprobe_trace_entry *)raw_data;
+ ent = &entry->ent;
+
+ tracing_generic_entry_update(ent, irq_flags, pc);
+ ent->type = call->id;
+ entry->nargs = tp->nr_args;
+ entry->ip = (unsigned long)kp->addr;
+ for (i = 0; i < tp->nr_args; i++)
+ entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+ perf_tp_event(call->id, entry->ip, 1, entry, size);
+
+end:
+ perf_swevent_put_recursion_context(rctx);
+end_recursion:
+ local_irq_restore(irq_flags);
+
+ return 0;
+}
+
+/* Kretprobe profile handler */
+static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
+ struct pt_regs *regs)
+{
+ struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
+ struct ftrace_event_call *call = &tp->call;
+ struct kretprobe_trace_entry *entry;
+ struct trace_entry *ent;
+ int size, __size, i, pc, __cpu;
+ unsigned long irq_flags;
+ char *trace_buf;
+ char *raw_data;
+ int rctx;
+
+ pc = preempt_count();
+ __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
+ size = ALIGN(__size + sizeof(u32), sizeof(u64));
+ size -= sizeof(u32);
+ if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+ "profile buffer not large enough"))
+ return 0;
+
+ /*
+ * Protect the non nmi buffer
+ * This also protects the rcu read side
+ */
+ local_irq_save(irq_flags);
+
+ rctx = perf_swevent_get_recursion_context();
+ if (rctx < 0)
+ goto end_recursion;
+
+ __cpu = smp_processor_id();
+
+ if (in_nmi())
+ trace_buf = rcu_dereference(perf_trace_buf_nmi);
+ else
+ trace_buf = rcu_dereference(perf_trace_buf);
+
+ if (!trace_buf)
+ goto end;
+
+ raw_data = per_cpu_ptr(trace_buf, __cpu);
+
+ /* Zero dead bytes from alignment to avoid buffer leak to userspace */
+ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+ entry = (struct kretprobe_trace_entry *)raw_data;
+ ent = &entry->ent;
+
+ tracing_generic_entry_update(ent, irq_flags, pc);
+ ent->type = call->id;
+ entry->nargs = tp->nr_args;
+ entry->func = (unsigned long)tp->rp.kp.addr;
+ entry->ret_ip = (unsigned long)ri->ret_addr;
+ for (i = 0; i < tp->nr_args; i++)
+ entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+ perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
+
+end:
+ perf_swevent_put_recursion_context(rctx);
+end_recursion:
+ local_irq_restore(irq_flags);
+
+ return 0;
+}
+
+static int probe_profile_enable(struct ftrace_event_call *call)
+{
+ struct trace_probe *tp = (struct trace_probe *)call->data;
+
+ tp->flags |= TP_FLAG_PROFILE;
+
+ if (probe_is_return(tp))
+ return enable_kretprobe(&tp->rp);
+ else
+ return enable_kprobe(&tp->rp.kp);
+}
+
+static void probe_profile_disable(struct ftrace_event_call *call)
+{
+ struct trace_probe *tp = (struct trace_probe *)call->data;
+
+ tp->flags &= ~TP_FLAG_PROFILE;
+
+ if (!(tp->flags & TP_FLAG_TRACE)) {
+ if (probe_is_return(tp))
+ disable_kretprobe(&tp->rp);
+ else
+ disable_kprobe(&tp->rp.kp);
+ }
+}
+#endif /* CONFIG_EVENT_PROFILE */
+
+
+static __kprobes
+int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
+{
+ struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
+
+ if (tp->flags & TP_FLAG_TRACE)
+ kprobe_trace_func(kp, regs);
+#ifdef CONFIG_EVENT_PROFILE
+ if (tp->flags & TP_FLAG_PROFILE)
+ kprobe_profile_func(kp, regs);
+#endif /* CONFIG_EVENT_PROFILE */
+ return 0; /* We don't tweek kernel, so just return 0 */
+}
+
+static __kprobes
+int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+ struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
+
+ if (tp->flags & TP_FLAG_TRACE)
+ kretprobe_trace_func(ri, regs);
+#ifdef CONFIG_EVENT_PROFILE
+ if (tp->flags & TP_FLAG_PROFILE)
+ kretprobe_profile_func(ri, regs);
+#endif /* CONFIG_EVENT_PROFILE */
+ return 0; /* We don't tweek kernel, so just return 0 */
+}
+
+static int register_probe_event(struct trace_probe *tp)
+{
+ struct ftrace_event_call *call = &tp->call;
+ int ret;
+
+ /* Initialize ftrace_event_call */
+ if (probe_is_return(tp)) {
+ tp->event.trace = print_kretprobe_event;
+ call->raw_init = probe_event_raw_init;
+ call->show_format = kretprobe_event_show_format;
+ call->define_fields = kretprobe_event_define_fields;
+ } else {
+ tp->event.trace = print_kprobe_event;
+ call->raw_init = probe_event_raw_init;
+ call->show_format = kprobe_event_show_format;
+ call->define_fields = kprobe_event_define_fields;
+ }
+ call->event = &tp->event;
+ call->id = register_ftrace_event(&tp->event);
+ if (!call->id)
+ return -ENODEV;
+ call->enabled = 0;
+ call->regfunc = probe_event_enable;
+ call->unregfunc = probe_event_disable;
+
+#ifdef CONFIG_EVENT_PROFILE
+ atomic_set(&call->profile_count, -1);
+ call->profile_enable = probe_profile_enable;
+ call->profile_disable = probe_profile_disable;
+#endif
+ call->data = tp;
+ ret = trace_add_event_call(call);
+ if (ret) {
+ pr_info("Failed to register kprobe event: %s\n", call->name);
+ unregister_ftrace_event(&tp->event);
+ }
+ return ret;
+}
+
+static void unregister_probe_event(struct trace_probe *tp)
+{
+ /* tp->event is unregistered in trace_remove_event_call() */
+ trace_remove_event_call(&tp->call);
+}
+
+/* Make a debugfs interface for controling probe points */
+static __init int init_kprobe_trace(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *entry;
+
+ d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return 0;
+
+ entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
+ NULL, &kprobe_events_ops);
+
+ /* Event list interface */
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'kprobe_events' entry\n");
+
+ /* Profile interface */
+ entry = debugfs_create_file("kprobe_profile", 0444, d_tracer,
+ NULL, &kprobe_profile_ops);
+
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'kprobe_profile' entry\n");
+ return 0;
+}
+fs_initcall(init_kprobe_trace);
+
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+
+static int kprobe_trace_selftest_target(int a1, int a2, int a3,
+ int a4, int a5, int a6)
+{
+ return a1 + a2 + a3 + a4 + a5 + a6;
+}
+
+static __init int kprobe_trace_self_tests_init(void)
+{
+ int ret;
+ int (*target)(int, int, int, int, int, int);
+
+ target = kprobe_trace_selftest_target;
+
+ pr_info("Testing kprobe tracing: ");
+
+ ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
+ "$arg1 $arg2 $arg3 $arg4 $stack $stack0");
+ if (WARN_ON_ONCE(ret))
+ pr_warning("error enabling function entry\n");
+
+ ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
+ "$retval");
+ if (WARN_ON_ONCE(ret))
+ pr_warning("error enabling function return\n");
+
+ ret = target(1, 2, 3, 4, 5, 6);
+
+ cleanup_all_probes();
+
+ pr_cont("OK\n");
+ return 0;
+}
+
+late_initcall(kprobe_trace_self_tests_init);
+
+#endif
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
new file mode 100644
index 00000000000..ddfa0fd43bc
--- /dev/null
+++ b/kernel/trace/trace_ksym.c
@@ -0,0 +1,550 @@
+/*
+ * trace_ksym.c - Kernel Symbol Tracer
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+
+#include "trace_output.h"
+#include "trace_stat.h"
+#include "trace.h"
+
+#include <linux/hw_breakpoint.h>
+#include <asm/hw_breakpoint.h>
+
+/*
+ * For now, let us restrict the no. of symbols traced simultaneously to number
+ * of available hardware breakpoint registers.
+ */
+#define KSYM_TRACER_MAX HBP_NUM
+
+#define KSYM_TRACER_OP_LEN 3 /* rw- */
+
+struct trace_ksym {
+ struct perf_event **ksym_hbp;
+ struct perf_event_attr attr;
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+ unsigned long counter;
+#endif
+ struct hlist_node ksym_hlist;
+};
+
+static struct trace_array *ksym_trace_array;
+
+static unsigned int ksym_filter_entry_count;
+static unsigned int ksym_tracing_enabled;
+
+static HLIST_HEAD(ksym_filter_head);
+
+static DEFINE_MUTEX(ksym_tracer_mutex);
+
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+
+#define MAX_UL_INT 0xffffffff
+
+void ksym_collect_stats(unsigned long hbp_hit_addr)
+{
+ struct hlist_node *node;
+ struct trace_ksym *entry;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
+ if ((entry->attr.bp_addr == hbp_hit_addr) &&
+ (entry->counter <= MAX_UL_INT)) {
+ entry->counter++;
+ break;
+ }
+ }
+ rcu_read_unlock();
+}
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
+
+void ksym_hbp_handler(struct perf_event *hbp, void *data)
+{
+ struct ring_buffer_event *event;
+ struct ksym_trace_entry *entry;
+ struct pt_regs *regs = data;
+ struct ring_buffer *buffer;
+ int pc;
+
+ if (!ksym_tracing_enabled)
+ return;
+
+ buffer = ksym_trace_array->buffer;
+
+ pc = preempt_count();
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
+ sizeof(*entry), 0, pc);
+ if (!event)
+ return;
+
+ entry = ring_buffer_event_data(event);
+ entry->ip = instruction_pointer(regs);
+ entry->type = hw_breakpoint_type(hbp);
+ entry->addr = hw_breakpoint_addr(hbp);
+ strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
+
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+ ksym_collect_stats(hw_breakpoint_addr(hbp));
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
+
+ trace_buffer_unlock_commit(buffer, event, 0, pc);
+}
+
+/* Valid access types are represented as
+ *
+ * rw- : Set Read/Write Access Breakpoint
+ * -w- : Set Write Access Breakpoint
+ * --- : Clear Breakpoints
+ * --x : Set Execution Break points (Not available yet)
+ *
+ */
+static int ksym_trace_get_access_type(char *str)
+{
+ int access = 0;
+
+ if (str[0] == 'r')
+ access |= HW_BREAKPOINT_R;
+
+ if (str[1] == 'w')
+ access |= HW_BREAKPOINT_W;
+
+ if (str[2] == 'x')
+ access |= HW_BREAKPOINT_X;
+
+ switch (access) {
+ case HW_BREAKPOINT_R:
+ case HW_BREAKPOINT_W:
+ case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+ return access;
+ default:
+ return -EINVAL;
+ }
+}
+
+/*
+ * There can be several possible malformed requests and we attempt to capture
+ * all of them. We enumerate some of the rules
+ * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
+ * i.e. multiple ':' symbols disallowed. Possible uses are of the form
+ * <module>:<ksym_name>:<op>.
+ * 2. No delimiter symbol ':' in the input string
+ * 3. Spurious operator symbols or symbols not in their respective positions
+ * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
+ * 5. Kernel symbol not a part of /proc/kallsyms
+ * 6. Duplicate requests
+ */
+static int parse_ksym_trace_str(char *input_string, char **ksymname,
+ unsigned long *addr)
+{
+ int ret;
+
+ *ksymname = strsep(&input_string, ":");
+ *addr = kallsyms_lookup_name(*ksymname);
+
+ /* Check for malformed request: (2), (1) and (5) */
+ if ((!input_string) ||
+ (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
+ (*addr == 0))
+ return -EINVAL;;
+
+ ret = ksym_trace_get_access_type(input_string);
+
+ return ret;
+}
+
+int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
+{
+ struct trace_ksym *entry;
+ int ret = -ENOMEM;
+
+ if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
+ printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
+ " new requests for tracing can be accepted now.\n",
+ KSYM_TRACER_MAX);
+ return -ENOSPC;
+ }
+
+ entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ hw_breakpoint_init(&entry->attr);
+
+ entry->attr.bp_type = op;
+ entry->attr.bp_addr = addr;
+ entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
+
+ ret = -EAGAIN;
+ entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
+ ksym_hbp_handler);
+
+ if (IS_ERR(entry->ksym_hbp)) {
+ ret = PTR_ERR(entry->ksym_hbp);
+ printk(KERN_INFO "ksym_tracer request failed. Try again"
+ " later!!\n");
+ goto err;
+ }
+
+ hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
+ ksym_filter_entry_count++;
+
+ return 0;
+
+err:
+ kfree(entry);
+
+ return ret;
+}
+
+static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct trace_ksym *entry;
+ struct hlist_node *node;
+ struct trace_seq *s;
+ ssize_t cnt = 0;
+ int ret;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+ trace_seq_init(s);
+
+ mutex_lock(&ksym_tracer_mutex);
+
+ hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
+ ret = trace_seq_printf(s, "%pS:", (void *)entry->attr.bp_addr);
+ if (entry->attr.bp_type == HW_BREAKPOINT_R)
+ ret = trace_seq_puts(s, "r--\n");
+ else if (entry->attr.bp_type == HW_BREAKPOINT_W)
+ ret = trace_seq_puts(s, "-w-\n");
+ else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
+ ret = trace_seq_puts(s, "rw-\n");
+ WARN_ON_ONCE(!ret);
+ }
+
+ cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
+
+ mutex_unlock(&ksym_tracer_mutex);
+
+ kfree(s);
+
+ return cnt;
+}
+
+static void __ksym_trace_reset(void)
+{
+ struct trace_ksym *entry;
+ struct hlist_node *node, *node1;
+
+ mutex_lock(&ksym_tracer_mutex);
+ hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
+ ksym_hlist) {
+ unregister_wide_hw_breakpoint(entry->ksym_hbp);
+ ksym_filter_entry_count--;
+ hlist_del_rcu(&(entry->ksym_hlist));
+ synchronize_rcu();
+ kfree(entry);
+ }
+ mutex_unlock(&ksym_tracer_mutex);
+}
+
+static ssize_t ksym_trace_filter_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ struct trace_ksym *entry;
+ struct hlist_node *node;
+ char *input_string, *ksymname = NULL;
+ unsigned long ksym_addr = 0;
+ int ret, op, changed = 0;
+
+ input_string = kzalloc(count + 1, GFP_KERNEL);
+ if (!input_string)
+ return -ENOMEM;
+
+ if (copy_from_user(input_string, buffer, count)) {
+ kfree(input_string);
+ return -EFAULT;
+ }
+ input_string[count] = '\0';
+
+ strstrip(input_string);
+
+ /*
+ * Clear all breakpoints if:
+ * 1: echo > ksym_trace_filter
+ * 2: echo 0 > ksym_trace_filter
+ * 3: echo "*:---" > ksym_trace_filter
+ */
+ if (!input_string[0] || !strcmp(input_string, "0") ||
+ !strcmp(input_string, "*:---")) {
+ __ksym_trace_reset();
+ kfree(input_string);
+ return count;
+ }
+
+ ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
+ if (ret < 0) {
+ kfree(input_string);
+ return ret;
+ }
+
+ mutex_lock(&ksym_tracer_mutex);
+
+ ret = -EINVAL;
+ hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
+ if (entry->attr.bp_addr == ksym_addr) {
+ /* Check for malformed request: (6) */
+ if (entry->attr.bp_type != op)
+ changed = 1;
+ else
+ goto out;
+ break;
+ }
+ }
+ if (changed) {
+ unregister_wide_hw_breakpoint(entry->ksym_hbp);
+ entry->attr.bp_type = op;
+ ret = 0;
+ if (op > 0) {
+ entry->ksym_hbp =
+ register_wide_hw_breakpoint(&entry->attr,
+ ksym_hbp_handler);
+ if (IS_ERR(entry->ksym_hbp))
+ ret = PTR_ERR(entry->ksym_hbp);
+ else
+ goto out;
+ }
+ /* Error or "symbol:---" case: drop it */
+ ksym_filter_entry_count--;
+ hlist_del_rcu(&(entry->ksym_hlist));
+ synchronize_rcu();
+ kfree(entry);
+ goto out;
+ } else {
+ /* Check for malformed request: (4) */
+ if (op == 0)
+ goto out;
+ ret = process_new_ksym_entry(ksymname, op, ksym_addr);
+ }
+out:
+ mutex_unlock(&ksym_tracer_mutex);
+
+ kfree(input_string);
+
+ if (!ret)
+ ret = count;
+ return ret;
+}
+
+static const struct file_operations ksym_tracing_fops = {
+ .open = tracing_open_generic,
+ .read = ksym_trace_filter_read,
+ .write = ksym_trace_filter_write,
+};
+
+static void ksym_trace_reset(struct trace_array *tr)
+{
+ ksym_tracing_enabled = 0;
+ __ksym_trace_reset();
+}
+
+static int ksym_trace_init(struct trace_array *tr)
+{
+ int cpu, ret = 0;
+
+ for_each_online_cpu(cpu)
+ tracing_reset(tr, cpu);
+ ksym_tracing_enabled = 1;
+ ksym_trace_array = tr;
+
+ return ret;
+}
+
+static void ksym_trace_print_header(struct seq_file *m)
+{
+ seq_puts(m,
+ "# TASK-PID CPU# Symbol "
+ "Type Function\n");
+ seq_puts(m,
+ "# | | | "
+ " | |\n");
+}
+
+static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
+{
+ struct trace_entry *entry = iter->ent;
+ struct trace_seq *s = &iter->seq;
+ struct ksym_trace_entry *field;
+ char str[KSYM_SYMBOL_LEN];
+ int ret;
+
+ if (entry->type != TRACE_KSYM)
+ return TRACE_TYPE_UNHANDLED;
+
+ trace_assign_type(field, entry);
+
+ ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
+ entry->pid, iter->cpu, (char *)field->addr);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ switch (field->type) {
+ case HW_BREAKPOINT_R:
+ ret = trace_seq_printf(s, " R ");
+ break;
+ case HW_BREAKPOINT_W:
+ ret = trace_seq_printf(s, " W ");
+ break;
+ case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
+ ret = trace_seq_printf(s, " RW ");
+ break;
+ default:
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ sprint_symbol(str, field->ip);
+ ret = trace_seq_printf(s, "%s\n", str);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+struct tracer ksym_tracer __read_mostly =
+{
+ .name = "ksym_tracer",
+ .init = ksym_trace_init,
+ .reset = ksym_trace_reset,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_ksym,
+#endif
+ .print_header = ksym_trace_print_header,
+ .print_line = ksym_trace_output
+};
+
+__init static int init_ksym_trace(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *entry;
+
+ d_tracer = tracing_init_dentry();
+ ksym_filter_entry_count = 0;
+
+ entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer,
+ NULL, &ksym_tracing_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'ksym_trace_filter' file\n");
+
+ return register_tracer(&ksym_tracer);
+}
+device_initcall(init_ksym_trace);
+
+
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+static int ksym_tracer_stat_headers(struct seq_file *m)
+{
+ seq_puts(m, " Access Type ");
+ seq_puts(m, " Symbol Counter\n");
+ seq_puts(m, " ----------- ");
+ seq_puts(m, " ------ -------\n");
+ return 0;
+}
+
+static int ksym_tracer_stat_show(struct seq_file *m, void *v)
+{
+ struct hlist_node *stat = v;
+ struct trace_ksym *entry;
+ int access_type = 0;
+ char fn_name[KSYM_NAME_LEN];
+
+ entry = hlist_entry(stat, struct trace_ksym, ksym_hlist);
+
+ access_type = entry->attr.bp_type;
+
+ switch (access_type) {
+ case HW_BREAKPOINT_R:
+ seq_puts(m, " R ");
+ break;
+ case HW_BREAKPOINT_W:
+ seq_puts(m, " W ");
+ break;
+ case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
+ seq_puts(m, " RW ");
+ break;
+ default:
+ seq_puts(m, " NA ");
+ }
+
+ if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
+ seq_printf(m, " %-36s", fn_name);
+ else
+ seq_printf(m, " %-36s", "<NA>");
+ seq_printf(m, " %15lu\n", entry->counter);
+
+ return 0;
+}
+
+static void *ksym_tracer_stat_start(struct tracer_stat *trace)
+{
+ return ksym_filter_head.first;
+}
+
+static void *
+ksym_tracer_stat_next(void *v, int idx)
+{
+ struct hlist_node *stat = v;
+
+ return stat->next;
+}
+
+static struct tracer_stat ksym_tracer_stats = {
+ .name = "ksym_tracer",
+ .stat_start = ksym_tracer_stat_start,
+ .stat_next = ksym_tracer_stat_next,
+ .stat_headers = ksym_tracer_stat_headers,
+ .stat_show = ksym_tracer_stat_show
+};
+
+__init static int ksym_tracer_stat_init(void)
+{
+ int ret;
+
+ ret = register_stat_tracer(&ksym_tracer_stats);
+ if (ret) {
+ printk(KERN_WARNING "Warning: could not register "
+ "ksym tracer stats\n");
+ return 1;
+ }
+
+ return 0;
+}
+fs_initcall(ksym_tracer_stat_init);
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index ed17565826b..b6c12c6a1bc 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -69,6 +69,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
* @s: trace sequence descriptor
* @fmt: printf format string
*
+ * It returns 0 if the trace oversizes the buffer's free
+ * space, 1 otherwise.
+ *
* The tracer may use either sequence operations or its own
* copy to user routines. To simplify formating of a trace
* trace_seq_printf is used to store strings into a special
@@ -95,7 +98,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
s->len += ret;
- return len;
+ return 1;
}
EXPORT_SYMBOL_GPL(trace_seq_printf);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index d2cdbabb4ea..dc98309e839 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
case TRACE_GRAPH_ENT:
case TRACE_GRAPH_RET:
case TRACE_HW_BRANCHES:
+ case TRACE_KSYM:
return 1;
}
return 0;
@@ -808,3 +809,57 @@ trace_selftest_startup_hw_branches(struct tracer *trace,
return ret;
}
#endif /* CONFIG_HW_BRANCH_TRACER */
+
+#ifdef CONFIG_KSYM_TRACER
+static int ksym_selftest_dummy;
+
+int
+trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
+{
+ unsigned long count;
+ int ret;
+
+ /* start the tracing */
+ ret = tracer_init(trace, tr);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ return ret;
+ }
+
+ ksym_selftest_dummy = 0;
+ /* Register the read-write tracing request */
+
+ ret = process_new_ksym_entry("ksym_selftest_dummy",
+ HW_BREAKPOINT_R | HW_BREAKPOINT_W,
+ (unsigned long)(&ksym_selftest_dummy));
+
+ if (ret < 0) {
+ printk(KERN_CONT "ksym_trace read-write startup test failed\n");
+ goto ret_path;
+ }
+ /* Perform a read and a write operation over the dummy variable to
+ * trigger the tracer
+ */
+ if (ksym_selftest_dummy == 0)
+ ksym_selftest_dummy++;
+
+ /* stop the tracing. */
+ tracing_stop();
+ /* check the trace buffer */
+ ret = trace_test_buffer(tr, &count);
+ trace->reset(tr);
+ tracing_start();
+
+ /* read & write operations - one each is performed on the dummy variable
+ * triggering two entries in the trace buffer
+ */
+ if (!ret && count != 2) {
+ printk(KERN_CONT "Ksym tracer startup test failed");
+ ret = -1;
+ }
+
+ret_path:
+ return ret;
+}
+#endif /* CONFIG_KSYM_TRACER */
+
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index d99abc427c3..57501d90096 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -14,6 +14,43 @@ static int sys_refcount_exit;
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
+extern unsigned long __start_syscalls_metadata[];
+extern unsigned long __stop_syscalls_metadata[];
+
+static struct syscall_metadata **syscalls_metadata;
+
+static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
+{
+ struct syscall_metadata *start;
+ struct syscall_metadata *stop;
+ char str[KSYM_SYMBOL_LEN];
+
+
+ start = (struct syscall_metadata *)__start_syscalls_metadata;
+ stop = (struct syscall_metadata *)__stop_syscalls_metadata;
+ kallsyms_lookup(syscall, NULL, NULL, NULL, str);
+
+ for ( ; start < stop; start++) {
+ /*
+ * Only compare after the "sys" prefix. Archs that use
+ * syscall wrappers may have syscalls symbols aliases prefixed
+ * with "SyS" instead of "sys", leading to an unwanted
+ * mismatch.
+ */
+ if (start->name && !strcmp(start->name + 3, str + 3))
+ return start;
+ }
+ return NULL;
+}
+
+static struct syscall_metadata *syscall_nr_to_meta(int nr)
+{
+ if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
+ return NULL;
+
+ return syscalls_metadata[nr];
+}
+
enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags)
{
@@ -30,7 +67,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
if (!entry)
goto end;
- if (entry->enter_id != ent->type) {
+ if (entry->enter_event->id != ent->type) {
WARN_ON_ONCE(1);
goto end;
}
@@ -85,7 +122,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
return TRACE_TYPE_HANDLED;
}
- if (entry->exit_id != ent->type) {
+ if (entry->exit_event->id != ent->type) {
WARN_ON_ONCE(1);
return TRACE_TYPE_UNHANDLED;
}
@@ -109,18 +146,11 @@ extern char *__bad_type_size(void);
int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
{
int i;
- int nr;
int ret;
- struct syscall_metadata *entry;
+ struct syscall_metadata *entry = call->data;
struct syscall_trace_enter trace;
int offset = offsetof(struct syscall_trace_enter, args);
- nr = syscall_name_to_nr(call->data);
- entry = syscall_nr_to_meta(nr);
-
- if (!entry)
- return 0;
-
ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
"\tsigned:%u;\n",
SYSCALL_FIELD(int, nr));
@@ -182,22 +212,19 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
int syscall_enter_define_fields(struct ftrace_event_call *call)
{
struct syscall_trace_enter trace;
- struct syscall_metadata *meta;
+ struct syscall_metadata *meta = call->data;
int ret;
- int nr;
int i;
int offset = offsetof(typeof(trace), args);
- nr = syscall_name_to_nr(call->data);
- meta = syscall_nr_to_meta(nr);
-
- if (!meta)
- return 0;
-
ret = trace_define_common_fields(call);
if (ret)
return ret;
+ ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
+ if (ret)
+ return ret;
+
for (i = 0; i < meta->nb_args; i++) {
ret = trace_define_field(call, meta->types[i],
meta->args[i], offset,
@@ -218,6 +245,10 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
if (ret)
return ret;
+ ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
+ if (ret)
+ return ret;
+
ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
FILTER_OTHER);
@@ -245,8 +276,8 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
- event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id,
- size, 0, 0);
+ event = trace_current_buffer_lock_reserve(&buffer,
+ sys_data->enter_event->id, size, 0, 0);
if (!event)
return;
@@ -277,8 +308,8 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
if (!sys_data)
return;
- event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id,
- sizeof(*entry), 0, 0);
+ event = trace_current_buffer_lock_reserve(&buffer,
+ sys_data->exit_event->id, sizeof(*entry), 0, 0);
if (!event)
return;
@@ -291,14 +322,12 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}
-int reg_event_syscall_enter(void *ptr)
+int reg_event_syscall_enter(struct ftrace_event_call *call)
{
int ret = 0;
int num;
- char *name;
- name = (char *)ptr;
- num = syscall_name_to_nr(name);
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
if (num < 0 || num >= NR_syscalls)
return -ENOSYS;
mutex_lock(&syscall_trace_lock);
@@ -315,13 +344,11 @@ int reg_event_syscall_enter(void *ptr)
return ret;
}
-void unreg_event_syscall_enter(void *ptr)
+void unreg_event_syscall_enter(struct ftrace_event_call *call)
{
int num;
- char *name;
- name = (char *)ptr;
- num = syscall_name_to_nr(name);
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
if (num < 0 || num >= NR_syscalls)
return;
mutex_lock(&syscall_trace_lock);
@@ -332,14 +359,12 @@ void unreg_event_syscall_enter(void *ptr)
mutex_unlock(&syscall_trace_lock);
}
-int reg_event_syscall_exit(void *ptr)
+int reg_event_syscall_exit(struct ftrace_event_call *call)
{
int ret = 0;
int num;
- char *name;
- name = (char *)ptr;
- num = syscall_name_to_nr(name);
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
if (num < 0 || num >= NR_syscalls)
return -ENOSYS;
mutex_lock(&syscall_trace_lock);
@@ -356,13 +381,11 @@ int reg_event_syscall_exit(void *ptr)
return ret;
}
-void unreg_event_syscall_exit(void *ptr)
+void unreg_event_syscall_exit(struct ftrace_event_call *call)
{
int num;
- char *name;
- name = (char *)ptr;
- num = syscall_name_to_nr(name);
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
if (num < 0 || num >= NR_syscalls)
return;
mutex_lock(&syscall_trace_lock);
@@ -373,13 +396,44 @@ void unreg_event_syscall_exit(void *ptr)
mutex_unlock(&syscall_trace_lock);
}
-struct trace_event event_syscall_enter = {
- .trace = print_syscall_enter,
-};
+int init_syscall_trace(struct ftrace_event_call *call)
+{
+ int id;
+
+ id = register_ftrace_event(call->event);
+ if (!id)
+ return -ENODEV;
+ call->id = id;
+ INIT_LIST_HEAD(&call->fields);
+ return 0;
+}
+
+int __init init_ftrace_syscalls(void)
+{
+ struct syscall_metadata *meta;
+ unsigned long addr;
+ int i;
+
+ syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
+ NR_syscalls, GFP_KERNEL);
+ if (!syscalls_metadata) {
+ WARN_ON(1);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < NR_syscalls; i++) {
+ addr = arch_syscall_addr(i);
+ meta = find_syscall_meta(addr);
+ if (!meta)
+ continue;
+
+ meta->syscall_nr = i;
+ syscalls_metadata[i] = meta;
+ }
-struct trace_event event_syscall_exit = {
- .trace = print_syscall_exit,
-};
+ return 0;
+}
+core_initcall(init_ftrace_syscalls);
#ifdef CONFIG_EVENT_PROFILE
@@ -393,8 +447,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
unsigned long flags;
+ char *trace_buf;
char *raw_data;
int syscall_nr;
+ int rctx;
int size;
int cpu;
@@ -418,41 +474,42 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
/* Protect the per cpu buffer, begin the rcu read side */
local_irq_save(flags);
+ rctx = perf_swevent_get_recursion_context();
+ if (rctx < 0)
+ goto end_recursion;
+
cpu = smp_processor_id();
- if (in_nmi())
- raw_data = rcu_dereference(trace_profile_buf_nmi);
- else
- raw_data = rcu_dereference(trace_profile_buf);
+ trace_buf = rcu_dereference(perf_trace_buf);
- if (!raw_data)
+ if (!trace_buf)
goto end;
- raw_data = per_cpu_ptr(raw_data, cpu);
+ raw_data = per_cpu_ptr(trace_buf, cpu);
/* zero the dead bytes from align to not leak stack to user */
*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
rec = (struct syscall_trace_enter *) raw_data;
tracing_generic_entry_update(&rec->ent, 0, 0);
- rec->ent.type = sys_data->enter_id;
+ rec->ent.type = sys_data->enter_event->id;
rec->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args);
- perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
+ perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size);
end:
+ perf_swevent_put_recursion_context(rctx);
+end_recursion:
local_irq_restore(flags);
}
-int reg_prof_syscall_enter(char *name)
+int prof_sysenter_enable(struct ftrace_event_call *call)
{
int ret = 0;
int num;
- num = syscall_name_to_nr(name);
- if (num < 0 || num >= NR_syscalls)
- return -ENOSYS;
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
mutex_lock(&syscall_trace_lock);
if (!sys_prof_refcount_enter)
@@ -468,13 +525,11 @@ int reg_prof_syscall_enter(char *name)
return ret;
}
-void unreg_prof_syscall_enter(char *name)
+void prof_sysenter_disable(struct ftrace_event_call *call)
{
int num;
- num = syscall_name_to_nr(name);
- if (num < 0 || num >= NR_syscalls)
- return;
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
mutex_lock(&syscall_trace_lock);
sys_prof_refcount_enter--;
@@ -490,7 +545,9 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
struct syscall_trace_exit *rec;
unsigned long flags;
int syscall_nr;
+ char *trace_buf;
char *raw_data;
+ int rctx;
int size;
int cpu;
@@ -516,17 +573,19 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
/* Protect the per cpu buffer, begin the rcu read side */
local_irq_save(flags);
+
+ rctx = perf_swevent_get_recursion_context();
+ if (rctx < 0)
+ goto end_recursion;
+
cpu = smp_processor_id();
- if (in_nmi())
- raw_data = rcu_dereference(trace_profile_buf_nmi);
- else
- raw_data = rcu_dereference(trace_profile_buf);
+ trace_buf = rcu_dereference(perf_trace_buf);
- if (!raw_data)
+ if (!trace_buf)
goto end;
- raw_data = per_cpu_ptr(raw_data, cpu);
+ raw_data = per_cpu_ptr(trace_buf, cpu);
/* zero the dead bytes from align to not leak stack to user */
*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -534,24 +593,24 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
rec = (struct syscall_trace_exit *)raw_data;
tracing_generic_entry_update(&rec->ent, 0, 0);
- rec->ent.type = sys_data->exit_id;
+ rec->ent.type = sys_data->exit_event->id;
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
- perf_tp_event(sys_data->exit_id, 0, 1, rec, size);
+ perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size);
end:
+ perf_swevent_put_recursion_context(rctx);
+end_recursion:
local_irq_restore(flags);
}
-int reg_prof_syscall_exit(char *name)
+int prof_sysexit_enable(struct ftrace_event_call *call)
{
int ret = 0;
int num;
- num = syscall_name_to_nr(name);
- if (num < 0 || num >= NR_syscalls)
- return -ENOSYS;
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
mutex_lock(&syscall_trace_lock);
if (!sys_prof_refcount_exit)
@@ -567,13 +626,11 @@ int reg_prof_syscall_exit(char *name)
return ret;
}
-void unreg_prof_syscall_exit(char *name)
+void prof_sysexit_disable(struct ftrace_event_call *call)
{
int num;
- num = syscall_name_to_nr(name);
- if (num < 0 || num >= NR_syscalls)
- return;
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
mutex_lock(&syscall_trace_lock);
sys_prof_refcount_exit--;
diff --git a/kernel/user.c b/kernel/user.c
index 2c000e7132a..46d0165ca70 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -330,9 +330,9 @@ done:
*/
static void free_user(struct user_struct *up, unsigned long flags)
{
- spin_unlock_irqrestore(&uidhash_lock, flags);
INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
+ spin_unlock_irqrestore(&uidhash_lock, flags);
}
#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 47cdd7e76f2..12328147132 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -685,21 +685,38 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
int schedule_on_each_cpu(work_func_t func)
{
int cpu;
+ int orig = -1;
struct work_struct *works;
works = alloc_percpu(struct work_struct);
if (!works)
return -ENOMEM;
+ /*
+ * when running in keventd don't schedule a work item on itself.
+ * Can just call directly because the work queue is already bound.
+ * This also is faster.
+ * Make this a generic parameter for other workqueues?
+ */
+ if (current_is_keventd()) {
+ orig = raw_smp_processor_id();
+ INIT_WORK(per_cpu_ptr(works, orig), func);
+ func(per_cpu_ptr(works, orig));
+ }
+
get_online_cpus();
for_each_online_cpu(cpu) {
struct work_struct *work = per_cpu_ptr(works, cpu);
+ if (cpu == orig)
+ continue;
INIT_WORK(work, func);
schedule_work_on(cpu, work);
}
- for_each_online_cpu(cpu)
- flush_work(per_cpu_ptr(works, cpu));
+ for_each_online_cpu(cpu) {
+ if (cpu != orig)
+ flush_work(per_cpu_ptr(works, cpu));
+ }
put_online_cpus();
free_percpu(works);
return 0;